From e71d20d3928dd03e1159ec5a08cfa902cf85cd31 Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Tue, 10 Dec 2024 20:42:52 +0100 Subject: [PATCH 001/583] Emit nbtree vacuum cycle id in nbtree xlog through forced FPIs (#9932) This fixes neondatabase/neon#9929. ## Postgres repo PRS: - PG17: https://github.com/neondatabase/postgres/pull/538 - PG16: https://github.com/neondatabase/postgres/pull/539 - PG15: https://github.com/neondatabase/postgres/pull/540 - PG14: https://github.com/neondatabase/postgres/pull/541 ## Problem see #9929 ## Summary of changes We update the split code to force the code to emit an FPI whenever the cycle ID might be interesting for concurrent btree vacuum. --- .../regress/test_nbtree_pagesplit_cycleid.py | 124 ++++++++++++++++++ vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/postgres-v17 | 2 +- vendor/revisions.json | 8 +- 6 files changed, 132 insertions(+), 8 deletions(-) create mode 100644 test_runner/regress/test_nbtree_pagesplit_cycleid.py diff --git a/test_runner/regress/test_nbtree_pagesplit_cycleid.py b/test_runner/regress/test_nbtree_pagesplit_cycleid.py new file mode 100644 index 0000000000..558557aeba --- /dev/null +++ b/test_runner/regress/test_nbtree_pagesplit_cycleid.py @@ -0,0 +1,124 @@ +import threading +import time + +from fixtures.neon_fixtures import NeonEnv + +BTREE_NUM_CYCLEID_PAGES = """ + WITH raw_pages AS ( + SELECT blkno, get_raw_page_at_lsn('t_uidx', 'main', blkno, NULL, NULL) page + FROM generate_series(1, pg_relation_size('t_uidx'::regclass) / 8192) blkno + ), + parsed_pages AS ( + /* cycle ID is the last 2 bytes of the btree page */ + SELECT blkno, SUBSTRING(page FROM 8191 FOR 2) as cycle_id + FROM raw_pages + ) + SELECT count(*), + encode(cycle_id, 'hex') + FROM parsed_pages + WHERE encode(cycle_id, 'hex') != '0000' + GROUP BY encode(cycle_id, 'hex'); + """ + + +def test_nbtree_pagesplit_cycleid(neon_simple_env: NeonEnv): + env = neon_simple_env + endpoint = env.endpoints.create_start("main") + + ses1 = endpoint.connect().cursor() + ses1.execute("ALTER SYSTEM SET autovacuum = off;") + ses1.execute("ALTER SYSTEM SET enable_seqscan = off;") + ses1.execute("ALTER SYSTEM SET full_page_writes = off;") + ses1.execute("SELECT pg_reload_conf();") + ses1.execute("CREATE EXTENSION neon_test_utils;") + # prepare a large index + ses1.execute("CREATE TABLE t(id integer GENERATED ALWAYS AS IDENTITY, txt text);") + ses1.execute("CREATE UNIQUE INDEX t_uidx ON t(id);") + ses1.execute("INSERT INTO t (txt) SELECT i::text FROM generate_series(1, 2035) i;") + + ses1.execute("SELECT neon_xlogflush();") + ses1.execute(BTREE_NUM_CYCLEID_PAGES) + pages = ses1.fetchall() + assert ( + len(pages) == 0 + ), f"0 back splits with cycle ID expected, real {len(pages)} first {pages[0]}" + # Delete enough tuples to clear the first index page. + # (there are up to 407 rows per 8KiB page; 406 for non-rightmost leafs. 
+ ses1.execute("DELETE FROM t WHERE id <= 406;") + # Make sure the page is cleaned up + ses1.execute("VACUUM (FREEZE, INDEX_CLEANUP ON) t;") + + # Do another delete-then-indexcleanup cycle, to move the pages from + # "dead" to "reusable" + ses1.execute("DELETE FROM t WHERE id <= 446;") + ses1.execute("VACUUM (FREEZE, INDEX_CLEANUP ON) t;") + + # Make sure the vacuum we're about to trigger in s3 has cleanup work to do + ses1.execute("DELETE FROM t WHERE id <= 610;") + + # Flush wal, for checking purposes + ses1.execute("SELECT neon_xlogflush();") + ses1.execute(BTREE_NUM_CYCLEID_PAGES) + pages = ses1.fetchall() + assert len(pages) == 0, f"No back splits with cycle ID expected, got batches of {pages} instead" + + ses2 = endpoint.connect().cursor() + ses3 = endpoint.connect().cursor() + + # Session 2 pins a btree page, which prevents vacuum from processing that + # page, thus allowing us to reliably split pages while a concurrent vacuum + # is running. + ses2.execute("BEGIN;") + ses2.execute( + "DECLARE foo NO SCROLL CURSOR FOR SELECT row_number() over () FROM t ORDER BY id ASC" + ) + ses2.execute("FETCH FROM foo;") # pins the leaf page with id 611 + wait_evt = threading.Event() + + # Session 3 runs the VACUUM command. Note that this will block, and + # therefore must run on another thread. + # We rely on this running quickly enough to hit the pinned page from + # session 2 by the time we start other work again in session 1, but + # technically there is a race where the thread (and/or PostgreSQL process) + # don't get to that pinned page with vacuum until >2s after evt.set() was + # called, and session 1 thus might already have split pages. + def vacuum_freeze_t(ses3, evt: threading.Event): + # Begin parallel vacuum that should hit the index + evt.set() + # this'll hang until s2 fetches enough new data from its cursor. + # this is technically a race with the time.sleep(2) below, but if this + # command doesn't hit + ses3.execute("VACUUM (FREEZE, INDEX_CLEANUP on, DISABLE_PAGE_SKIPPING on) t;") + + ses3t = threading.Thread(target=vacuum_freeze_t, args=(ses3, wait_evt)) + ses3t.start() + wait_evt.wait() + # Make extra sure we got the thread started and vacuum is stuck, by waiting + # some time even after wait_evt got set. This isn't truly reliable (it is + # possible + time.sleep(2) + + # Insert 2 pages worth of new data. + # This should reuse the one empty page, plus another page at the end of + # the index relation; with split ordering + # old_blk -> blkno=1 -> old_blk + 1. + # As this is run while vacuum in session 3 is happening, these splits + # should receive cycle IDs where applicable. 
+ ses1.execute("INSERT INTO t (txt) SELECT i::text FROM generate_series(1, 812) i;") + # unpin the btree page, allowing s3's vacuum to complete + ses2.execute("FETCH ALL FROM foo;") + ses2.execute("ROLLBACK;") + # flush WAL to make sure PS is up-to-date + ses1.execute("SELECT neon_xlogflush();") + # check that our expectations are correct + ses1.execute(BTREE_NUM_CYCLEID_PAGES) + pages = ses1.fetchall() + assert ( + len(pages) == 1 and pages[0][0] == 3 + ), f"3 page splits with cycle ID expected; actual {pages}" + + # final cleanup + ses3t.join() + ses1.close() + ses2.close() + ses3.close() diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 373f9decad..13ff324150 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 373f9decad933d2d46f321231032ae8b0da81acd +Subproject commit 13ff324150fceaac72920e01742addc053db9462 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 972e325e62..8736b10c1d 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 972e325e62b455957adbbdd8580e31275bb5b8c9 +Subproject commit 8736b10c1d93d11b9c0489872dd529c4c0f5338f diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index dff6615a8e..81428621f7 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit dff6615a8e48a10bb17a03fa3c00635f1ace7a92 +Subproject commit 81428621f7c04aed03671cf80a928e0a36d92505 diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index a10d95be67..471c449ab8 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit a10d95be67265e0f10a422ba0457f5a7af01de71 +Subproject commit 471c449ab8f8ff5988b6bfb9eafa0a79772ad562 diff --git a/vendor/revisions.json b/vendor/revisions.json index 8a73e14dcf..ba0f34e23e 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,18 +1,18 @@ { "v17": [ "17.2", - "a10d95be67265e0f10a422ba0457f5a7af01de71" + "471c449ab8f8ff5988b6bfb9eafa0a79772ad562" ], "v16": [ "16.6", - "dff6615a8e48a10bb17a03fa3c00635f1ace7a92" + "81428621f7c04aed03671cf80a928e0a36d92505" ], "v15": [ "15.10", - "972e325e62b455957adbbdd8580e31275bb5b8c9" + "8736b10c1d93d11b9c0489872dd529c4c0f5338f" ], "v14": [ "14.15", - "373f9decad933d2d46f321231032ae8b0da81acd" + "13ff324150fceaac72920e01742addc053db9462" ] } From 597125e124b3d92ef3a0ca243722aa0f99238037 Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Wed, 11 Dec 2024 01:51:05 +0100 Subject: [PATCH 002/583] Disable readstream's reliance on seqscan readahead (#9860) Neon doesn't have seqscan detection of its own, so stop read_stream from trying to utilize that readahead, and instead make it issue readahead of its own. ## Problem @knizhnik noticed that we didn't issue smgrprefetch[v] calls for seqscans in PG17 due to the move to the read_stream API, which assumes that the underlying IO facilities do seqscan detection for readahead. That is a wrong assumption when Neon is involved, so let's remove the code that applies that assumption. ## Summary of changes Remove the cases where seqscans are detected and prefetch is disabled as a consequence, and instead don't do that detection. 
PG PR: https://github.com/neondatabase/postgres/pull/532 --- vendor/postgres-v17 | 2 +- vendor/revisions.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index 471c449ab8..01fa3c4866 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit 471c449ab8f8ff5988b6bfb9eafa0a79772ad562 +Subproject commit 01fa3c48664ca030cfb69bb4a350aa9df4691d88 diff --git a/vendor/revisions.json b/vendor/revisions.json index ba0f34e23e..7329aa437f 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,7 +1,7 @@ { "v17": [ "17.2", - "471c449ab8f8ff5988b6bfb9eafa0a79772ad562" + "01fa3c48664ca030cfb69bb4a350aa9df4691d88" ], "v16": [ "16.6", From 38415a9816dab1ccd1fadea77857aafb369abdf6 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 11 Dec 2024 09:16:11 +0000 Subject: [PATCH 003/583] pageserver: fix ingest handling of CLog truncate (#10080) ## Problem In #9786 we stop storing SLRUs on non-zero shards. However, there was one code path during ingest that still tries to enumerate SLRU relations on all shards. This fails if it sees a tenant who has never seen any write to an SLRU, or who has done such thorough compaction+GC that it has dropped its SLRU directory key. ## Summary of changes - Avoid trying to list SLRU relations on nonzero shards --- pageserver/src/walingest.rs | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 30c8965d51..b7712cfac7 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -877,22 +877,24 @@ impl WalIngest { // will block waiting for the last valid LSN to advance up to // it. So we use the previous record's LSN in the get calls // instead. - for segno in modification - .tline - .list_slru_segments(SlruKind::Clog, Version::Modified(modification), ctx) - .await? - { - let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT; + if modification.tline.get_shard_identity().is_shard_zero() { + for segno in modification + .tline + .list_slru_segments(SlruKind::Clog, Version::Modified(modification), ctx) + .await? + { + let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT; - let may_delete = dispatch_pgversion!(modification.tline.pg_version, { - pgv::nonrelfile_utils::slru_may_delete_clogsegment(segpage, pageno) - }); + let may_delete = dispatch_pgversion!(modification.tline.pg_version, { + pgv::nonrelfile_utils::slru_may_delete_clogsegment(segpage, pageno) + }); - if may_delete { - modification - .drop_slru_segment(SlruKind::Clog, segno, ctx) - .await?; - trace!("Drop CLOG segment {:>04X}", segno); + if may_delete { + modification + .drop_slru_segment(SlruKind::Clog, segno, ctx) + .await?; + trace!("Drop CLOG segment {:>04X}", segno); + } } } From d7aeca2f343675f17f2adee05082f89644715469 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?JC=20Gr=C3=BCnhage?= Date: Wed, 11 Dec 2024 10:41:34 +0100 Subject: [PATCH 004/583] CI(deploy): create git tags/releases before triggering deploy workflows (#10022) ## Problem When dev deployments are disabled (or fail), the tags for releases aren't created. It makes more sense to have tag and release creation before the deployment to prevent situations like [this](https://github.com/neondatabase/neon/pull/9959). It is not enough to move the tag creation before the deployment. If the deployment fails, re-running the job isn't possible because the API call to create the tag will fail. 
## Summary of changes - Tag/Release creation now happens before the deployment - The two steps for tag and release have been merged into a bigger one - There's new checks to ensure the that if the tags/releases already exist as expected, things will continue just fine. --- .github/workflows/build_and_test.yml | 93 +++++++++++++++++++--------- 1 file changed, 64 insertions(+), 29 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index cb966f292e..6023d1bb6f 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -1066,6 +1066,70 @@ jobs: steps: - uses: actions/checkout@v4 + - name: Create git tag and GitHub release + if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' + uses: actions/github-script@v7 + with: + retries: 5 + script: | + const tag = "${{ needs.tag.outputs.build-tag }}"; + + try { + const existingRef = await github.rest.git.getRef({ + owner: context.repo.owner, + repo: context.repo.repo, + ref: `tags/${tag}`, + }); + + if (existingRef.data.object.sha !== context.sha) { + throw new Error(`Tag ${tag} already exists but points to a different commit (expected: ${context.sha}, actual: ${existingRef.data.object.sha}).`); + } + + console.log(`Tag ${tag} already exists and points to ${context.sha} as expected.`); + } catch (error) { + if (error.status !== 404) { + throw error; + } + + console.log(`Tag ${tag} does not exist. Creating it...`); + await github.rest.git.createRef({ + owner: context.repo.owner, + repo: context.repo.repo, + ref: `refs/tags/${tag}`, + sha: context.sha, + }); + console.log(`Tag ${tag} created successfully.`); + } + + # TODO: check how GitHub releases looks for proxy/compute releases and enable them if they're ok + if (context.ref !== 'refs/heads/release') { + console.log(`GitHub release skipped for ${context.ref}.`); + return; + } + + try { + const existingRelease = await github.rest.repos.getReleaseByTag({ + owner: context.repo.owner, + repo: context.repo.repo, + tag: tag, + }); + + console.log(`Release for tag ${tag} already exists (ID: ${existingRelease.data.id}).`); + } catch (error) { + if (error.status !== 404) { + throw error; + } + + console.log(`Release for tag ${tag} does not exist. 
Creating it...`); + await github.rest.repos.createRelease({ + owner: context.repo.owner, + repo: context.repo.repo, + tag_name: tag, + generate_release_notes: true, + }); + console.log(`Release for tag ${tag} created successfully.`); + } + - name: Trigger deploy workflow env: GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} @@ -1115,35 +1179,6 @@ jobs: exit 1 fi - - name: Create git tag - if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' - uses: actions/github-script@v7 - with: - # Retry script for 5XX server errors: https://github.com/actions/github-script#retries - retries: 5 - script: | - await github.rest.git.createRef({ - owner: context.repo.owner, - repo: context.repo.repo, - ref: "refs/tags/${{ needs.tag.outputs.build-tag }}", - sha: context.sha, - }) - - # TODO: check how GitHub releases looks for proxy releases and enable it if it's ok - - name: Create GitHub release - if: github.ref_name == 'release' - uses: actions/github-script@v7 - with: - # Retry script for 5XX server errors: https://github.com/actions/github-script#retries - retries: 5 - script: | - await github.rest.repos.createRelease({ - owner: context.repo.owner, - repo: context.repo.repo, - tag_name: "${{ needs.tag.outputs.build-tag }}", - generate_release_notes: true, - }) - # The job runs on `release` branch and copies compatibility data and Neon artifact from the last *release PR* to the latest directory promote-compatibility-data: needs: [ deploy ] From 665369c439d6ec1d107dea7ccc80ce64b080297a Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 11 Dec 2024 12:35:02 +0000 Subject: [PATCH 005/583] wal_decoder: fix compact key protobuf encoding (#10074) ## Problem Protobuf doesn't support 128 bit integers, so we encode the keys as two 64 bit integers. Issue is that when we split the 128 bit compact key we use signed 64 bit integers to represent the two halves. This may result in a negative lower half when relnode is larger than `0x00800000`. When we convert the lower half to an i128 we get a negative `CompactKey`. ## Summary of Changes Use unsigned integers when encoding into Protobuf. ## Deployment * Prod: We disabled the interpreted proto, so no compat concerns. * Staging: Disable the interpreted proto, do one release, and then release the fixed version. We do this because a negative int32 will convert to a large uint32 value and could give a key in the actual pageserver space. In production we would around this by adding new fields to the proto and deprecating the old ones, but we can make our lives easy here. * Pre-prod: Same as staging --- libs/pageserver_api/src/key.rs | 2 +- libs/wal_decoder/proto/interpreted_wal.proto | 4 +- libs/wal_decoder/src/wire_format.rs | 65 +++++++++++++++++++- 3 files changed, 66 insertions(+), 5 deletions(-) diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 37dff6fe46..373329c9b4 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -24,7 +24,7 @@ pub struct Key { /// When working with large numbers of Keys in-memory, it is more efficient to handle them as i128 than as /// a struct of fields. -#[derive(Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize)] +#[derive(Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize, Debug)] pub struct CompactKey(i128); /// The storage key size. 
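As an illustration (not part of the patch, variable names here are ad hoc): the sign-extension hazard described in the Problem section above can be reproduced in a few lines. Splitting the 128-bit key into *signed* 64-bit halves breaks the round trip as soon as bit 63 of the lower half is set, because converting a negative `i64` back with `as i128` sign-extends and smears 1-bits over the high half; unsigned halves zero-extend and round-trip cleanly. The constant below is arbitrary, chosen only so that bit 63 of the low half is set (the "large relnode" case); the patch's own `test_compact_key_with_large_relnode` further down exercises the same boundary against real `Key` values.

```rust
fn main() {
    // A 128-bit "compact key" whose lower half has bit 63 set.
    let raw: i128 = (0x0000_0100_0000_0200_i128 << 64) | 0x8000_0000_0000_0010_u64 as i128;

    // Old encoding: signed halves. `raw as i64` is negative here.
    let (high_s, low_s) = ((raw >> 64) as i64, raw as i64);
    // Decoding sign-extends the negative low half, corrupting the high half too.
    let decoded_signed = ((high_s as i128) << 64) | (low_s as i128);

    // Fixed encoding: unsigned halves zero-extend on the way back.
    let (high_u, low_u) = ((raw >> 64) as u64, raw as u64);
    let decoded_unsigned = ((high_u as i128) << 64) | (low_u as i128);

    assert_ne!(decoded_signed, raw); // corrupted (and negative)
    assert_eq!(decoded_unsigned, raw); // round-trips exactly
}
```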
diff --git a/libs/wal_decoder/proto/interpreted_wal.proto b/libs/wal_decoder/proto/interpreted_wal.proto index 0393392c1a..d68484d30f 100644 --- a/libs/wal_decoder/proto/interpreted_wal.proto +++ b/libs/wal_decoder/proto/interpreted_wal.proto @@ -37,7 +37,7 @@ message ValueMeta { } message CompactKey { - int64 high = 1; - int64 low = 2; + uint64 high = 1; + uint64 low = 2; } diff --git a/libs/wal_decoder/src/wire_format.rs b/libs/wal_decoder/src/wire_format.rs index 5a343054c3..944ee5c919 100644 --- a/libs/wal_decoder/src/wire_format.rs +++ b/libs/wal_decoder/src/wire_format.rs @@ -236,8 +236,8 @@ impl From for proto::ValueMeta { impl From for proto::CompactKey { fn from(value: CompactKey) -> Self { proto::CompactKey { - high: (value.raw() >> 64) as i64, - low: value.raw() as i64, + high: (value.raw() >> 64) as u64, + low: value.raw() as u64, } } } @@ -354,3 +354,64 @@ impl From for CompactKey { (((value.high as i128) << 64) | (value.low as i128)).into() } } + +#[test] +fn test_compact_key_with_large_relnode() { + use pageserver_api::key::Key; + + let inputs = vec![ + Key { + field1: 0, + field2: 0x100, + field3: 0x200, + field4: 0, + field5: 0x10, + field6: 0x5, + }, + Key { + field1: 0, + field2: 0x100, + field3: 0x200, + field4: 0x007FFFFF, + field5: 0x10, + field6: 0x5, + }, + Key { + field1: 0, + field2: 0x100, + field3: 0x200, + field4: 0x00800000, + field5: 0x10, + field6: 0x5, + }, + Key { + field1: 0, + field2: 0x100, + field3: 0x200, + field4: 0x00800001, + field5: 0x10, + field6: 0x5, + }, + Key { + field1: 0, + field2: 0xFFFFFFFF, + field3: 0xFFFFFFFF, + field4: 0xFFFFFFFF, + field5: 0x0, + field6: 0x0, + }, + ]; + + for input in inputs { + assert!(input.is_valid_key_on_write_path()); + let compact = input.to_compact(); + let proto: proto::CompactKey = compact.into(); + let from_proto: CompactKey = proto.into(); + + assert_eq!( + compact, from_proto, + "Round trip failed for key with relnode={:#x}", + input.field4 + ); + } +} From 9ae980bf4f320c20dc48adce90e92e74fd2ea45c Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 11 Dec 2024 14:37:08 +0100 Subject: [PATCH 006/583] page_service: don't count time spent in Batcher towards smgr latency metrics (#10075) ## Problem With pipelining enabled, the time a request spends in the batcher stage counts towards the smgr op latency. If pipelining is disabled, that time is not accounted for. In practice, this results in a jump in smgr getpage latencies in various dashboards and degrades the internal SLO. ## Solution In a similar vein to #10042 and with a similar rationale, this PR stops counting the time spent in batcher stage towards smgr op latency. The smgr op latency metric is reduced to the actual execution time. Time spent in batcher stage is tracked in a separate histogram. I expect to remove that histogram after batching rollout is complete, but it will be helpful in the meantime to reason about the rollout. 
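The shape of the change is easiest to see as a small state machine over a request's lifetime: stamp the request when it is received, stamp it again when it leaves the batch and starts executing, and at completion report the two intervals to different histograms. The sketch below is illustrative only and assumes nothing beyond `std`; `OpTimer`/`OpTimerState` are simplified stand-ins for the `SmgrOpTimer`/`SmgrOpTimerState` types in the diff that follows, which additionally exclude throttle time and track flush time separately.

```rust
use std::time::{Duration, Instant};

/// Lifetime stages of one smgr request (simplified).
#[derive(Clone, Copy)]
enum OpTimerState {
    Received { received_at: Instant },
    Executing { received_at: Instant, started_execution_at: Instant },
}

struct OpTimer {
    state: OpTimerState,
}

impl OpTimer {
    fn new() -> Self {
        Self { state: OpTimerState::Received { received_at: Instant::now() } }
    }

    /// Called when the request leaves its batch and execution begins.
    fn observe_execution_start(&mut self) {
        if let OpTimerState::Received { received_at } = self.state {
            self.state = OpTimerState::Executing {
                received_at,
                started_execution_at: Instant::now(),
            };
        }
    }

    /// Called at completion: returns (batch wait, execution time).
    /// Only the execution time would feed the smgr op latency histogram;
    /// the batch wait goes to the separate batch-wait histogram.
    fn finish(self) -> (Duration, Duration) {
        let now = Instant::now();
        match self.state {
            OpTimerState::Received { received_at } => (now - received_at, Duration::ZERO),
            OpTimerState::Executing { received_at, started_execution_at } => {
                (started_execution_at - received_at, now - started_execution_at)
            }
        }
    }
}

fn main() {
    let mut timer = OpTimer::new();
    // ... request waits in its batch here ...
    timer.observe_execution_start();
    // ... request executes here ...
    let (batch_wait, execution) = timer.finish();
    println!("batch wait: {batch_wait:?}, execution: {execution:?}");
}
```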
--- pageserver/src/metrics.rs | 168 +++++++++++++++++++++--------- pageserver/src/page_service.rs | 13 ++- pageserver/src/tenant/throttle.rs | 19 ++-- test_runner/fixtures/metrics.py | 1 + 4 files changed, 143 insertions(+), 58 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 96ee157856..b4e20cb8b9 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -16,7 +16,6 @@ use postgres_backend::{is_expected_io_error, QueryError}; use pq_proto::framed::ConnectionError; use strum::{EnumCount, VariantNames}; use strum_macros::{IntoStaticStr, VariantNames}; -use tracing::warn; use utils::id::TimelineId; /// Prometheus histogram buckets (in seconds) for operations in the critical @@ -1225,32 +1224,58 @@ pub(crate) mod virtual_file_io_engine { pub(crate) struct SmgrOpTimer(Option); pub(crate) struct SmgrOpTimerInner { - global_latency_histo: Histogram, + global_execution_latency_histo: Histogram, + per_timeline_execution_latency_histo: Option, - // Optional because not all op types are tracked per-timeline - per_timeline_latency_histo: Option, + global_batch_wait_time: Histogram, + per_timeline_batch_wait_time: Histogram, global_flush_in_progress_micros: IntCounter, per_timeline_flush_in_progress_micros: IntCounter, - start: Instant, - throttled: Duration, - op: SmgrQueryType, + timings: SmgrOpTimerState, +} + +#[derive(Debug)] +enum SmgrOpTimerState { + Received { + received_at: Instant, + }, + ThrottleDoneExecutionStarting { + received_at: Instant, + throttle_started_at: Instant, + started_execution_at: Instant, + }, } pub(crate) struct SmgrOpFlushInProgress { - base: Instant, + flush_started_at: Instant, global_micros: IntCounter, per_timeline_micros: IntCounter, } impl SmgrOpTimer { - pub(crate) fn deduct_throttle(&mut self, throttle: &Option) { - let Some(throttle) = throttle else { - return; - }; + pub(crate) fn observe_throttle_done_execution_starting(&mut self, throttle: &ThrottleResult) { let inner = self.0.as_mut().expect("other public methods consume self"); - inner.throttled += *throttle; + match (&mut inner.timings, throttle) { + (SmgrOpTimerState::Received { received_at }, throttle) => match throttle { + ThrottleResult::NotThrottled { start } => { + inner.timings = SmgrOpTimerState::ThrottleDoneExecutionStarting { + received_at: *received_at, + throttle_started_at: *start, + started_execution_at: *start, + }; + } + ThrottleResult::Throttled { start, end } => { + inner.timings = SmgrOpTimerState::ThrottleDoneExecutionStarting { + received_at: *start, + throttle_started_at: *start, + started_execution_at: *end, + }; + } + }, + (x, _) => panic!("called in unexpected state: {x:?}"), + } } pub(crate) fn observe_smgr_op_completion_and_start_flushing(mut self) -> SmgrOpFlushInProgress { @@ -1263,7 +1288,7 @@ impl SmgrOpTimer { .. 
} = inner; SmgrOpFlushInProgress { - base: flush_start, + flush_started_at: flush_start, global_micros: global_flush_in_progress_micros, per_timeline_micros: per_timeline_flush_in_progress_micros, } @@ -1274,32 +1299,42 @@ impl SmgrOpTimer { let inner = self.0.take()?; let now = Instant::now(); - let elapsed = now - inner.start; - let elapsed = match elapsed.checked_sub(inner.throttled) { - Some(elapsed) => elapsed, - None => { - use utils::rate_limit::RateLimit; - static LOGGED: Lazy>> = - Lazy::new(|| { - Mutex::new(enum_map::EnumMap::from_array(std::array::from_fn(|_| { - RateLimit::new(Duration::from_secs(10)) - }))) - }); - let mut guard = LOGGED.lock().unwrap(); - let rate_limit = &mut guard[inner.op]; - rate_limit.call(|| { - warn!(op=?inner.op, ?elapsed, ?inner.throttled, "implementation error: time spent throttled exceeds total request wall clock time"); - }); - elapsed // un-throttled time, more info than just saturating to 0 + let batch; + let execution; + let throttle; + match inner.timings { + SmgrOpTimerState::Received { received_at } => { + batch = (now - received_at).as_secs_f64(); + // TODO: use label for dropped requests. + // This is quite rare in practice, only during tenant/pageservers shutdown. + throttle = Duration::ZERO; + execution = Duration::ZERO.as_secs_f64(); } - }; + SmgrOpTimerState::ThrottleDoneExecutionStarting { + received_at, + throttle_started_at, + started_execution_at, + } => { + batch = (throttle_started_at - received_at).as_secs_f64(); + throttle = started_execution_at - throttle_started_at; + execution = (now - started_execution_at).as_secs_f64(); + } + } - let elapsed = elapsed.as_secs_f64(); + // update time spent in batching + inner.global_batch_wait_time.observe(batch); + inner.per_timeline_batch_wait_time.observe(batch); - inner.global_latency_histo.observe(elapsed); - if let Some(per_timeline_getpage_histo) = &inner.per_timeline_latency_histo { - per_timeline_getpage_histo.observe(elapsed); + // time spent in throttle metric is updated by throttle impl + let _ = throttle; + + // update metrics for execution latency + inner.global_execution_latency_histo.observe(execution); + if let Some(per_timeline_execution_latency_histo) = + &inner.per_timeline_execution_latency_histo + { + per_timeline_execution_latency_histo.observe(execution); } Some((now, inner)) @@ -1325,12 +1360,12 @@ impl SmgrOpFlushInProgress { // Last call is tracked in `now`. let mut observe_guard = scopeguard::guard( || { - let elapsed = now - self.base; + let elapsed = now - self.flush_started_at; self.global_micros .inc_by(u64::try_from(elapsed.as_micros()).unwrap()); self.per_timeline_micros .inc_by(u64::try_from(elapsed.as_micros()).unwrap()); - self.base = now; + self.flush_started_at = now; }, |mut observe| { observe(); @@ -1377,6 +1412,8 @@ pub(crate) struct SmgrQueryTimePerTimeline { per_timeline_batch_size: Histogram, global_flush_in_progress_micros: IntCounter, per_timeline_flush_in_progress_micros: IntCounter, + global_batch_wait_time: Histogram, + per_timeline_batch_wait_time: Histogram, } static SMGR_QUERY_STARTED_GLOBAL: Lazy = Lazy::new(|| { @@ -1399,12 +1436,15 @@ static SMGR_QUERY_STARTED_PER_TENANT_TIMELINE: Lazy = Lazy::new(| .expect("failed to define a metric") }); +// Alias so all histograms recording per-timeline smgr timings use the same buckets. 
+static SMGR_QUERY_TIME_PER_TENANT_TIMELINE_BUCKETS: &[f64] = CRITICAL_OP_BUCKETS; + static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy = Lazy::new(|| { register_histogram_vec!( "pageserver_smgr_query_seconds", - "Time spent on smgr query handling, aggegated by query type and tenant/timeline.", + "Time spent _executing_ smgr query handling, excluding batch and throttle delays.", &["smgr_query_type", "tenant_id", "shard_id", "timeline_id"], - CRITICAL_OP_BUCKETS.into(), + SMGR_QUERY_TIME_PER_TENANT_TIMELINE_BUCKETS.into(), ) .expect("failed to define a metric") }); @@ -1462,7 +1502,7 @@ static SMGR_QUERY_TIME_GLOBAL_BUCKETS: Lazy> = Lazy::new(|| { static SMGR_QUERY_TIME_GLOBAL: Lazy = Lazy::new(|| { register_histogram_vec!( "pageserver_smgr_query_seconds_global", - "Time spent on smgr query handling, aggregated by query type.", + "Like pageserver_smgr_query_seconds, but aggregated to instance level.", &["smgr_query_type"], SMGR_QUERY_TIME_GLOBAL_BUCKETS.clone(), ) @@ -1559,6 +1599,25 @@ static PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL: Lazy = Lazy .expect("failed to define a metric") }); +static PAGE_SERVICE_SMGR_BATCH_WAIT_TIME: Lazy = Lazy::new(|| { + register_histogram_vec!( + "pageserver_page_service_pagestream_batch_wait_time_seconds", + "Time a request spent waiting in its batch until the batch moved to throttle&execution.", + &["tenant_id", "shard_id", "timeline_id"], + SMGR_QUERY_TIME_PER_TENANT_TIMELINE_BUCKETS.into(), + ) + .expect("failed to define a metric") +}); + +static PAGE_SERVICE_SMGR_BATCH_WAIT_TIME_GLOBAL: Lazy = Lazy::new(|| { + register_histogram!( + "pageserver_page_service_pagestream_batch_wait_time_seconds_global", + "Like pageserver_page_service_pagestream_batch_wait_time_seconds, but aggregated to instance level.", + SMGR_QUERY_TIME_GLOBAL_BUCKETS.to_vec(), + ) + .expect("failed to define a metric") +}); + impl SmgrQueryTimePerTimeline { pub(crate) fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self { let tenant_id = tenant_shard_id.tenant_id.to_string(); @@ -1599,6 +1658,11 @@ impl SmgrQueryTimePerTimeline { .get_metric_with_label_values(&[&tenant_id, &shard_slug, &timeline_id]) .unwrap(); + let global_batch_wait_time = PAGE_SERVICE_SMGR_BATCH_WAIT_TIME_GLOBAL.clone(); + let per_timeline_batch_wait_time = PAGE_SERVICE_SMGR_BATCH_WAIT_TIME + .get_metric_with_label_values(&[&tenant_id, &shard_slug, &timeline_id]) + .unwrap(); + let global_flush_in_progress_micros = PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL.clone(); let per_timeline_flush_in_progress_micros = PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS @@ -1614,9 +1678,11 @@ impl SmgrQueryTimePerTimeline { per_timeline_batch_size, global_flush_in_progress_micros, per_timeline_flush_in_progress_micros, + global_batch_wait_time, + per_timeline_batch_wait_time, } } - pub(crate) fn start_smgr_op(&self, op: SmgrQueryType, started_at: Instant) -> SmgrOpTimer { + pub(crate) fn start_smgr_op(&self, op: SmgrQueryType, received_at: Instant) -> SmgrOpTimer { self.global_started[op as usize].inc(); let per_timeline_latency_histo = if matches!(op, SmgrQueryType::GetPageAtLsn) { @@ -1627,15 +1693,15 @@ impl SmgrQueryTimePerTimeline { }; SmgrOpTimer(Some(SmgrOpTimerInner { - global_latency_histo: self.global_latency[op as usize].clone(), - per_timeline_latency_histo, - start: started_at, - op, - throttled: Duration::ZERO, + global_execution_latency_histo: self.global_latency[op as usize].clone(), + per_timeline_execution_latency_histo: per_timeline_latency_histo, + timings: 
SmgrOpTimerState::Received { received_at }, global_flush_in_progress_micros: self.global_flush_in_progress_micros.clone(), per_timeline_flush_in_progress_micros: self .per_timeline_flush_in_progress_micros .clone(), + global_batch_wait_time: self.global_batch_wait_time.clone(), + per_timeline_batch_wait_time: self.per_timeline_batch_wait_time.clone(), })) } @@ -2889,6 +2955,11 @@ impl TimelineMetrics { shard_id, timeline_id, ]); + let _ = PAGE_SERVICE_SMGR_BATCH_WAIT_TIME.remove_label_values(&[ + tenant_id, + shard_id, + timeline_id, + ]); } } @@ -2919,6 +2990,7 @@ use crate::context::{PageContentKind, RequestContext}; use crate::task_mgr::TaskKind; use crate::tenant::mgr::TenantSlot; use crate::tenant::tasks::BackgroundLoopKind; +use crate::tenant::throttle::ThrottleResult; use crate::tenant::Timeline; /// Maintain a per timeline gauge in addition to the global gauge. @@ -3773,6 +3845,7 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) { &REMOTE_ONDEMAND_DOWNLOADED_BYTES, &CIRCUIT_BREAKERS_BROKEN, &CIRCUIT_BREAKERS_UNBROKEN, + &PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL, ] .into_iter() .for_each(|c| { @@ -3820,6 +3893,7 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) { &WAL_REDO_BYTES_HISTOGRAM, &WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM, &PAGE_SERVICE_BATCH_SIZE_GLOBAL, + &PAGE_SERVICE_SMGR_BATCH_WAIT_TIME_GLOBAL, ] .into_iter() .for_each(|h| { diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 97d94bbe7f..d00ec11a76 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -575,7 +575,10 @@ enum BatchedFeMessage { } impl BatchedFeMessage { - async fn throttle(&mut self, cancel: &CancellationToken) -> Result<(), QueryError> { + async fn throttle_and_record_start_processing( + &mut self, + cancel: &CancellationToken, + ) -> Result<(), QueryError> { let (shard, tokens, timers) = match self { BatchedFeMessage::Exists { shard, timer, .. } | BatchedFeMessage::Nblocks { shard, timer, .. 
} @@ -603,7 +606,7 @@ impl BatchedFeMessage { } }; for timer in timers { - timer.deduct_throttle(&throttled); + timer.observe_throttle_done_execution_starting(&throttled); } Ok(()) } @@ -1230,7 +1233,7 @@ impl PageServerHandler { } }; - if let Err(cancelled) = msg.throttle(&self.cancel).await { + if let Err(cancelled) = msg.throttle_and_record_start_processing(&self.cancel).await { break cancelled; } @@ -1397,7 +1400,9 @@ impl PageServerHandler { return Err(e); } }; - batch.throttle(&self.cancel).await?; + batch + .throttle_and_record_start_processing(&self.cancel) + .await?; self.pagesteam_handle_batched_message(pgb_writer, batch, &cancel, &ctx) .await?; } diff --git a/pageserver/src/tenant/throttle.rs b/pageserver/src/tenant/throttle.rs index 54c0e59daa..8ab6a0e060 100644 --- a/pageserver/src/tenant/throttle.rs +++ b/pageserver/src/tenant/throttle.rs @@ -58,6 +58,11 @@ pub struct Stats { pub sum_throttled_usecs: u64, } +pub enum ThrottleResult { + NotThrottled { start: Instant }, + Throttled { start: Instant, end: Instant }, +} + impl Throttle where M: Metric, @@ -122,15 +127,15 @@ where self.inner.load().rate_limiter.steady_rps() } - pub async fn throttle(&self, key_count: usize) -> Option { + pub async fn throttle(&self, key_count: usize) -> ThrottleResult { let inner = self.inner.load_full(); // clones the `Inner` Arc - if !inner.enabled { - return None; - } - let start = std::time::Instant::now(); + if !inner.enabled { + return ThrottleResult::NotThrottled { start }; + } + self.metric.accounting_start(); self.count_accounted_start.fetch_add(1, Ordering::Relaxed); let did_throttle = inner.rate_limiter.acquire(key_count).await; @@ -145,9 +150,9 @@ where .fetch_add(wait_time.as_micros() as u64, Ordering::Relaxed); let observation = Observation { wait_time }; self.metric.observe_throttling(&observation); - Some(wait_time) + ThrottleResult::Throttled { start, end: now } } else { - None + ThrottleResult::NotThrottled { start } } } } diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index a591e088ef..c5295360c3 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -178,6 +178,7 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = ( counter("pageserver_timeline_wal_records_received"), counter("pageserver_page_service_pagestream_flush_in_progress_micros"), *histogram("pageserver_page_service_batch_size"), + *histogram("pageserver_page_service_pagestream_batch_wait_time_seconds"), *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS, # "pageserver_directory_entries_count", -- only used if above a certain threshold # "pageserver_broken_tenants_count" -- used only for broken From a53db7385151d3bc9b18d7e70b110cee36d5d32c Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 11 Dec 2024 14:28:18 +0000 Subject: [PATCH 007/583] pageserver: don't drop multixact slrus on non zero shards (#10086) ## Problem We get slru truncation commands on non-zero shards. Compaction will drop the slru dir keys and ingest will fail when receiving such records. https://github.com/neondatabase/neon/pull/10080 fixed it for clog, but not for multixact. ## Summary of changes Only truncate multixact slrus on shard zero. I audited the rest of the ingest code and it looks fine from this pov. 
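Sketched abstractly (illustrative only; `segments_to_drop` is a made-up helper, and the actual change is the small diff below): the segment-drop loop, including its wraparound handling, now runs only on shard zero, since SLRUs are no longer materialized on other shards.

```rust
fn segments_to_drop(is_shard_zero: bool, mut segment: u32, endsegment: u32, maxsegment: u32) -> Vec<u32> {
    let mut to_drop = Vec::new();
    if !is_shard_zero {
        return to_drop; // non-zero shards hold no SLRU segments, so there is nothing to drop
    }
    // Collect all segments except the last one, handling segment-number wraparound.
    while segment != endsegment {
        to_drop.push(segment);
        segment = if segment == maxsegment { 0 } else { segment + 1 };
    }
    to_drop
}

fn main() {
    assert!(segments_to_drop(false, 8, 2, 9).is_empty()); // non-zero shard: no-op
    assert_eq!(segments_to_drop(true, 8, 2, 9), vec![8, 9, 0, 1]); // wraps past maxsegment
}
```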
--- pageserver/src/walingest.rs | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index b7712cfac7..e5b23fed51 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -1049,16 +1049,18 @@ impl WalIngest { // Delete all the segments except the last one. The last segment can still // contain, possibly partially, valid data. - while segment != endsegment { - modification - .drop_slru_segment(SlruKind::MultiXactMembers, segment as u32, ctx) - .await?; + if modification.tline.get_shard_identity().is_shard_zero() { + while segment != endsegment { + modification + .drop_slru_segment(SlruKind::MultiXactMembers, segment as u32, ctx) + .await?; - /* move to next segment, handling wraparound correctly */ - if segment == maxsegment { - segment = 0; - } else { - segment += 1; + /* move to next segment, handling wraparound correctly */ + if segment == maxsegment { + segment = 0; + } else { + segment += 1; + } } } From c79c1dd8e90a8d813c0e02d8dcbfaaae99779b8e Mon Sep 17 00:00:00 2001 From: Mikhail Kot Date: Wed, 11 Dec 2024 15:03:11 +0000 Subject: [PATCH 008/583] compute_ctl: don't panic if control plane can't be reached (#10078) ## Problem If the control plane cannot be reached for some reason, compute_ctl panics ## Summary of changes panic is removed in favour of returning an error. Code is reformatted a bit for more flat control flow Resolves: #5391 --- compute_tools/src/bin/compute_ctl.rs | 79 ++++++++++++++-------------- 1 file changed, 40 insertions(+), 39 deletions(-) diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index e73ccd908e..bb248734a8 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -246,47 +246,48 @@ fn try_spec_from_cli( let compute_id = matches.get_one::("compute-id"); let control_plane_uri = matches.get_one::("control-plane-uri"); - let spec; - let mut live_config_allowed = false; - match spec_json { - // First, try to get cluster spec from the cli argument - Some(json) => { - info!("got spec from cli argument {}", json); - spec = Some(serde_json::from_str(json)?); - } - None => { - // Second, try to read it from the file if path is provided - if let Some(sp) = spec_path { - let path = Path::new(sp); - let file = File::open(path)?; - spec = Some(serde_json::from_reader(file)?); - live_config_allowed = true; - } else if let Some(id) = compute_id { - if let Some(cp_base) = control_plane_uri { - live_config_allowed = true; - spec = match get_spec_from_control_plane(cp_base, id) { - Ok(s) => s, - Err(e) => { - error!("cannot get response from control plane: {}", e); - panic!("neither spec nor confirmation that compute is in the Empty state was received"); - } - }; - } else { - panic!("must specify both --control-plane-uri and --compute-id or none"); - } - } else { - panic!( - "compute spec should be provided by one of the following ways: \ - --spec OR --spec-path OR --control-plane-uri and --compute-id" - ); - } - } + // First, try to get cluster spec from the cli argument + if let Some(spec_json) = spec_json { + info!("got spec from cli argument {}", spec_json); + return Ok(CliSpecParams { + spec: Some(serde_json::from_str(spec_json)?), + live_config_allowed: false, + }); + } + + // Second, try to read it from the file if path is provided + if let Some(spec_path) = spec_path { + let file = File::open(Path::new(spec_path))?; + return Ok(CliSpecParams { + spec: 
Some(serde_json::from_reader(file)?), + live_config_allowed: true, + }); + } + + let Some(compute_id) = compute_id else { + panic!( + "compute spec should be provided by one of the following ways: \ + --spec OR --spec-path OR --control-plane-uri and --compute-id" + ); + }; + let Some(control_plane_uri) = control_plane_uri else { + panic!("must specify both --control-plane-uri and --compute-id or none"); }; - Ok(CliSpecParams { - spec, - live_config_allowed, - }) + match get_spec_from_control_plane(control_plane_uri, compute_id) { + Ok(spec) => Ok(CliSpecParams { + spec, + live_config_allowed: true, + }), + Err(e) => { + error!( + "cannot get response from control plane: {}\n\ + neither spec nor confirmation that compute is in the Empty state was received", + e + ); + Err(e) + } + } } struct CliSpecParams { From b987648e713b7842be1a41ab2eca898a9c90eefb Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Wed, 11 Dec 2024 16:28:10 +0100 Subject: [PATCH 009/583] Enable LFC for all the PG versions. (#10068) ## Problem We added support for LFC for tests but are still using it only for the PG17 release. ## Summary of changes LFC is enabled for all PG versions. Errors in tests with LFC enabled now block merging as usual. We keep tests with disabled LFC for PG17 release. Tests on debug builds with LFC enabled still don't affect permission to merge. --- .github/workflows/_build-and-test-locally.yml | 2 +- .github/workflows/build_and_test.yml | 18 ++++++++++-------- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 42c32a23e3..7d47f78d6b 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -283,7 +283,7 @@ jobs: submodules: true - name: Pytest regression tests - continue-on-error: ${{ matrix.lfc_state == 'with-lfc' }} + continue-on-error: ${{ matrix.lfc_state == 'with-lfc' && inputs.build-type == 'debug' }} uses: ./.github/actions/run-python-test-set timeout-minutes: 60 with: diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 6023d1bb6f..ee22f2ff54 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -255,15 +255,17 @@ jobs: build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm build-tag: ${{ needs.tag.outputs.build-tag }} build-type: ${{ matrix.build-type }} - # Run tests on all Postgres versions in release builds and only on the latest version in debug builds - # run without LFC on v17 release only + # Run tests on all Postgres versions in release builds and only on the latest version in debug builds. + # Run without LFC on v17 release and debug builds only. For all the other cases LFC is enabled. Failure on the + # debug build with LFC enabled doesn't block merging. 
test-cfg: | - ${{ matrix.build-type == 'release' && '[{"pg_version":"v14", "lfc_state": "without-lfc"}, - {"pg_version":"v15", "lfc_state": "without-lfc"}, - {"pg_version":"v16", "lfc_state": "without-lfc"}, - {"pg_version":"v17", "lfc_state": "without-lfc"}, - {"pg_version":"v17", "lfc_state": "with-lfc"}]' - || '[{"pg_version":"v17", "lfc_state": "without-lfc"}]' }} + ${{ matrix.build-type == 'release' && '[{"pg_version":"v14", "lfc_state": "with-lfc"}, + {"pg_version":"v15", "lfc_state": "with-lfc"}, + {"pg_version":"v16", "lfc_state": "with-lfc"}, + {"pg_version":"v17", "lfc_state": "with-lfc"}, + {"pg_version":"v17", "lfc_state": "without-lfc"}]' + || '[{"pg_version":"v17", "lfc_state": "without-lfc"}, + {"pg_version":"v17", "lfc_state": "with-lfc" }]' }} secrets: inherit # Keep `benchmarks` job outside of `build-and-test-locally` workflow to make job failures non-blocking From e4bb1ca7d82ed1d6663459fcfea3f6be78be00ea Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Wed, 11 Dec 2024 18:46:50 +0300 Subject: [PATCH 010/583] Increase neon_local http client to compute timeout in reconfigure. (#10088) Seems like 30s sometimes not enough when CI runners are overloaded, causing pull_timeline flakiness. ref https://github.com/neondatabase/neon/issues/9731#issuecomment-2535946443 --- control_plane/src/endpoint.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 35067c95b6..1fdf326051 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -810,7 +810,7 @@ impl Endpoint { } let client = reqwest::Client::builder() - .timeout(Duration::from_secs(30)) + .timeout(Duration::from_secs(120)) .build() .unwrap(); let response = client From dee2041cd3a7bd74b732933890345cd354fd1c80 Mon Sep 17 00:00:00 2001 From: Mikhail Kot Date: Wed, 11 Dec 2024 16:23:59 +0000 Subject: [PATCH 011/583] walproposer: fix link error on debian 12 / ubuntu 22 (#10090) ## Problem Linking walproposer library (e.g. `cargo t`) produces linker errors: /home/myrrc/neon/pgxn/neon/walproposer_compat.c:169: undefined reference to `pg_snprintf' The library with these symbols (libpgcommon.a) is present ## Summary of changes Changed order of libraries resolution for linker --- libs/walproposer/build.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/walproposer/build.rs b/libs/walproposer/build.rs index 3f549889b8..8d5b1ade35 100644 --- a/libs/walproposer/build.rs +++ b/libs/walproposer/build.rs @@ -30,9 +30,9 @@ fn main() -> anyhow::Result<()> { let pgxn_neon = std::fs::canonicalize(pgxn_neon)?; let pgxn_neon = pgxn_neon.to_str().ok_or(anyhow!("Bad non-UTF path"))?; + println!("cargo:rustc-link-lib=static=walproposer"); println!("cargo:rustc-link-lib=static=pgport"); println!("cargo:rustc-link-lib=static=pgcommon"); - println!("cargo:rustc-link-lib=static=walproposer"); println!("cargo:rustc-link-search={walproposer_lib_search_str}"); // Rebuild crate when libwalproposer.a changes From ef233e91ef7446cbf70898aafdc98b8335c53c59 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Wed, 11 Dec 2024 16:43:26 +0000 Subject: [PATCH 012/583] Update compute_installed_extensions metric: (#9891) add owned_by_superuser field to filter out system extensions. While on it, also correct related code: - fix the metric setting: use set() instead of inc() in a loop. inc() is not idempotent and can lead to incorrect results if the function called multiple times. 
Currently it is only called at compute start, but this will change soon. - fix the return type of the installed_extensions endpoint to match the metric. Currently it is only used in the test. --- compute_tools/src/http/openapi_spec.yaml | 6 +- compute_tools/src/installed_extensions.rs | 55 ++++++++++++------- libs/compute_api/src/responses.rs | 4 +- .../regress/test_installed_extensions.py | 28 +++++----- 4 files changed, 56 insertions(+), 37 deletions(-) diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml index 7b9a62c545..24a67cac71 100644 --- a/compute_tools/src/http/openapi_spec.yaml +++ b/compute_tools/src/http/openapi_spec.yaml @@ -537,12 +537,14 @@ components: properties: extname: type: string - versions: - type: array + version: + type: string items: type: string n_databases: type: integer + owned_by_superuser: + type: integer SetRoleGrantsRequest: type: object diff --git a/compute_tools/src/installed_extensions.rs b/compute_tools/src/installed_extensions.rs index 5f62f08858..0ab259ddf1 100644 --- a/compute_tools/src/installed_extensions.rs +++ b/compute_tools/src/installed_extensions.rs @@ -1,7 +1,6 @@ use compute_api::responses::{InstalledExtension, InstalledExtensions}; use metrics::proto::MetricFamily; use std::collections::HashMap; -use std::collections::HashSet; use anyhow::Result; use postgres::{Client, NoTls}; @@ -38,61 +37,77 @@ fn list_dbs(client: &mut Client) -> Result> { /// Connect to every database (see list_dbs above) and get the list of installed extensions. /// /// Same extension can be installed in multiple databases with different versions, -/// we only keep the highest and lowest version across all databases. +/// so we report a separate metric (number of databases where it is installed) +/// for each extension version. pub fn get_installed_extensions(mut conf: postgres::config::Config) -> Result { conf.application_name("compute_ctl:get_installed_extensions"); let mut client = conf.connect(NoTls)?; - let databases: Vec = list_dbs(&mut client)?; - let mut extensions_map: HashMap = HashMap::new(); + let mut extensions_map: HashMap<(String, String, String), InstalledExtension> = HashMap::new(); for db in databases.iter() { conf.dbname(db); let mut db_client = conf.connect(NoTls)?; - let extensions: Vec<(String, String)> = db_client + let extensions: Vec<(String, String, i32)> = db_client .query( - "SELECT extname, extversion FROM pg_catalog.pg_extension;", + "SELECT extname, extversion, extowner::integer FROM pg_catalog.pg_extension", &[], )? 
.iter() - .map(|row| (row.get("extname"), row.get("extversion"))) + .map(|row| { + ( + row.get("extname"), + row.get("extversion"), + row.get("extowner"), + ) + }) .collect(); - for (extname, v) in extensions.iter() { + for (extname, v, extowner) in extensions.iter() { let version = v.to_string(); - // increment the number of databases where the version of extension is installed - INSTALLED_EXTENSIONS - .with_label_values(&[extname, &version]) - .inc(); + // check if the extension is owned by superuser + // 10 is the oid of superuser + let owned_by_superuser = if *extowner == 10 { "1" } else { "0" }; extensions_map - .entry(extname.to_string()) + .entry(( + extname.to_string(), + version.clone(), + owned_by_superuser.to_string(), + )) .and_modify(|e| { - e.versions.insert(version.clone()); // count the number of databases where the extension is installed e.n_databases += 1; }) .or_insert(InstalledExtension { extname: extname.to_string(), - versions: HashSet::from([version.clone()]), + version: version.clone(), n_databases: 1, + owned_by_superuser: owned_by_superuser.to_string(), }); } } - let res = InstalledExtensions { - extensions: extensions_map.into_values().collect(), - }; + for (key, ext) in extensions_map.iter() { + let (extname, version, owned_by_superuser) = key; + let n_databases = ext.n_databases as u64; - Ok(res) + INSTALLED_EXTENSIONS + .with_label_values(&[extname, version, owned_by_superuser]) + .set(n_databases); + } + + Ok(InstalledExtensions { + extensions: extensions_map.into_values().collect(), + }) } static INSTALLED_EXTENSIONS: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "compute_installed_extensions", "Number of databases where the version of extension is installed", - &["extension_name", "version"] + &["extension_name", "version", "owned_by_superuser"] ) .expect("failed to define a metric") }); diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index 79234be720..0d65f6a38d 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -1,6 +1,5 @@ //! Structs representing the JSON formats used in the compute_ctl's HTTP API. -use std::collections::HashSet; use std::fmt::Display; use chrono::{DateTime, Utc}; @@ -163,8 +162,9 @@ pub enum ControlPlaneComputeStatus { #[derive(Clone, Debug, Default, Serialize)] pub struct InstalledExtension { pub extname: String, - pub versions: HashSet, + pub version: String, pub n_databases: u32, // Number of databases using this extension + pub owned_by_superuser: String, } #[derive(Clone, Debug, Default, Serialize)] diff --git a/test_runner/regress/test_installed_extensions.py b/test_runner/regress/test_installed_extensions.py index 04ccec5875..4e51e7e10c 100644 --- a/test_runner/regress/test_installed_extensions.py +++ b/test_runner/regress/test_installed_extensions.py @@ -30,7 +30,7 @@ def test_installed_extensions(neon_simple_env: NeonEnv): info("Extensions: %s", res["extensions"]) # 'plpgsql' is a default extension that is always installed. 
assert any( - ext["extname"] == "plpgsql" and ext["versions"] == ["1.0"] for ext in res["extensions"] + ext["extname"] == "plpgsql" and ext["version"] == "1.0" for ext in res["extensions"] ), "The 'plpgsql' extension is missing" # check that the neon_test_utils extension is not installed @@ -63,7 +63,7 @@ def test_installed_extensions(neon_simple_env: NeonEnv): # and has the expected version assert any( ext["extname"] == "neon_test_utils" - and ext["versions"] == [neon_test_utils_version] + and ext["version"] == neon_test_utils_version and ext["n_databases"] == 1 for ext in res["extensions"] ) @@ -75,9 +75,8 @@ def test_installed_extensions(neon_simple_env: NeonEnv): # check that the neon extension is installed and has expected versions for ext in res["extensions"]: if ext["extname"] == "neon": - assert ext["n_databases"] == 2 - ext["versions"].sort() - assert ext["versions"] == ["1.1", "1.2"] + assert ext["version"] in ["1.1", "1.2"] + assert ext["n_databases"] == 1 with pg_conn.cursor() as cur: cur.execute("ALTER EXTENSION neon UPDATE TO '1.3'") @@ -90,9 +89,8 @@ def test_installed_extensions(neon_simple_env: NeonEnv): # check that the neon_test_utils extension is updated for ext in res["extensions"]: if ext["extname"] == "neon": - assert ext["n_databases"] == 2 - ext["versions"].sort() - assert ext["versions"] == ["1.2", "1.3"] + assert ext["version"] in ["1.2", "1.3"] + assert ext["n_databases"] == 1 # check that /metrics endpoint is available # ensure that we see the metric before and after restart @@ -100,13 +98,15 @@ def test_installed_extensions(neon_simple_env: NeonEnv): info("Metrics: %s", res) m = parse_metrics(res) neon_m = m.query_all( - "compute_installed_extensions", {"extension_name": "neon", "version": "1.2"} + "compute_installed_extensions", + {"extension_name": "neon", "version": "1.2", "owned_by_superuser": "1"}, ) assert len(neon_m) == 1 for sample in neon_m: - assert sample.value == 2 + assert sample.value == 1 neon_m = m.query_all( - "compute_installed_extensions", {"extension_name": "neon", "version": "1.3"} + "compute_installed_extensions", + {"extension_name": "neon", "version": "1.3", "owned_by_superuser": "1"}, ) assert len(neon_m) == 1 for sample in neon_m: @@ -138,14 +138,16 @@ def test_installed_extensions(neon_simple_env: NeonEnv): info("After restart metrics: %s", res) m = parse_metrics(res) neon_m = m.query_all( - "compute_installed_extensions", {"extension_name": "neon", "version": "1.2"} + "compute_installed_extensions", + {"extension_name": "neon", "version": "1.2", "owned_by_superuser": "1"}, ) assert len(neon_m) == 1 for sample in neon_m: assert sample.value == 1 neon_m = m.query_all( - "compute_installed_extensions", {"extension_name": "neon", "version": "1.3"} + "compute_installed_extensions", + {"extension_name": "neon", "version": "1.3", "owned_by_superuser": "1"}, ) assert len(neon_m) == 1 for sample in neon_m: From a3e80448e8af32f13bcfa3a605e7b9a22d74a01e Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 11 Dec 2024 19:16:33 +0000 Subject: [PATCH 013/583] pageserver/storcon: add patch endpoints for tenant config metrics (#10020) ## Problem Cplane and storage controller tenant config changes are not additive. Any change overrides all existing tenant configs. This would be fine if both did client side patching, but that's not the case. Once this merges, we must update cplane to use the PATCH endpoint. ## Summary of changes ### High Level Allow for patching of tenant configuration with a `PATCH /v1/tenant/config` endpoint. 
It takes the same data as it's PUT counterpart. For example the payload below will update `gc_period` and unset `compaction_period`. All other fields are left in their original state. ``` { "tenant_id": "1234", "gc_period": "10s", "compaction_period": null } ``` ### Low Level * PS and storcon gain `PATCH /v1/tenant/config` endpoints. PS endpoint is only used for cplane managed instances. * `storcon_cli` is updated to have separate commands for `set-tenant-config` and `patch-tenant-config` Related https://github.com/neondatabase/cloud/issues/21043 --- control_plane/src/pageserver.rs | 2 +- control_plane/storcon_cli/src/main.rs | 34 ++- libs/pageserver_api/src/models.rs | 261 +++++++++++++++++- libs/utils/src/lib.rs | 2 + libs/utils/src/try_rcu.rs | 77 ++++++ pageserver/client/src/mgmt_api.rs | 8 +- pageserver/pagebench/src/cmd/aux_files.rs | 2 +- pageserver/src/http/openapi_spec.yml | 22 +- pageserver/src/http/routes.rs | 46 ++- pageserver/src/tenant.rs | 28 +- pageserver/src/tenant/config.rs | 125 ++++++++- storage_controller/src/http.rs | 33 ++- storage_controller/src/service.rs | 80 +++++- test_runner/fixtures/pageserver/http.py | 29 +- .../regress/test_disk_usage_eviction.py | 4 +- .../regress/test_ingestion_layer_size.py | 2 +- .../regress/test_layers_from_future.py | 2 +- .../test_pageserver_crash_consistency.py | 2 +- .../regress/test_storage_controller.py | 4 +- test_runner/regress/test_tenant_conf.py | 83 +++++- .../regress/test_threshold_based_eviction.py | 2 +- .../regress/test_timeline_detach_ancestor.py | 2 +- 22 files changed, 800 insertions(+), 50 deletions(-) create mode 100644 libs/utils/src/try_rcu.rs diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 1d1455b95b..9d3f018345 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -435,7 +435,7 @@ impl PageServerNode { ) -> anyhow::Result<()> { let config = Self::parse_config(settings)?; self.http_client - .tenant_config(&models::TenantConfigRequest { tenant_id, config }) + .set_tenant_config(&models::TenantConfigRequest { tenant_id, config }) .await?; Ok(()) diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index e879424532..df07216fde 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -9,8 +9,8 @@ use pageserver_api::{ }, models::{ EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary, - ShardParameters, TenantConfig, TenantConfigRequest, TenantShardSplitRequest, - TenantShardSplitResponse, + ShardParameters, TenantConfig, TenantConfigPatchRequest, TenantConfigRequest, + TenantShardSplitRequest, TenantShardSplitResponse, }, shard::{ShardStripeSize, TenantShardId}, }; @@ -116,9 +116,19 @@ enum Command { #[arg(long)] tenant_shard_id: TenantShardId, }, - /// Modify the pageserver tenant configuration of a tenant: this is the configuration structure + /// Set the pageserver tenant configuration of a tenant: this is the configuration structure /// that is passed through to pageservers, and does not affect storage controller behavior. - TenantConfig { + /// Any previous tenant configs are overwritten. + SetTenantConfig { + #[arg(long)] + tenant_id: TenantId, + #[arg(long)] + config: String, + }, + /// Patch the pageserver tenant configuration of a tenant. Any fields with null values in the + /// provided JSON are unset from the tenant config and all fields with non-null values are set. + /// Unspecified fields are not changed. 
+ PatchTenantConfig { #[arg(long)] tenant_id: TenantId, #[arg(long)] @@ -549,11 +559,21 @@ async fn main() -> anyhow::Result<()> { ) .await?; } - Command::TenantConfig { tenant_id, config } => { + Command::SetTenantConfig { tenant_id, config } => { let tenant_conf = serde_json::from_str(&config)?; vps_client - .tenant_config(&TenantConfigRequest { + .set_tenant_config(&TenantConfigRequest { + tenant_id, + config: tenant_conf, + }) + .await?; + } + Command::PatchTenantConfig { tenant_id, config } => { + let tenant_conf = serde_json::from_str(&config)?; + + vps_client + .patch_tenant_config(&TenantConfigPatchRequest { tenant_id, config: tenant_conf, }) @@ -736,7 +756,7 @@ async fn main() -> anyhow::Result<()> { threshold, } => { vps_client - .tenant_config(&TenantConfigRequest { + .set_tenant_config(&TenantConfigRequest { tenant_id, config: TenantConfig { eviction_policy: Some(EvictionPolicy::LayerAccessThreshold( diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 5488f7b2c2..5690b643f0 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -17,7 +17,7 @@ use std::{ use byteorder::{BigEndian, ReadBytesExt}; use postgres_ffi::BLCKSZ; -use serde::{Deserialize, Serialize}; +use serde::{Deserialize, Deserializer, Serialize, Serializer}; use serde_with::serde_as; use utils::{ completion, @@ -325,6 +325,115 @@ impl Default for ShardParameters { } } +#[derive(Debug, Default, Clone, Eq, PartialEq)] +pub enum FieldPatch { + Upsert(T), + Remove, + #[default] + Noop, +} + +impl FieldPatch { + fn is_noop(&self) -> bool { + matches!(self, FieldPatch::Noop) + } + + pub fn apply(self, target: &mut Option) { + match self { + Self::Upsert(v) => *target = Some(v), + Self::Remove => *target = None, + Self::Noop => {} + } + } + + pub fn map Result>(self, map: F) -> Result, E> { + match self { + Self::Upsert(v) => Ok(FieldPatch::::Upsert(map(v)?)), + Self::Remove => Ok(FieldPatch::::Remove), + Self::Noop => Ok(FieldPatch::::Noop), + } + } +} + +impl<'de, T: Deserialize<'de>> Deserialize<'de> for FieldPatch { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + Option::deserialize(deserializer).map(|opt| match opt { + None => FieldPatch::Remove, + Some(val) => FieldPatch::Upsert(val), + }) + } +} + +impl Serialize for FieldPatch { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + match self { + FieldPatch::Upsert(val) => serializer.serialize_some(val), + FieldPatch::Remove => serializer.serialize_none(), + FieldPatch::Noop => unreachable!(), + } + } +} + +#[derive(Serialize, Deserialize, Debug, Default, Clone, Eq, PartialEq)] +#[serde(default)] +pub struct TenantConfigPatch { + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub checkpoint_distance: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub checkpoint_timeout: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub compaction_target_size: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub compaction_period: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub compaction_threshold: FieldPatch, + // defer parsing compaction_algorithm, like eviction_policy + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub compaction_algorithm: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub gc_horizon: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub gc_period: FieldPatch, + 
#[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub image_creation_threshold: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub pitr_interval: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub walreceiver_connect_timeout: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub lagging_wal_timeout: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub max_lsn_wal_lag: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub eviction_policy: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub min_resident_size_override: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub evictions_low_residence_duration_metric_threshold: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub heatmap_period: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub lazy_slru_download: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub timeline_get_throttle: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub image_layer_creation_check_threshold: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub lsn_lease_length: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub lsn_lease_length_for_ts: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub timeline_offloading: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub wal_receiver_protocol_override: FieldPatch, +} + /// An alternative representation of `pageserver::tenant::TenantConf` with /// simpler types. #[derive(Serialize, Deserialize, Debug, Default, Clone, Eq, PartialEq)] @@ -356,6 +465,107 @@ pub struct TenantConfig { pub wal_receiver_protocol_override: Option, } +impl TenantConfig { + pub fn apply_patch(self, patch: TenantConfigPatch) -> TenantConfig { + let Self { + mut checkpoint_distance, + mut checkpoint_timeout, + mut compaction_target_size, + mut compaction_period, + mut compaction_threshold, + mut compaction_algorithm, + mut gc_horizon, + mut gc_period, + mut image_creation_threshold, + mut pitr_interval, + mut walreceiver_connect_timeout, + mut lagging_wal_timeout, + mut max_lsn_wal_lag, + mut eviction_policy, + mut min_resident_size_override, + mut evictions_low_residence_duration_metric_threshold, + mut heatmap_period, + mut lazy_slru_download, + mut timeline_get_throttle, + mut image_layer_creation_check_threshold, + mut lsn_lease_length, + mut lsn_lease_length_for_ts, + mut timeline_offloading, + mut wal_receiver_protocol_override, + } = self; + + patch.checkpoint_distance.apply(&mut checkpoint_distance); + patch.checkpoint_timeout.apply(&mut checkpoint_timeout); + patch + .compaction_target_size + .apply(&mut compaction_target_size); + patch.compaction_period.apply(&mut compaction_period); + patch.compaction_threshold.apply(&mut compaction_threshold); + patch.compaction_algorithm.apply(&mut compaction_algorithm); + patch.gc_horizon.apply(&mut gc_horizon); + patch.gc_period.apply(&mut gc_period); + patch + .image_creation_threshold + .apply(&mut image_creation_threshold); + patch.pitr_interval.apply(&mut pitr_interval); + patch + .walreceiver_connect_timeout + .apply(&mut walreceiver_connect_timeout); + patch.lagging_wal_timeout.apply(&mut lagging_wal_timeout); + patch.max_lsn_wal_lag.apply(&mut max_lsn_wal_lag); + patch.eviction_policy.apply(&mut eviction_policy); + patch + .min_resident_size_override + .apply(&mut 
min_resident_size_override); + patch + .evictions_low_residence_duration_metric_threshold + .apply(&mut evictions_low_residence_duration_metric_threshold); + patch.heatmap_period.apply(&mut heatmap_period); + patch.lazy_slru_download.apply(&mut lazy_slru_download); + patch + .timeline_get_throttle + .apply(&mut timeline_get_throttle); + patch + .image_layer_creation_check_threshold + .apply(&mut image_layer_creation_check_threshold); + patch.lsn_lease_length.apply(&mut lsn_lease_length); + patch + .lsn_lease_length_for_ts + .apply(&mut lsn_lease_length_for_ts); + patch.timeline_offloading.apply(&mut timeline_offloading); + patch + .wal_receiver_protocol_override + .apply(&mut wal_receiver_protocol_override); + + Self { + checkpoint_distance, + checkpoint_timeout, + compaction_target_size, + compaction_period, + compaction_threshold, + compaction_algorithm, + gc_horizon, + gc_period, + image_creation_threshold, + pitr_interval, + walreceiver_connect_timeout, + lagging_wal_timeout, + max_lsn_wal_lag, + eviction_policy, + min_resident_size_override, + evictions_low_residence_duration_metric_threshold, + heatmap_period, + lazy_slru_download, + timeline_get_throttle, + image_layer_creation_check_threshold, + lsn_lease_length, + lsn_lease_length_for_ts, + timeline_offloading, + wal_receiver_protocol_override, + } + } +} + /// The policy for the aux file storage. /// /// It can be switched through `switch_aux_file_policy` tenant config. @@ -686,6 +896,14 @@ impl TenantConfigRequest { } } +#[derive(Serialize, Deserialize, Debug)] +#[serde(deny_unknown_fields)] +pub struct TenantConfigPatchRequest { + pub tenant_id: TenantId, + #[serde(flatten)] + pub config: TenantConfigPatch, // as we have a flattened field, we should reject all unknown fields in it +} + /// See [`TenantState::attachment_status`] and the OpenAPI docs for context. #[derive(Serialize, Deserialize, Clone)] #[serde(tag = "slug", content = "data", rename_all = "snake_case")] @@ -1699,4 +1917,45 @@ mod tests { ); } } + + #[test] + fn test_tenant_config_patch_request_serde() { + let patch_request = TenantConfigPatchRequest { + tenant_id: TenantId::from_str("17c6d121946a61e5ab0fe5a2fd4d8215").unwrap(), + config: TenantConfigPatch { + checkpoint_distance: FieldPatch::Upsert(42), + gc_horizon: FieldPatch::Remove, + compaction_threshold: FieldPatch::Noop, + ..TenantConfigPatch::default() + }, + }; + + let json = serde_json::to_string(&patch_request).unwrap(); + + let expected = r#"{"tenant_id":"17c6d121946a61e5ab0fe5a2fd4d8215","checkpoint_distance":42,"gc_horizon":null}"#; + assert_eq!(json, expected); + + let decoded: TenantConfigPatchRequest = serde_json::from_str(&json).unwrap(); + assert_eq!(decoded.tenant_id, patch_request.tenant_id); + assert_eq!(decoded.config, patch_request.config); + + // Now apply the patch to a config to demonstrate semantics + + let base = TenantConfig { + checkpoint_distance: Some(28), + gc_horizon: Some(100), + compaction_target_size: Some(1024), + ..Default::default() + }; + + let expected = TenantConfig { + checkpoint_distance: Some(42), + gc_horizon: None, + ..base.clone() + }; + + let patched = base.apply_patch(decoded.config); + + assert_eq!(patched, expected); + } } diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index d9b82b20da..bccd0e0488 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -94,6 +94,8 @@ pub mod toml_edit_ext; pub mod circuit_breaker; +pub mod try_rcu; + // Re-export used in macro. Avoids adding git-version as dep in target crates. 
#[doc(hidden)] pub use git_version; diff --git a/libs/utils/src/try_rcu.rs b/libs/utils/src/try_rcu.rs new file mode 100644 index 0000000000..6b53ab1316 --- /dev/null +++ b/libs/utils/src/try_rcu.rs @@ -0,0 +1,77 @@ +//! Try RCU extension lifted from + +pub trait ArcSwapExt { + /// [`ArcSwap::rcu`](arc_swap::ArcSwap::rcu), but with Result that short-circuits on error. + fn try_rcu(&self, f: F) -> Result + where + F: FnMut(&T) -> Result, + R: Into; +} + +impl ArcSwapExt for arc_swap::ArcSwapAny +where + T: arc_swap::RefCnt, + S: arc_swap::strategy::CaS, +{ + fn try_rcu(&self, mut f: F) -> Result + where + F: FnMut(&T) -> Result, + R: Into, + { + fn ptr_eq(a: A, b: B) -> bool + where + A: arc_swap::AsRaw, + B: arc_swap::AsRaw, + { + let a = a.as_raw(); + let b = b.as_raw(); + std::ptr::eq(a, b) + } + + let mut cur = self.load(); + loop { + let new = f(&cur)?.into(); + let prev = self.compare_and_swap(&*cur, new); + let swapped = ptr_eq(&*cur, &*prev); + if swapped { + return Ok(arc_swap::Guard::into_inner(prev)); + } else { + cur = prev; + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use arc_swap::ArcSwap; + use std::sync::Arc; + + #[test] + fn test_try_rcu_success() { + let swap = ArcSwap::from(Arc::new(42)); + + let result = swap.try_rcu(|value| -> Result<_, String> { Ok(**value + 1) }); + + assert!(result.is_ok()); + assert_eq!(**swap.load(), 43); + } + + #[test] + fn test_try_rcu_error() { + let swap = ArcSwap::from(Arc::new(42)); + + let result = swap.try_rcu(|value| -> Result { + if **value == 42 { + Err("err") + } else { + Ok(**value + 1) + } + }); + + assert!(result.is_err()); + assert_eq!(result.unwrap_err(), "err"); + assert_eq!(**swap.load(), 42); + } +} diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index c3a1ef8140..4e9b11879d 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -270,12 +270,18 @@ impl Client { Ok(body) } - pub async fn tenant_config(&self, req: &TenantConfigRequest) -> Result<()> { + pub async fn set_tenant_config(&self, req: &TenantConfigRequest) -> Result<()> { let uri = format!("{}/v1/tenant/config", self.mgmt_api_endpoint); self.request(Method::PUT, &uri, req).await?; Ok(()) } + pub async fn patch_tenant_config(&self, req: &TenantConfigPatchRequest) -> Result<()> { + let uri = format!("{}/v1/tenant/config", self.mgmt_api_endpoint); + self.request(Method::PATCH, &uri, req).await?; + Ok(()) + } + pub async fn tenant_secondary_download( &self, tenant_id: TenantShardId, diff --git a/pageserver/pagebench/src/cmd/aux_files.rs b/pageserver/pagebench/src/cmd/aux_files.rs index 923a7f1f18..b869a0c6c7 100644 --- a/pageserver/pagebench/src/cmd/aux_files.rs +++ b/pageserver/pagebench/src/cmd/aux_files.rs @@ -64,7 +64,7 @@ async fn main_impl(args: Args) -> anyhow::Result<()> { println!("operating on timeline {}", timeline); mgmt_api_client - .tenant_config(&TenantConfigRequest { + .set_tenant_config(&TenantConfigRequest { tenant_id: timeline.tenant_id, config: TenantConfig::default(), }) diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 7fb9247feb..ee43440534 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -767,7 +767,27 @@ paths: /v1/tenant/config: put: description: | - Update tenant's config. + Update tenant's config by setting it to the provided value + + Invalid fields in the tenant config will cause the request to be rejected with status 400. 
+ requestBody: + content: + application/json: + schema: + $ref: "#/components/schemas/TenantConfigRequest" + responses: + "200": + description: OK + content: + application/json: + schema: + type: array + items: + $ref: "#/components/schemas/TenantInfo" + patch: + description: | + Update tenant's config additively by patching the updated fields provided. + Null values unset the field and non-null values upsert it. Invalid fields in the tenant config will cause the request to be rejected with status 400. requestBody: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 75d25d0a6a..6e9ee976f4 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -28,6 +28,7 @@ use pageserver_api::models::LsnLease; use pageserver_api::models::LsnLeaseRequest; use pageserver_api::models::OffloadedTimelineInfo; use pageserver_api::models::ShardParameters; +use pageserver_api::models::TenantConfigPatchRequest; use pageserver_api::models::TenantDetails; use pageserver_api::models::TenantLocationConfigRequest; use pageserver_api::models::TenantLocationConfigResponse; @@ -1695,7 +1696,47 @@ async fn update_tenant_config_handler( crate::tenant::Tenant::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf) .await .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?; - tenant.set_new_tenant_config(new_tenant_conf); + + let _ = tenant + .update_tenant_config(|_crnt| Ok(new_tenant_conf.clone())) + .expect("Closure returns Ok()"); + + json_response(StatusCode::OK, ()) +} + +async fn patch_tenant_config_handler( + mut request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let request_data: TenantConfigPatchRequest = json_request(&mut request).await?; + let tenant_id = request_data.tenant_id; + check_permission(&request, Some(tenant_id))?; + + let state = get_state(&request); + + let tenant_shard_id = TenantShardId::unsharded(tenant_id); + + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + + let updated = tenant + .update_tenant_config(|crnt| crnt.apply_patch(request_data.config.clone())) + .map_err(ApiError::BadRequest)?; + + // This is a legacy API that only operates on attached tenants: the preferred + // API to use is the location_config/ endpoint, which lets the caller provide + // the full LocationConf. 
+ let location_conf = LocationConf::attached_single( + updated, + tenant.get_generation(), + &ShardParameters::default(), + ); + + crate::tenant::Tenant::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf) + .await + .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?; json_response(StatusCode::OK, ()) } @@ -3288,6 +3329,9 @@ pub fn make_router( .get("/v1/tenant/:tenant_shard_id/synthetic_size", |r| { api_handler(r, tenant_size_handler) }) + .patch("/v1/tenant/config", |r| { + api_handler(r, patch_tenant_config_handler) + }) .put("/v1/tenant/config", |r| { api_handler(r, update_tenant_config_handler) }) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 54fa95fc47..92078e4b08 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -68,6 +68,7 @@ use utils::sync::gate::Gate; use utils::sync::gate::GateGuard; use utils::timeout::timeout_cancellable; use utils::timeout::TimeoutCancellableError; +use utils::try_rcu::ArcSwapExt; use utils::zstd::create_zst_tarball; use utils::zstd::extract_zst_tarball; @@ -3921,25 +3922,28 @@ impl Tenant { } } - pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) { + pub fn update_tenant_config anyhow::Result>( + &self, + update: F, + ) -> anyhow::Result { // Use read-copy-update in order to avoid overwriting the location config // state if this races with [`Tenant::set_new_location_config`]. Note that // this race is not possible if both request types come from the storage // controller (as they should!) because an exclusive op lock is required // on the storage controller side. - self.tenant_conf.rcu(|inner| { - Arc::new(AttachedTenantConf { - tenant_conf: new_tenant_conf.clone(), - location: inner.location, - // Attached location is not changed, no need to update lsn lease deadline. - lsn_lease_deadline: inner.lsn_lease_deadline, - }) - }); + self.tenant_conf + .try_rcu(|attached_conf| -> Result<_, anyhow::Error> { + Ok(Arc::new(AttachedTenantConf { + tenant_conf: update(attached_conf.tenant_conf.clone())?, + location: attached_conf.location, + lsn_lease_deadline: attached_conf.lsn_lease_deadline, + })) + })?; - let updated = self.tenant_conf.load().clone(); + let updated = self.tenant_conf.load(); - self.tenant_conf_updated(&new_tenant_conf); + self.tenant_conf_updated(&updated.tenant_conf); // Don't hold self.timelines.lock() during the notifies. // There's no risk of deadlock right now, but there could be if we consolidate // mutexes in struct Timeline in the future. 
@@ -3947,6 +3951,8 @@ impl Tenant { for timeline in timelines { timeline.tenant_conf_updated(&updated); } + + Ok(updated.tenant_conf.clone()) } pub(crate) fn set_new_location_config(&self, new_conf: AttachedTenantConf) { diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 5d3ac5a8e3..d54dded778 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -11,7 +11,7 @@ pub(crate) use pageserver_api::config::TenantConfigToml as TenantConf; use pageserver_api::models::CompactionAlgorithmSettings; use pageserver_api::models::EvictionPolicy; -use pageserver_api::models::{self, ThrottleConfig}; +use pageserver_api::models::{self, TenantConfigPatch, ThrottleConfig}; use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize}; use serde::de::IntoDeserializer; use serde::{Deserialize, Serialize}; @@ -427,6 +427,129 @@ impl TenantConfOpt { .or(global_conf.wal_receiver_protocol_override), } } + + pub fn apply_patch(self, patch: TenantConfigPatch) -> anyhow::Result { + let Self { + mut checkpoint_distance, + mut checkpoint_timeout, + mut compaction_target_size, + mut compaction_period, + mut compaction_threshold, + mut compaction_algorithm, + mut gc_horizon, + mut gc_period, + mut image_creation_threshold, + mut pitr_interval, + mut walreceiver_connect_timeout, + mut lagging_wal_timeout, + mut max_lsn_wal_lag, + mut eviction_policy, + mut min_resident_size_override, + mut evictions_low_residence_duration_metric_threshold, + mut heatmap_period, + mut lazy_slru_download, + mut timeline_get_throttle, + mut image_layer_creation_check_threshold, + mut lsn_lease_length, + mut lsn_lease_length_for_ts, + mut timeline_offloading, + mut wal_receiver_protocol_override, + } = self; + + patch.checkpoint_distance.apply(&mut checkpoint_distance); + patch + .checkpoint_timeout + .map(|v| humantime::parse_duration(&v))? + .apply(&mut checkpoint_timeout); + patch + .compaction_target_size + .apply(&mut compaction_target_size); + patch + .compaction_period + .map(|v| humantime::parse_duration(&v))? + .apply(&mut compaction_period); + patch.compaction_threshold.apply(&mut compaction_threshold); + patch.compaction_algorithm.apply(&mut compaction_algorithm); + patch.gc_horizon.apply(&mut gc_horizon); + patch + .gc_period + .map(|v| humantime::parse_duration(&v))? + .apply(&mut gc_period); + patch + .image_creation_threshold + .apply(&mut image_creation_threshold); + patch + .pitr_interval + .map(|v| humantime::parse_duration(&v))? + .apply(&mut pitr_interval); + patch + .walreceiver_connect_timeout + .map(|v| humantime::parse_duration(&v))? + .apply(&mut walreceiver_connect_timeout); + patch + .lagging_wal_timeout + .map(|v| humantime::parse_duration(&v))? + .apply(&mut lagging_wal_timeout); + patch.max_lsn_wal_lag.apply(&mut max_lsn_wal_lag); + patch.eviction_policy.apply(&mut eviction_policy); + patch + .min_resident_size_override + .apply(&mut min_resident_size_override); + patch + .evictions_low_residence_duration_metric_threshold + .map(|v| humantime::parse_duration(&v))? + .apply(&mut evictions_low_residence_duration_metric_threshold); + patch + .heatmap_period + .map(|v| humantime::parse_duration(&v))? + .apply(&mut heatmap_period); + patch.lazy_slru_download.apply(&mut lazy_slru_download); + patch + .timeline_get_throttle + .apply(&mut timeline_get_throttle); + patch + .image_layer_creation_check_threshold + .apply(&mut image_layer_creation_check_threshold); + patch + .lsn_lease_length + .map(|v| humantime::parse_duration(&v))? 
+ .apply(&mut lsn_lease_length); + patch + .lsn_lease_length_for_ts + .map(|v| humantime::parse_duration(&v))? + .apply(&mut lsn_lease_length_for_ts); + patch.timeline_offloading.apply(&mut timeline_offloading); + patch + .wal_receiver_protocol_override + .apply(&mut wal_receiver_protocol_override); + + Ok(Self { + checkpoint_distance, + checkpoint_timeout, + compaction_target_size, + compaction_period, + compaction_threshold, + compaction_algorithm, + gc_horizon, + gc_period, + image_creation_threshold, + pitr_interval, + walreceiver_connect_timeout, + lagging_wal_timeout, + max_lsn_wal_lag, + eviction_policy, + min_resident_size_override, + evictions_low_residence_duration_metric_threshold, + heatmap_period, + lazy_slru_download, + timeline_get_throttle, + image_layer_creation_check_threshold, + lsn_lease_length, + lsn_lease_length_for_ts, + timeline_offloading, + wal_receiver_protocol_override, + }) + } } impl TryFrom<&'_ models::TenantConfig> for TenantConfOpt { diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 39e078ba7c..dce5380aa0 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -18,8 +18,9 @@ use pageserver_api::controller_api::{ ShardsPreferredAzsRequest, TenantCreateRequest, }; use pageserver_api::models::{ - TenantConfigRequest, TenantLocationConfigRequest, TenantShardSplitRequest, - TenantTimeTravelRequest, TimelineArchivalConfigRequest, TimelineCreateRequest, + TenantConfigPatchRequest, TenantConfigRequest, TenantLocationConfigRequest, + TenantShardSplitRequest, TenantTimeTravelRequest, TimelineArchivalConfigRequest, + TimelineCreateRequest, }; use pageserver_api::shard::TenantShardId; use pageserver_client::{mgmt_api, BlockUnblock}; @@ -208,6 +209,27 @@ async fn handle_tenant_location_config( ) } +async fn handle_tenant_config_patch( + service: Arc, + req: Request, +) -> Result, ApiError> { + check_permissions(&req, Scope::PageServerApi)?; + + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + + let config_req = json_request::(&mut req).await?; + + json_response( + StatusCode::OK, + service.tenant_config_patch(config_req).await?, + ) +} + async fn handle_tenant_config_set( service: Arc, req: Request, @@ -1863,6 +1885,13 @@ pub fn make_router( .delete("/v1/tenant/:tenant_id", |r| { tenant_service_handler(r, handle_tenant_delete, RequestName("v1_tenant")) }) + .patch("/v1/tenant/config", |r| { + tenant_service_handler( + r, + handle_tenant_config_patch, + RequestName("v1_tenant_config"), + ) + }) .put("/v1/tenant/config", |r| { tenant_service_handler(r, handle_tenant_config_set, RequestName("v1_tenant_config")) }) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 7e4ee53b4c..e82e84fe89 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -52,8 +52,8 @@ use pageserver_api::{ TenantPolicyRequest, TenantShardMigrateRequest, TenantShardMigrateResponse, }, models::{ - SecondaryProgress, TenantConfigRequest, TimelineArchivalConfigRequest, - TopTenantShardsRequest, + SecondaryProgress, TenantConfigPatchRequest, TenantConfigRequest, + TimelineArchivalConfigRequest, TopTenantShardsRequest, }, }; use reqwest::StatusCode; @@ -139,6 +139,7 @@ enum TenantOperations { Create, LocationConfig, ConfigSet, + ConfigPatch, TimeTravelRemoteStorage, Delete, UpdatePolicy, @@ -2602,6 +2603,55 @@ impl Service { Ok(result) } + pub(crate) async fn 
tenant_config_patch( + &self, + req: TenantConfigPatchRequest, + ) -> Result<(), ApiError> { + let _tenant_lock = trace_exclusive_lock( + &self.tenant_op_locks, + req.tenant_id, + TenantOperations::ConfigPatch, + ) + .await; + + let tenant_id = req.tenant_id; + let patch = req.config; + + let base = { + let locked = self.inner.read().unwrap(); + let shards = locked + .tenants + .range(TenantShardId::tenant_range(req.tenant_id)); + + let mut configs = shards.map(|(_sid, shard)| &shard.config).peekable(); + + let first = match configs.peek() { + Some(first) => (*first).clone(), + None => { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant {} not found", req.tenant_id).into(), + )); + } + }; + + if !configs.all_equal() { + tracing::error!("Tenant configs for {} are mismatched. ", req.tenant_id); + // This can't happen because we atomically update the database records + // of all shards to the new value in [`Self::set_tenant_config_and_reconcile`]. + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "Tenant configs for {} are mismatched", + req.tenant_id + ))); + } + + first + }; + + let updated_config = base.apply_patch(patch); + self.set_tenant_config_and_reconcile(tenant_id, updated_config) + .await + } + pub(crate) async fn tenant_config_set(&self, req: TenantConfigRequest) -> Result<(), ApiError> { // We require an exclusive lock, because we are updating persistent and in-memory state let _tenant_lock = trace_exclusive_lock( @@ -2611,12 +2661,32 @@ impl Service { ) .await; - let tenant_id = req.tenant_id; - let config = req.config; + let tenant_exists = { + let locked = self.inner.read().unwrap(); + let mut r = locked + .tenants + .range(TenantShardId::tenant_range(req.tenant_id)); + r.next().is_some() + }; + if !tenant_exists { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant {} not found", req.tenant_id).into(), + )); + } + + self.set_tenant_config_and_reconcile(req.tenant_id, req.config) + .await + } + + async fn set_tenant_config_and_reconcile( + &self, + tenant_id: TenantId, + config: TenantConfig, + ) -> Result<(), ApiError> { self.persistence .update_tenant_shard( - TenantFilter::Tenant(req.tenant_id), + TenantFilter::Tenant(tenant_id), None, Some(config.clone()), None, diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 0832eac22f..eabdeb1053 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -488,7 +488,20 @@ class PageserverHttpClient(requests.Session, MetricsGetter): ) self.verbose_error(res) - def patch_tenant_config_client_side( + def patch_tenant_config(self, tenant_id: TenantId | TenantShardId, updates: dict[str, Any]): + """ + Only use this via storage_controller.pageserver_api(). + + See `set_tenant_config` for more information. + """ + assert "tenant_id" not in updates.keys() + res = self.patch( + f"http://localhost:{self.port}/v1/tenant/config", + json={**updates, "tenant_id": str(tenant_id)}, + ) + self.verbose_error(res) + + def update_tenant_config( self, tenant_id: TenantId, inserts: dict[str, Any] | None = None, @@ -499,13 +512,13 @@ class PageserverHttpClient(requests.Session, MetricsGetter): See `set_tenant_config` for more information. 
""" - current = self.tenant_config(tenant_id).tenant_specific_overrides - if inserts is not None: - current.update(inserts) - if removes is not None: - for key in removes: - del current[key] - self.set_tenant_config(tenant_id, current) + if inserts is None: + inserts = {} + if removes is None: + removes = [] + + patch = inserts | {remove: None for remove in removes} + self.patch_tenant_config(tenant_id, patch) def tenant_size(self, tenant_id: TenantId | TenantShardId) -> int: return self.tenant_size_and_modelinputs(tenant_id)[0] diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index 954db914b9..7abcdb3838 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -460,10 +460,10 @@ def test_pageserver_respects_overridden_resident_size( assert ( du_by_timeline[large_tenant] > min_resident_size ), "ensure the larger tenant will get a haircut" - env.neon_env.storage_controller.pageserver_api().patch_tenant_config_client_side( + env.neon_env.storage_controller.pageserver_api().update_tenant_config( small_tenant[0], {"min_resident_size_override": min_resident_size} ) - env.neon_env.storage_controller.pageserver_api().patch_tenant_config_client_side( + env.neon_env.storage_controller.pageserver_api().update_tenant_config( large_tenant[0], {"min_resident_size_override": min_resident_size} ) diff --git a/test_runner/regress/test_ingestion_layer_size.py b/test_runner/regress/test_ingestion_layer_size.py index 9c9bc5b519..7e99d4b2f2 100644 --- a/test_runner/regress/test_ingestion_layer_size.py +++ b/test_runner/regress/test_ingestion_layer_size.py @@ -74,7 +74,7 @@ def test_ingesting_large_batches_of_images(neon_env_builder: NeonEnvBuilder): print_layer_size_histogram(post_ingest) # since all we have are L0s, we should be getting nice L1s and images out of them now - env.storage_controller.pageserver_api().patch_tenant_config_client_side( + env.storage_controller.pageserver_api().update_tenant_config( env.initial_tenant, { "compaction_threshold": 1, diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py index 8818b40712..5e06a1d47f 100644 --- a/test_runner/regress/test_layers_from_future.py +++ b/test_runner/regress/test_layers_from_future.py @@ -132,7 +132,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder, attach_mode: str): ), "sanity check for what above loop is supposed to do" # create the image layer from the future - env.storage_controller.pageserver_api().patch_tenant_config_client_side( + env.storage_controller.pageserver_api().update_tenant_config( tenant_id, {"image_creation_threshold": image_creation_threshold}, None ) assert ps_http.tenant_config(tenant_id).effective_config["image_creation_threshold"] == 1 diff --git a/test_runner/regress/test_pageserver_crash_consistency.py b/test_runner/regress/test_pageserver_crash_consistency.py index fcae7983f4..e9eee2760e 100644 --- a/test_runner/regress/test_pageserver_crash_consistency.py +++ b/test_runner/regress/test_pageserver_crash_consistency.py @@ -46,7 +46,7 @@ def test_local_only_layers_after_crash(neon_env_builder: NeonEnvBuilder, pg_bin: for sk in env.safekeepers: sk.stop() - env.storage_controller.pageserver_api().patch_tenant_config_client_side( + env.storage_controller.pageserver_api().update_tenant_config( tenant_id, {"compaction_threshold": 3} ) # hit the exit failpoint diff --git a/test_runner/regress/test_storage_controller.py 
b/test_runner/regress/test_storage_controller.py index 9f74dcccb9..4d1784d45a 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -1768,7 +1768,7 @@ def test_storcon_cli(neon_env_builder: NeonEnvBuilder): # Modify a tenant's config storcon_cli( [ - "tenant-config", + "patch-tenant-config", "--tenant-id", str(env.initial_tenant), "--config", @@ -2403,7 +2403,7 @@ def test_storage_controller_step_down(neon_env_builder: NeonEnvBuilder): # Make a change to the tenant config to trigger a slow reconcile virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) - virtual_ps_http.patch_tenant_config_client_side(tid, {"compaction_threshold": 5}, None) + virtual_ps_http.update_tenant_config(tid, {"compaction_threshold": 5}, None) env.storage_controller.allowed_errors.extend( [ ".*Accepted configuration update but reconciliation failed.*", diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index f8f240cfdc..0c2d535af4 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -3,13 +3,14 @@ from __future__ import annotations import json from typing import TYPE_CHECKING +import pytest from fixtures.common_types import Lsn from fixtures.neon_fixtures import ( NeonEnvBuilder, ) from fixtures.pageserver.utils import assert_tenant_state, wait_for_upload from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind -from fixtures.utils import wait_until +from fixtures.utils import run_only_on_default_postgres, wait_until from fixtures.workload import Workload if TYPE_CHECKING: @@ -330,3 +331,83 @@ def test_live_reconfig_get_evictions_low_residence_duration_metric_threshold( metric = get_metric() assert int(metric.labels["low_threshold_secs"]) == 24 * 60 * 60, "label resets to default" assert int(metric.value) == 0, "value resets to default" + + +@run_only_on_default_postgres("Test does not start a compute") +@pytest.mark.parametrize("ps_managed_by", ["storcon", "cplane"]) +def test_tenant_config_patch(neon_env_builder: NeonEnvBuilder, ps_managed_by: str): + """ + Test tenant config patching (i.e. additive updates) + + The flow is different for storage controller and cplane managed pageserver. + 1. Storcon managed: /v1/tenant/config request lands on storcon, which generates + location_config calls containing the update to the pageserver + 2. Cplane managed: /v1/tenant/config is called directly on the pageserver + """ + + def assert_tenant_conf_semantically_equal(lhs, rhs): + """ + Storcon returns None for fields that are not set while the pageserver does not. + Compare two tenant's config overrides semantically, by dropping the None values. + """ + lhs = {k: v for k, v in lhs.items() if v is not None} + rhs = {k: v for k, v in rhs.items() if v is not None} + + assert lhs == rhs + + env = neon_env_builder.init_start() + + if ps_managed_by == "storcon": + api = env.storage_controller.pageserver_api() + elif ps_managed_by == "cplane": + # Disallow storcon from sending location_configs to the pageserver. + # These would overwrite the manually set tenant configs. 
+ env.storage_controller.reconcile_until_idle() + env.storage_controller.tenant_policy_update(env.initial_tenant, {"scheduling": "Stop"}) + env.storage_controller.allowed_errors.append(".*Scheduling is disabled by policy Stop.*") + + api = env.pageserver.http_client() + else: + raise Exception(f"Unexpected value of ps_managed_by param: {ps_managed_by}") + + crnt_tenant_conf = api.tenant_config(env.initial_tenant).tenant_specific_overrides + + patch: dict[str, Any | None] = { + "gc_period": "3h", + "wal_receiver_protocol_override": { + "type": "interpreted", + "args": {"format": "bincode", "compression": {"zstd": {"level": 1}}}, + }, + } + api.patch_tenant_config(env.initial_tenant, patch) + tenant_conf_after_patch = api.tenant_config(env.initial_tenant).tenant_specific_overrides + if ps_managed_by == "storcon": + # Check that the config was propagated to the PS. + overrides_on_ps = ( + env.pageserver.http_client().tenant_config(env.initial_tenant).tenant_specific_overrides + ) + assert_tenant_conf_semantically_equal(overrides_on_ps, tenant_conf_after_patch) + assert_tenant_conf_semantically_equal(tenant_conf_after_patch, crnt_tenant_conf | patch) + crnt_tenant_conf = tenant_conf_after_patch + + patch = {"gc_period": "5h", "wal_receiver_protocol_override": None} + api.patch_tenant_config(env.initial_tenant, patch) + tenant_conf_after_patch = api.tenant_config(env.initial_tenant).tenant_specific_overrides + if ps_managed_by == "storcon": + overrides_on_ps = ( + env.pageserver.http_client().tenant_config(env.initial_tenant).tenant_specific_overrides + ) + assert_tenant_conf_semantically_equal(overrides_on_ps, tenant_conf_after_patch) + assert_tenant_conf_semantically_equal(tenant_conf_after_patch, crnt_tenant_conf | patch) + crnt_tenant_conf = tenant_conf_after_patch + + put = {"pitr_interval": "1m 1s"} + api.set_tenant_config(env.initial_tenant, put) + tenant_conf_after_put = api.tenant_config(env.initial_tenant).tenant_specific_overrides + if ps_managed_by == "storcon": + overrides_on_ps = ( + env.pageserver.http_client().tenant_config(env.initial_tenant).tenant_specific_overrides + ) + assert_tenant_conf_semantically_equal(overrides_on_ps, tenant_conf_after_put) + assert_tenant_conf_semantically_equal(tenant_conf_after_put, put) + crnt_tenant_conf = tenant_conf_after_put diff --git a/test_runner/regress/test_threshold_based_eviction.py b/test_runner/regress/test_threshold_based_eviction.py index 68e9385035..bedbd84aee 100644 --- a/test_runner/regress/test_threshold_based_eviction.py +++ b/test_runner/regress/test_threshold_based_eviction.py @@ -81,7 +81,7 @@ def test_threshold_based_eviction( # create a bunch of L1s, only the least of which will need to be resident compaction_threshold = 3 # create L1 layers quickly - vps_http.patch_tenant_config_client_side( + vps_http.update_tenant_config( tenant_id, inserts={ # Disable gc and compaction to avoid on-demand downloads from their side. 
diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index 2c3ee38bae..5234d8278f 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -514,7 +514,7 @@ def test_compaction_induced_by_detaches_in_history( assert len(delta_layers(branch_timeline_id)) == 5 - env.storage_controller.pageserver_api().patch_tenant_config_client_side( + env.storage_controller.pageserver_api().update_tenant_config( env.initial_tenant, {"compaction_threshold": 5}, None ) From e8395807a59478d007180a0e3943b1db937c4899 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 11 Dec 2024 19:43:40 +0000 Subject: [PATCH 014/583] storcon: allow for more concurrency in drain/fill operations (#10093) ## Problem We saw the drain/fill operations not drain fast enough in ap-southeast. ## Summary of changes These are some quick changes to speed it up: * double reconcile concurrency - this is now half of the available reconcile bandwidth * reduce the waiter polling timeout - this way we can spawn new reconciliations faster --- storage_controller/src/background_node_operations.rs | 2 +- storage_controller/src/service.rs | 6 ++++-- test_runner/regress/test_storage_controller.py | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/storage_controller/src/background_node_operations.rs b/storage_controller/src/background_node_operations.rs index 6f1355eb68..226d4942e7 100644 --- a/storage_controller/src/background_node_operations.rs +++ b/storage_controller/src/background_node_operations.rs @@ -3,7 +3,7 @@ use std::{borrow::Cow, fmt::Debug, fmt::Display}; use tokio_util::sync::CancellationToken; use utils::id::NodeId; -pub(crate) const MAX_RECONCILES_PER_OPERATION: usize = 32; +pub(crate) const MAX_RECONCILES_PER_OPERATION: usize = 64; #[derive(Copy, Clone)] pub(crate) struct Drain { diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index e82e84fe89..2600500a53 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -100,6 +100,8 @@ use crate::{ use context_iterator::TenantShardContextIterator; +const WAITER_FILL_DRAIN_POLL_TIMEOUT: Duration = Duration::from_millis(500); + // For operations that should be quick, like attaching a new tenant const SHORT_RECONCILE_TIMEOUT: Duration = Duration::from_secs(5); @@ -6798,7 +6800,7 @@ impl Service { } waiters = self - .await_waiters_remainder(waiters, SHORT_RECONCILE_TIMEOUT) + .await_waiters_remainder(waiters, WAITER_FILL_DRAIN_POLL_TIMEOUT) .await; failpoint_support::sleep_millis_async!("sleepy-drain-loop", &cancel); @@ -7051,7 +7053,7 @@ impl Service { } waiters = self - .await_waiters_remainder(waiters, SHORT_RECONCILE_TIMEOUT) + .await_waiters_remainder(waiters, WAITER_FILL_DRAIN_POLL_TIMEOUT) .await; } diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 4d1784d45a..02da389809 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -2136,7 +2136,7 @@ def test_background_operation_cancellation(neon_env_builder: NeonEnvBuilder): env.start() tenant_count = 10 - shard_count_per_tenant = 8 + shard_count_per_tenant = 16 tenant_ids = [] for _ in range(0, tenant_count): From 7fa986bc923b6af3c77e888be96123f7ba068313 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 11 Dec 2024 21:10:22 +0100 Subject: [PATCH 015/583] Do tenant manifest 
validation with index-part (#10007) This adds some validation of invariants that we want to uphold wrt the tenant manifest and `index_part.json`: * the data the manifest has about a timeline must match with the data in `index_part.json`. It might actually change, e.g. when we do reparenting during detach ancestor, but that requires the timeline to be unoffloaded, i.e. removed from the manifest. * any timeline mentioned in index part, must, if present, be archived. If we unarchive, we first update the tenant manifest to unoffload, and only then update index part. And one needs to archive before offloading. * it is legal for timelines to be mentioned in the manifest but have no `index_part`: this is a temporary state visible during deletion of the timeline. if the pageserver crashed, an attach of the tenant will clean the state up. * it is also legal for offloaded timelines to have an `ancestor_retain_lsn` of None while having an `ancestor_timeline_id`. This is for the to-be-added flattening functionality: the plan is to set former to None if we have flattened a timeline. follow-up of #9942 part of #8088 --- storage_scrubber/src/checks.rs | 45 +++-- .../src/pageserver_physical_gc.rs | 156 ++++++++++++++---- 2 files changed, 155 insertions(+), 46 deletions(-) diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs index 1b4ff01a17..f759f54d19 100644 --- a/storage_scrubber/src/checks.rs +++ b/storage_scrubber/src/checks.rs @@ -533,8 +533,9 @@ async fn list_timeline_blobs_impl( } pub(crate) struct RemoteTenantManifestInfo { - pub(crate) latest_generation: Option, - pub(crate) manifests: Vec<(Generation, ListingObject)>, + pub(crate) generation: Generation, + pub(crate) manifest: TenantManifest, + pub(crate) listing_object: ListingObject, } pub(crate) enum ListTenantManifestResult { @@ -543,7 +544,10 @@ pub(crate) enum ListTenantManifestResult { #[allow(dead_code)] unknown_keys: Vec, }, - NoErrors(RemoteTenantManifestInfo), + NoErrors { + latest_generation: Option, + manifests: Vec<(Generation, ListingObject)>, + }, } /// Lists the tenant manifests in remote storage and parses the latest one, returning a [`ListTenantManifestResult`] object. 
@@ -592,14 +596,6 @@ pub(crate) async fn list_tenant_manifests( unknown_keys.push(obj); } - if manifests.is_empty() { - tracing::debug!("No manifest for timeline."); - - return Ok(ListTenantManifestResult::WithErrors { - errors, - unknown_keys, - }); - } if !unknown_keys.is_empty() { errors.push(((*prefix_str).to_owned(), "unknown keys listed".to_string())); @@ -609,6 +605,15 @@ pub(crate) async fn list_tenant_manifests( }); } + if manifests.is_empty() { + tracing::debug!("No manifest for timeline."); + + return Ok(ListTenantManifestResult::NoErrors { + latest_generation: None, + manifests, + }); + } + // Find the manifest with the highest generation let (latest_generation, latest_listing_object) = manifests .iter() @@ -616,6 +621,8 @@ pub(crate) async fn list_tenant_manifests( .map(|(g, obj)| (*g, obj.clone())) .unwrap(); + manifests.retain(|(gen, _obj)| gen != &latest_generation); + let manifest_bytes = match download_object_with_retries(remote_client, &latest_listing_object.key).await { Ok(bytes) => bytes, @@ -634,13 +641,15 @@ pub(crate) async fn list_tenant_manifests( }; match TenantManifest::from_json_bytes(&manifest_bytes) { - Ok(_manifest) => { - return Ok(ListTenantManifestResult::NoErrors( - RemoteTenantManifestInfo { - latest_generation: Some(latest_generation), - manifests, - }, - )); + Ok(manifest) => { + return Ok(ListTenantManifestResult::NoErrors { + latest_generation: Some(RemoteTenantManifestInfo { + generation: latest_generation, + manifest, + listing_object: latest_listing_object, + }), + manifests, + }); } Err(parse_error) => errors.push(( latest_listing_object.key.get_path().as_str().to_owned(), diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs index 20cb9c3633..d19b8a5f91 100644 --- a/storage_scrubber/src/pageserver_physical_gc.rs +++ b/storage_scrubber/src/pageserver_physical_gc.rs @@ -4,11 +4,13 @@ use std::time::Duration; use crate::checks::{ list_tenant_manifests, list_timeline_blobs, BlobDataParseResult, ListTenantManifestResult, + RemoteTenantManifestInfo, }; use crate::metadata_stream::{stream_tenant_timelines, stream_tenants}; use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId, MAX_RETRIES}; use futures_util::{StreamExt, TryStreamExt}; use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; +use pageserver::tenant::remote_timeline_client::manifest::OffloadedTimelineManifest; use pageserver::tenant::remote_timeline_client::{ parse_remote_index_path, parse_remote_tenant_manifest_path, remote_layer_path, }; @@ -527,7 +529,7 @@ async fn gc_tenant_manifests( target: &RootTarget, mode: GcMode, tenant_shard_id: TenantShardId, -) -> anyhow::Result { +) -> anyhow::Result<(GcSummary, Option)> { let mut gc_summary = GcSummary::default(); match list_tenant_manifests(remote_client, tenant_shard_id, target).await? 
{ ListTenantManifestResult::WithErrors { @@ -537,33 +539,35 @@ async fn gc_tenant_manifests( for (_key, error) in errors { tracing::warn!(%tenant_shard_id, "list_tenant_manifests: {error}"); } + Ok((gc_summary, None)) } - ListTenantManifestResult::NoErrors(mut manifest_info) => { - let Some(latest_gen) = manifest_info.latest_generation else { - return Ok(gc_summary); + ListTenantManifestResult::NoErrors { + latest_generation, + mut manifests, + } => { + let Some(latest_generation) = latest_generation else { + return Ok((gc_summary, None)); }; - manifest_info - .manifests - .sort_by_key(|(generation, _obj)| *generation); + manifests.sort_by_key(|(generation, _obj)| *generation); // skip the two latest generations (they don't neccessarily have to be 1 apart from each other) - let candidates = manifest_info.manifests.iter().rev().skip(2); + let candidates = manifests.iter().rev().skip(2); for (_generation, key) in candidates { maybe_delete_tenant_manifest( remote_client, &min_age, - latest_gen, + latest_generation.generation, key, mode, &mut gc_summary, ) .instrument( - info_span!("maybe_delete_tenant_manifest", %tenant_shard_id, ?latest_gen, %key.key), + info_span!("maybe_delete_tenant_manifest", %tenant_shard_id, ?latest_generation.generation, %key.key), ) .await; } + Ok((gc_summary, Some(latest_generation))) } } - Ok(gc_summary) } async fn gc_timeline( @@ -573,6 +577,7 @@ async fn gc_timeline( mode: GcMode, ttid: TenantShardTimelineId, accumulator: &Arc>, + tenant_manifest_info: Arc>, ) -> anyhow::Result { let mut summary = GcSummary::default(); let data = list_timeline_blobs(remote_client, ttid, target).await?; @@ -597,6 +602,60 @@ async fn gc_timeline( } }; + if let Some(tenant_manifest_info) = &*tenant_manifest_info { + // TODO: this is O(n^2) in the number of offloaded timelines. Do a hashmap lookup instead. + let maybe_offloaded = tenant_manifest_info + .manifest + .offloaded_timelines + .iter() + .find(|offloaded_timeline| offloaded_timeline.timeline_id == ttid.timeline_id); + if let Some(offloaded) = maybe_offloaded { + let warnings = validate_index_part_with_offloaded(index_part, offloaded); + let warn = if warnings.is_empty() { + false + } else { + // Verify that the manifest hasn't changed. If it has, a potential racing change could have been cause for our troubles. + match list_tenant_manifests(remote_client, ttid.tenant_shard_id, target).await? { + ListTenantManifestResult::WithErrors { + errors, + unknown_keys: _, + } => { + for (_key, error) in errors { + tracing::warn!(%ttid, "list_tenant_manifests in gc_timeline: {error}"); + } + true + } + ListTenantManifestResult::NoErrors { + latest_generation, + manifests: _, + } => { + if let Some(new_latest_gen) = latest_generation { + let manifest_changed = ( + new_latest_gen.generation, + new_latest_gen.listing_object.last_modified, + ) == ( + tenant_manifest_info.generation, + tenant_manifest_info.listing_object.last_modified, + ); + if manifest_changed { + tracing::debug!(%ttid, "tenant manifest changed since it was loaded, suppressing {} warnings", warnings.len()); + } + manifest_changed + } else { + // The latest generation is gone. This timeline is in the progress of being deleted? 
+ false + } + } + } + }; + if warn { + for warning in warnings { + tracing::warn!(%ttid, "{}", warning); + } + } + } + } + accumulator.lock().unwrap().update(ttid, index_part); for key in candidates { @@ -608,6 +667,35 @@ async fn gc_timeline( Ok(summary) } +fn validate_index_part_with_offloaded( + index_part: &IndexPart, + offloaded: &OffloadedTimelineManifest, +) -> Vec { + let mut warnings = Vec::new(); + if let Some(archived_at_index_part) = index_part.archived_at { + if archived_at_index_part + .signed_duration_since(offloaded.archived_at) + .num_seconds() + != 0 + { + warnings.push(format!( + "index-part archived_at={} differs from manifest archived_at={}", + archived_at_index_part, offloaded.archived_at + )); + } + } else { + warnings.push("Timeline offloaded in manifest but not archived in index-part".to_string()); + } + if index_part.metadata.ancestor_timeline() != offloaded.ancestor_timeline_id { + warnings.push(format!( + "index-part anestor={:?} differs from manifest ancestor={:?}", + index_part.metadata.ancestor_timeline(), + offloaded.ancestor_timeline_id + )); + } + warnings +} + /// Physical garbage collection: removing unused S3 objects. /// /// This is distinct from the garbage collection done inside the pageserver, which operates at a higher level @@ -650,29 +738,38 @@ pub async fn pageserver_physical_gc( let target_ref = ⌖ let remote_client_ref = &remote_client; async move { - let summaries_from_manifests = match gc_tenant_manifests( + let gc_manifest_result = gc_tenant_manifests( remote_client_ref, min_age, target_ref, mode, tenant_shard_id, ) - .await - { - Ok(gc_summary) => vec![Ok(GcSummaryOrContent::::GcSummary( - gc_summary, - ))], + .await; + let (summary_from_manifest, tenant_manifest_opt) = match gc_manifest_result { + Ok((gc_summary, tenant_manifest)) => (gc_summary, tenant_manifest), Err(e) => { tracing::warn!(%tenant_shard_id, "Error in gc_tenant_manifests: {e}"); - Vec::new() + (GcSummary::default(), None) } }; + let tenant_manifest_arc = Arc::new(tenant_manifest_opt); + let summary_from_manifest = Ok(GcSummaryOrContent::<(_, _)>::GcSummary( + summary_from_manifest, + )); stream_tenant_timelines(remote_client_ref, target_ref, tenant_shard_id) .await .map(|stream| { stream - .map_ok(GcSummaryOrContent::Content) - .chain(futures::stream::iter(summaries_from_manifests.into_iter())) + .zip(futures::stream::iter(std::iter::repeat( + tenant_manifest_arc, + ))) + .map(|(ttid_res, tenant_manifest_arc)| { + ttid_res.map(move |ttid| { + GcSummaryOrContent::Content((ttid, tenant_manifest_arc)) + }) + }) + .chain(futures::stream::iter([summary_from_manifest].into_iter())) }) } }); @@ -684,14 +781,17 @@ pub async fn pageserver_physical_gc( // Drain futures for per-shard GC, populating accumulator as a side effect { let timelines = timelines.map_ok(|summary_or_ttid| match summary_or_ttid { - GcSummaryOrContent::Content(ttid) => futures::future::Either::Left(gc_timeline( - &remote_client, - &min_age, - &target, - mode, - ttid, - &accumulator, - )), + GcSummaryOrContent::Content((ttid, tenant_manifest_arc)) => { + futures::future::Either::Left(gc_timeline( + &remote_client, + &min_age, + &target, + mode, + ttid, + &accumulator, + tenant_manifest_arc, + )) + } GcSummaryOrContent::GcSummary(gc_summary) => { futures::future::Either::Right(futures::future::ok(gc_summary)) } From 5126ebbfed85404d89b068288f02765e7141aaf7 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Wed, 11 Dec 2024 22:37:25 +0100 Subject: [PATCH 016/583] test_runner: bump test_check_visibility_map timeout 
(#10091) ## Problem `test_check_visibility_map` has been seen to time out in debug tests. ## Summary of changes Bump the timeout to 10 minutes (test reports indicate 7 minutes is sufficient). We don't want to disable the test entirely in debug builds, to exercise this with debug assertions enabled. Resolves #10069. --- test_runner/regress/test_vm_bits.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test_runner/regress/test_vm_bits.py b/test_runner/regress/test_vm_bits.py index 46e90852a6..d9e59c71f4 100644 --- a/test_runner/regress/test_vm_bits.py +++ b/test_runner/regress/test_vm_bits.py @@ -3,6 +3,7 @@ from __future__ import annotations import time from contextlib import closing +import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, PgBin, fork_at_current_lsn from fixtures.utils import query_scalar @@ -294,6 +295,7 @@ def test_vm_bit_clear_on_heap_lock_blackbox(neon_env_builder: NeonEnvBuilder): cur.execute("commit transaction") +@pytest.mark.timeout(600) # slow in debug builds def test_check_visibility_map(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): """ Runs pgbench across a few databases on a sharded tenant, then performs a visibility map From b391b29bdc2af059261dc00ecfe13a98cf9f2e54 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Wed, 11 Dec 2024 16:21:42 -0600 Subject: [PATCH 017/583] Improve typing in test_runner/fixtures/httpserver.py (#10103) Signed-off-by: Tristan Partin --- test_runner/fixtures/httpserver.py | 15 ++++++++------- test_runner/regress/test_ddl_forwarding.py | 4 +++- test_runner/regress/test_download_extensions.py | 6 ++++-- .../regress/test_pageserver_metric_collection.py | 6 ++++-- .../regress/test_proxy_metric_collection.py | 6 +++++- test_runner/regress/test_sharding.py | 7 +++++-- test_runner/regress/test_storage_controller.py | 8 +++++--- .../regress/test_threshold_based_eviction.py | 6 +++++- 8 files changed, 39 insertions(+), 19 deletions(-) diff --git a/test_runner/fixtures/httpserver.py b/test_runner/fixtures/httpserver.py index f653fd804c..1f46bb22b2 100644 --- a/test_runner/fixtures/httpserver.py +++ b/test_runner/fixtures/httpserver.py @@ -7,24 +7,25 @@ from pytest_httpserver import HTTPServer if TYPE_CHECKING: from collections.abc import Iterator + from ssl import SSLContext from fixtures.port_distributor import PortDistributor -# TODO: mypy fails with: -# Module "fixtures.neon_fixtures" does not explicitly export attribute "PortDistributor" [attr-defined] -# from fixtures.neon_fixtures import PortDistributor + ListenAddress = tuple[str, int] # compared to the fixtures from pytest_httpserver with same names, these are # always function scoped, so you can check and stop the server in tests. 
@pytest.fixture(scope="function") -def httpserver_ssl_context(): - return None +def httpserver_ssl_context() -> Iterator[SSLContext | None]: + yield None @pytest.fixture(scope="function") -def make_httpserver(httpserver_listen_address, httpserver_ssl_context) -> Iterator[HTTPServer]: +def make_httpserver( + httpserver_listen_address: ListenAddress, httpserver_ssl_context: SSLContext | None +) -> Iterator[HTTPServer]: host, port = httpserver_listen_address if not host: host = HTTPServer.DEFAULT_LISTEN_HOST @@ -47,6 +48,6 @@ def httpserver(make_httpserver: HTTPServer) -> Iterator[HTTPServer]: @pytest.fixture(scope="function") -def httpserver_listen_address(port_distributor: PortDistributor) -> tuple[str, int]: +def httpserver_listen_address(port_distributor: PortDistributor) -> ListenAddress: port = port_distributor.get_port() return ("localhost", port) diff --git a/test_runner/regress/test_ddl_forwarding.py b/test_runner/regress/test_ddl_forwarding.py index 1c5554c379..de44bbcbc8 100644 --- a/test_runner/regress/test_ddl_forwarding.py +++ b/test_runner/regress/test_ddl_forwarding.py @@ -15,6 +15,8 @@ from werkzeug.wrappers.response import Response if TYPE_CHECKING: from typing import Any, Self + from fixtures.httpserver import ListenAddress + def handle_db(dbs, roles, operation): if operation["op"] == "set": @@ -120,7 +122,7 @@ class DdlForwardingContext: @pytest.fixture(scope="function") def ddl( - httpserver: HTTPServer, vanilla_pg: VanillaPostgres, httpserver_listen_address: tuple[str, int] + httpserver: HTTPServer, vanilla_pg: VanillaPostgres, httpserver_listen_address: ListenAddress ): (host, port) = httpserver_listen_address with DdlForwardingContext(httpserver, vanilla_pg, host, port) as ddl: diff --git a/test_runner/regress/test_download_extensions.py b/test_runner/regress/test_download_extensions.py index b2e19ad713..f18f4e78bd 100644 --- a/test_runner/regress/test_download_extensions.py +++ b/test_runner/regress/test_download_extensions.py @@ -20,6 +20,8 @@ from werkzeug.wrappers.response import Response if TYPE_CHECKING: from typing import Any + from fixtures.httpserver import ListenAddress + # use neon_env_builder_local fixture to override the default neon_env_builder fixture # and use a test-specific pg_install instead of shared one @@ -47,8 +49,8 @@ def neon_env_builder_local( def test_remote_extensions( httpserver: HTTPServer, neon_env_builder_local: NeonEnvBuilder, - httpserver_listen_address, - pg_version, + httpserver_listen_address: ListenAddress, + pg_version: PgVersion, ): # setup mock http server # that expects request for anon.tar.zst diff --git a/test_runner/regress/test_pageserver_metric_collection.py b/test_runner/regress/test_pageserver_metric_collection.py index 5ec8357597..aedfdbd210 100644 --- a/test_runner/regress/test_pageserver_metric_collection.py +++ b/test_runner/regress/test_pageserver_metric_collection.py @@ -27,6 +27,8 @@ from werkzeug.wrappers.response import Response if TYPE_CHECKING: from typing import Any + from fixtures.httpserver import ListenAddress + # TODO: collect all of the env setup *AFTER* removal of RemoteStorageKind.NOOP @@ -34,7 +36,7 @@ if TYPE_CHECKING: def test_metric_collection( httpserver: HTTPServer, neon_env_builder: NeonEnvBuilder, - httpserver_listen_address, + httpserver_listen_address: ListenAddress, ): (host, port) = httpserver_listen_address metric_collection_endpoint = f"http://{host}:{port}/billing/api/v1/usage_events" @@ -195,7 +197,7 @@ def test_metric_collection( def test_metric_collection_cleans_up_tempfile( 
httpserver: HTTPServer, neon_env_builder: NeonEnvBuilder, - httpserver_listen_address, + httpserver_listen_address: ListenAddress, ): (host, port) = httpserver_listen_address metric_collection_endpoint = f"http://{host}:{port}/billing/api/v1/usage_events" diff --git a/test_runner/regress/test_proxy_metric_collection.py b/test_runner/regress/test_proxy_metric_collection.py index dd63256388..5ff4a99c51 100644 --- a/test_runner/regress/test_proxy_metric_collection.py +++ b/test_runner/regress/test_proxy_metric_collection.py @@ -2,6 +2,7 @@ from __future__ import annotations from collections.abc import Iterator from pathlib import Path +from typing import TYPE_CHECKING import pytest from fixtures.log_helper import log @@ -15,6 +16,9 @@ from pytest_httpserver import HTTPServer from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response +if TYPE_CHECKING: + from fixtures.httpserver import ListenAddress + def proxy_metrics_handler(request: Request) -> Response: if request.json is None: @@ -38,7 +42,7 @@ def proxy_metrics_handler(request: Request) -> Response: def proxy_with_metric_collector( port_distributor: PortDistributor, neon_binpath: Path, - httpserver_listen_address, + httpserver_listen_address: ListenAddress, test_output_dir: Path, ) -> Iterator[NeonProxy]: """Neon proxy that routes through link auth and has metric collection enabled.""" diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 30abf91d3a..743ab0088b 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -3,7 +3,7 @@ from __future__ import annotations import os import time from collections import defaultdict -from typing import Any +from typing import TYPE_CHECKING, Any import pytest import requests @@ -27,6 +27,9 @@ from typing_extensions import override from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response +if TYPE_CHECKING: + from fixtures.httpserver import ListenAddress + def test_sharding_smoke( neon_env_builder: NeonEnvBuilder, @@ -759,7 +762,7 @@ def test_sharding_split_smoke( def test_sharding_split_stripe_size( neon_env_builder: NeonEnvBuilder, httpserver: HTTPServer, - httpserver_listen_address, + httpserver_listen_address: ListenAddress, initial_stripe_size: int, ): """ diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 02da389809..5f3a7b39d3 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -58,6 +58,8 @@ from werkzeug.wrappers.response import Response if TYPE_CHECKING: from typing import Any + from fixtures.httpserver import ListenAddress + def get_node_shard_counts(env: NeonEnv, tenant_ids): counts: defaultdict[int, int] = defaultdict(int) @@ -563,7 +565,7 @@ def test_storage_controller_onboard_detached(neon_env_builder: NeonEnvBuilder): def test_storage_controller_compute_hook( httpserver: HTTPServer, neon_env_builder: NeonEnvBuilder, - httpserver_listen_address, + httpserver_listen_address: ListenAddress, ): """ Test that the sharding service calls out to the configured HTTP endpoint on attachment changes @@ -681,7 +683,7 @@ NOTIFY_FAILURE_LOGS = [ def test_storage_controller_stuck_compute_hook( httpserver: HTTPServer, neon_env_builder: NeonEnvBuilder, - httpserver_listen_address, + httpserver_listen_address: ListenAddress, ): """ Test the migration process's behavior when the compute hook does not enable it to proceed @@ -818,7 
+820,7 @@ def test_storage_controller_stuck_compute_hook( def test_storage_controller_compute_hook_revert( httpserver: HTTPServer, neon_env_builder: NeonEnvBuilder, - httpserver_listen_address, + httpserver_listen_address: ListenAddress, ): """ 'revert' in the sense of a migration which gets reversed shortly after, as may happen during diff --git a/test_runner/regress/test_threshold_based_eviction.py b/test_runner/regress/test_threshold_based_eviction.py index bedbd84aee..c87b520366 100644 --- a/test_runner/regress/test_threshold_based_eviction.py +++ b/test_runner/regress/test_threshold_based_eviction.py @@ -2,6 +2,7 @@ from __future__ import annotations import time from dataclasses import dataclass +from typing import TYPE_CHECKING from fixtures.log_helper import log from fixtures.neon_fixtures import ( @@ -13,12 +14,15 @@ from fixtures.pageserver.http import LayerMapInfo from fixtures.remote_storage import RemoteStorageKind from pytest_httpserver import HTTPServer +if TYPE_CHECKING: + from fixtures.httpserver import ListenAddress + # NB: basic config change tests are in test_tenant_conf.py def test_threshold_based_eviction( httpserver: HTTPServer, - httpserver_listen_address, + httpserver_listen_address: ListenAddress, pg_bin: PgBin, neon_env_builder: NeonEnvBuilder, ): From 342cbea255aa601079ed01f91f902c8f2da8c3e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 12 Dec 2024 02:09:24 +0100 Subject: [PATCH 018/583] storcon: add safekeeper list API (#10089) This adds an API to the storage controller to list safekeepers registered to it. This PR does a `diesel print-schema > storage_controller/src/schema.rs` because of an inconsistency between up.sql and schema.rs, introduced by [this](https://github.com/neondatabase/neon/pull/8879/commits/2c142f14f7ba9bd9a178ae96723d196330eb34ba) commit, so there is some updates of `schema.rs` due to that. As a followup to this, we should maybe think about running `diesel print-schema` in CI. Part of #9981 --- storage_controller/src/http.rs | 41 +++++++++++++++---- storage_controller/src/persistence.rs | 17 ++++++++ storage_controller/src/schema.rs | 35 +++++++++------- storage_controller/src/service.rs | 6 +++ test_runner/fixtures/neon_fixtures.py | 10 +++++ .../regress/test_storage_controller.py | 5 +++ 6 files changed, 90 insertions(+), 24 deletions(-) diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index dce5380aa0..24fd4c341a 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -879,6 +879,21 @@ async fn handle_cancel_node_fill(req: Request) -> Result, A json_response(StatusCode::ACCEPTED, ()) } +async fn handle_safekeeper_list(req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Infra)?; + + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + + let state = get_state(&req); + let safekeepers = state.service.safekeepers_list().await?; + json_response(StatusCode::OK, safekeepers) +} + async fn handle_metadata_health_update(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Scrubber)?; @@ -1203,7 +1218,7 @@ impl From for ApiError { /// /// Not used by anything except manual testing. 
async fn handle_get_safekeeper(req: Request) -> Result, ApiError> { - check_permissions(&req, Scope::Admin)?; + check_permissions(&req, Scope::Infra)?; let id = parse_request_param::(&req, "id")?; @@ -1221,7 +1236,7 @@ async fn handle_get_safekeeper(req: Request) -> Result, Api match res { Ok(b) => json_response(StatusCode::OK, b), Err(crate::persistence::DatabaseError::Query(diesel::result::Error::NotFound)) => { - Err(ApiError::NotFound("unknown instance_id".into())) + Err(ApiError::NotFound("unknown instance id".into())) } Err(other) => Err(other.into()), } @@ -1817,6 +1832,21 @@ pub fn make_router( RequestName("control_v1_metadata_health_list_outdated"), ) }) + // Safekeepers + .get("/control/v1/safekeeper", |r| { + named_request_span( + r, + handle_safekeeper_list, + RequestName("control_v1_safekeeper_list"), + ) + }) + .get("/control/v1/safekeeper/:id", |r| { + named_request_span(r, handle_get_safekeeper, RequestName("v1_safekeeper")) + }) + .post("/control/v1/safekeeper/:id", |r| { + // id is in the body + named_request_span(r, handle_upsert_safekeeper, RequestName("v1_safekeeper")) + }) // Tenant Shard operations .put("/control/v1/tenant/:tenant_shard_id/migrate", |r| { tenant_service_handler( @@ -1869,13 +1899,6 @@ pub fn make_router( .put("/control/v1/step_down", |r| { named_request_span(r, handle_step_down, RequestName("control_v1_step_down")) }) - .get("/control/v1/safekeeper/:id", |r| { - named_request_span(r, handle_get_safekeeper, RequestName("v1_safekeeper")) - }) - .post("/control/v1/safekeeper/:id", |r| { - // id is in the body - named_request_span(r, handle_upsert_safekeeper, RequestName("v1_safekeeper")) - }) // Tenant operations // The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into // this service to manage tenants that actually consist of many tenant shards, as if they are a single entity. diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 7ca80c7dfe..e17fe78d25 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -104,6 +104,7 @@ pub(crate) enum DatabaseOperation { ListMetadataHealth, ListMetadataHealthUnhealthy, ListMetadataHealthOutdated, + ListSafekeepers, GetLeader, UpdateLeader, SetPreferredAzs, @@ -1011,6 +1012,22 @@ impl Persistence { Ok(()) } + /// At startup, populate the list of nodes which our shards may be placed on + pub(crate) async fn list_safekeepers(&self) -> DatabaseResult> { + let safekeepers: Vec = self + .with_measured_conn( + DatabaseOperation::ListNodes, + move |conn| -> DatabaseResult<_> { + Ok(crate::schema::safekeepers::table.load::(conn)?) + }, + ) + .await?; + + tracing::info!("list_safekeepers: loaded {} nodes", safekeepers.len()); + + Ok(safekeepers) + } + pub(crate) async fn safekeeper_get( &self, id: i64, diff --git a/storage_controller/src/schema.rs b/storage_controller/src/schema.rs index 1717a9369d..9e005ab932 100644 --- a/storage_controller/src/schema.rs +++ b/storage_controller/src/schema.rs @@ -29,6 +29,19 @@ diesel::table! { } } +diesel::table! { + safekeepers (id) { + id -> Int8, + region_id -> Text, + version -> Int8, + host -> Text, + port -> Int4, + active -> Bool, + http_port -> Int4, + availability_zone_id -> Text, + } +} + diesel::table! { tenant_shards (tenant_id, shard_number, shard_count) { tenant_id -> Varchar, @@ -45,18 +58,10 @@ diesel::table! { } } -diesel::allow_tables_to_appear_in_same_query!(controllers, metadata_health, nodes, tenant_shards,); - -diesel::table! 
{ - safekeepers { - id -> Int8, - region_id -> Text, - version -> Int8, - instance_id -> Text, - host -> Text, - port -> Int4, - active -> Bool, - http_port -> Int4, - availability_zone_id -> Text, - } -} +diesel::allow_tables_to_appear_in_same_query!( + controllers, + metadata_health, + nodes, + safekeepers, + tenant_shards, +); diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 2600500a53..894b67fdc6 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -7185,6 +7185,12 @@ impl Service { global_observed } + pub(crate) async fn safekeepers_list( + &self, + ) -> Result, DatabaseError> { + self.persistence.list_safekeepers().await + } + pub(crate) async fn get_safekeeper( &self, id: i64, diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 8354432c0c..0ecc324030 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2329,6 +2329,16 @@ class NeonStorageController(MetricsGetter, LogUtils): return None raise e + def get_safekeepers(self) -> list[dict[str, Any]]: + response = self.request( + "GET", + f"{self.api}/control/v1/safekeeper", + headers=self.headers(TokenScope.ADMIN), + ) + json = response.json() + assert isinstance(json, list) + return json + def set_preferred_azs(self, preferred_azs: dict[TenantShardId, str]) -> list[TenantShardId]: response = self.request( "PUT", diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 5f3a7b39d3..ae9b596a1b 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -2955,6 +2955,8 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder): assert target.get_safekeeper(fake_id) is None + assert len(target.get_safekeepers()) == 0 + body = { "active": True, "id": fake_id, @@ -2972,6 +2974,7 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder): inserted = target.get_safekeeper(fake_id) assert inserted is not None + assert target.get_safekeepers() == [inserted] assert eq_safekeeper_records(body, inserted) # error out if pk is changed (unexpected) @@ -2983,6 +2986,7 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder): assert exc.value.status_code == 400 inserted_again = target.get_safekeeper(fake_id) + assert target.get_safekeepers() == [inserted_again] assert inserted_again is not None assert eq_safekeeper_records(inserted, inserted_again) @@ -2991,6 +2995,7 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder): body["version"] += 1 target.on_safekeeper_deploy(fake_id, body) inserted_now = target.get_safekeeper(fake_id) + assert target.get_safekeepers() == [inserted_now] assert inserted_now is not None assert eq_safekeeper_records(body, inserted_now) From 739f627b96b3b0e00479f90621559375b3382582 Mon Sep 17 00:00:00 2001 From: Misha Sakhnov Date: Thu, 12 Dec 2024 10:45:52 +0200 Subject: [PATCH 019/583] Bump vm-builder v0.35.0 -> v0.37.1 (#10015) Bump version to pick up changes introduced in the neonvm-daemon to support sys fs based CPU scaling (https://github.com/neondatabase/autoscaling/issues/1082). 
Previous update: https://github.com/neondatabase/neon/pull/9208 --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index ee22f2ff54..67d59c7da1 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -797,7 +797,7 @@ jobs: - pg: v17 debian: bookworm env: - VM_BUILDER_VERSION: v0.35.0 + VM_BUILDER_VERSION: v0.37.1 steps: - uses: actions/checkout@v4 From 0bd8eca9ca98ffbcaa403a47b2e6dfc2961fafa1 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 12 Dec 2024 09:18:50 +0000 Subject: [PATCH 020/583] Storage: create release PRs On Fridays (#10017) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem To give Storage more time on preprod — create a release branch on Friday ## Summary of changes - Automatically create Storage release PR on Friday instead of Monday --- .github/workflows/release.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index f0273b977f..3c1af1d9c6 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -3,7 +3,7 @@ name: Create Release Branch on: schedule: # It should be kept in sync with if-condition in jobs - - cron: '0 6 * * MON' # Storage release + - cron: '0 6 * * FRI' # Storage release - cron: '0 6 * * THU' # Proxy release workflow_dispatch: inputs: @@ -29,7 +29,7 @@ defaults: jobs: create-storage-release-branch: - if: ${{ github.event.schedule == '0 6 * * MON' || inputs.create-storage-release-branch }} + if: ${{ github.event.schedule == '0 6 * * FRI' || inputs.create-storage-release-branch }} permissions: contents: write From ec0ce06c164dcbd301acf1bc96822b6aa542d173 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 12 Dec 2024 10:53:10 +0000 Subject: [PATCH 021/583] tests: default interpreted proto in tests (#10079) ## Problem We aren't using the sharded interpreted wal receiver protocol in all tests. ## Summary of changes Default to the interpreted protocol. --- test_runner/fixtures/neon_fixtures.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 0ecc324030..0b7cdec50d 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -435,7 +435,10 @@ class NeonEnvBuilder: self.pageserver_virtual_file_io_mode = pageserver_virtual_file_io_mode - self.pageserver_wal_receiver_protocol = pageserver_wal_receiver_protocol + if pageserver_wal_receiver_protocol is not None: + self.pageserver_wal_receiver_protocol = pageserver_wal_receiver_protocol + else: + self.pageserver_wal_receiver_protocol = PageserverWalReceiverProtocol.INTERPRETED assert test_name.startswith( "test_" From c9a773af37207979955ba08e40edcf2eb927745d Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 12 Dec 2024 14:57:00 +0300 Subject: [PATCH 022/583] Fix test_subscriber_synchronous_commit flakiness. (#10057) 6f7aeaa configured LFC for USE_LFC case, but omitted setting shared_buffers for non USE_LFC, causing flakiness. 
ref https://github.com/neondatabase/neon/issues/9989 --- .../regress/test_logical_replication.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py index db18e1758c..8908763109 100644 --- a/test_runner/regress/test_logical_replication.py +++ b/test_runner/regress/test_logical_replication.py @@ -573,17 +573,18 @@ def test_subscriber_synchronous_commit(neon_simple_env: NeonEnv, vanilla_pg: Van vanilla_pg.safe_psql("create extension neon;") env.create_branch("subscriber") - # We want all data to fit into shared_buffers because later we stop - # safekeeper and insert more; this shouldn't cause page requests as they - # will be stuck. + # We want all data to fit into shared_buffers or LFC cache because later we + # stop safekeeper and insert more; this shouldn't cause page requests as + # they will be stuck. + if USE_LFC: + config_lines = ["neon.max_file_cache_size = 32MB", "neon.file_cache_size_limit = 32MB"] + else: + config_lines = [ + "shared_buffers = 32MB", + ] sub = env.endpoints.create( "subscriber", - config_lines=[ - "neon.max_file_cache_size = 32MB", - "neon.file_cache_size_limit = 32MB", - ] - if USE_LFC - else [], + config_lines=config_lines, ) sub.start() From e502e880b542e54b13f29a45ec74d36c93361520 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Thu, 12 Dec 2024 13:42:50 +0000 Subject: [PATCH 023/583] chore(proxy): remove code for old API (#10109) ## Problem Now that https://github.com/neondatabase/cloud/issues/15245 is done, we can remove the old code. ## Summary of changes Removes support for the ManagementV2 API, in favour of the ProxyV1 API. --- proxy/src/auth/backend/mod.rs | 4 - proxy/src/bin/proxy.rs | 97 +---- proxy/src/control_plane/client/mod.rs | 9 +- proxy/src/control_plane/client/neon.rs | 511 ------------------------- proxy/src/control_plane/messages.rs | 22 +- test_runner/fixtures/neon_fixtures.py | 14 +- 6 files changed, 12 insertions(+), 645 deletions(-) delete mode 100644 proxy/src/control_plane/client/neon.rs diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs index 1bad7b3086..f38ecf715f 100644 --- a/proxy/src/auth/backend/mod.rs +++ b/proxy/src/auth/backend/mod.rs @@ -74,10 +74,6 @@ impl std::fmt::Display for Backend<'_, ()> { .debug_tuple("ControlPlane::ProxyV1") .field(&endpoint.url()) .finish(), - ControlPlaneClient::Neon(endpoint) => fmt - .debug_tuple("ControlPlane::Neon") - .field(&endpoint.url()) - .finish(), #[cfg(any(test, feature = "testing"))] ControlPlaneClient::PostgresMock(endpoint) => fmt .debug_tuple("ControlPlane::PostgresMock") diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 99144acef0..97c4037009 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -43,9 +43,6 @@ static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; #[derive(Clone, Debug, ValueEnum)] enum AuthBackendType { - #[value(name("console"), alias("cplane"))] - ControlPlane, - #[value(name("cplane-v1"), alias("control-plane"))] ControlPlaneV1, @@ -488,40 +485,7 @@ async fn main() -> anyhow::Result<()> { } if let Either::Left(auth::Backend::ControlPlane(api, _)) = &auth_backend { - if let proxy::control_plane::client::ControlPlaneClient::Neon(api) = &**api { - match (redis_notifications_client, regional_redis_client.clone()) { - (None, None) => {} - (client1, client2) => { - let cache = api.caches.project_info.clone(); - if let Some(client) = client1 { - 
maintenance_tasks.spawn(notifications::task_main( - client, - cache.clone(), - cancel_map.clone(), - args.region.clone(), - )); - } - if let Some(client) = client2 { - maintenance_tasks.spawn(notifications::task_main( - client, - cache.clone(), - cancel_map.clone(), - args.region.clone(), - )); - } - maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); - } - } - if let Some(regional_redis_client) = regional_redis_client { - let cache = api.caches.endpoints_cache.clone(); - let con = regional_redis_client; - let span = tracing::info_span!("endpoints_cache"); - maintenance_tasks.spawn( - async move { cache.do_read(con, cancellation_token.clone()).await } - .instrument(span), - ); - } - } else if let proxy::control_plane::client::ControlPlaneClient::ProxyV1(api) = &**api { + if let proxy::control_plane::client::ControlPlaneClient::ProxyV1(api) = &**api { match (redis_notifications_client, regional_redis_client.clone()) { (None, None) => {} (client1, client2) => { @@ -757,65 +721,6 @@ fn build_auth_backend( Ok(Either::Left(config)) } - AuthBackendType::ControlPlane => { - let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; - let project_info_cache_config: ProjectInfoCacheOptions = - args.project_info_cache.parse()?; - let endpoint_cache_config: config::EndpointCacheConfig = - args.endpoint_cache_config.parse()?; - - info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}"); - info!( - "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}" - ); - info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}"); - let caches = Box::leak(Box::new(control_plane::caches::ApiCaches::new( - wake_compute_cache_config, - project_info_cache_config, - endpoint_cache_config, - ))); - - let config::ConcurrencyLockOptions { - shards, - limiter, - epoch, - timeout, - } = args.wake_compute_lock.parse()?; - info!(?limiter, shards, ?epoch, "Using NodeLocks (wake_compute)"); - let locks = Box::leak(Box::new(control_plane::locks::ApiLocks::new( - "wake_compute_lock", - limiter, - shards, - timeout, - epoch, - &Metrics::get().wake_compute_lock, - )?)); - tokio::spawn(locks.garbage_collect_worker()); - - let url: proxy::url::ApiUrl = args.auth_endpoint.parse()?; - - let endpoint = http::Endpoint::new(url, http::new_client()); - - let mut wake_compute_rps_limit = args.wake_compute_limit.clone(); - RateBucketInfo::validate(&mut wake_compute_rps_limit)?; - let wake_compute_endpoint_rate_limiter = - Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit)); - - let api = control_plane::client::neon::NeonControlPlaneClient::new( - endpoint, - args.control_plane_token.clone(), - caches, - locks, - wake_compute_endpoint_rate_limiter, - ); - let api = control_plane::client::ControlPlaneClient::Neon(api); - let auth_backend = auth::Backend::ControlPlane(MaybeOwned::Owned(api), ()); - - let config = Box::leak(Box::new(auth_backend)); - - Ok(Either::Left(config)) - } - #[cfg(feature = "testing")] AuthBackendType::Postgres => { let url = args.auth_endpoint.parse()?; diff --git a/proxy/src/control_plane/client/mod.rs b/proxy/src/control_plane/client/mod.rs index 7ef5a9c9fd..d559d96bbc 100644 --- a/proxy/src/control_plane/client/mod.rs +++ b/proxy/src/control_plane/client/mod.rs @@ -1,7 +1,6 @@ pub mod cplane_proxy_v1; #[cfg(any(test, feature = "testing"))] pub mod mock; -pub mod neon; use std::hash::Hash; use std::sync::Arc; @@ -28,10 +27,8 @@ use crate::types::EndpointId; #[non_exhaustive] #[derive(Clone)] pub 
enum ControlPlaneClient { - /// New Proxy V1 control plane API + /// Proxy V1 control plane API ProxyV1(cplane_proxy_v1::NeonControlPlaneClient), - /// Current Management API (V2). - Neon(neon::NeonControlPlaneClient), /// Local mock control plane. #[cfg(any(test, feature = "testing"))] PostgresMock(mock::MockControlPlane), @@ -49,7 +46,6 @@ impl ControlPlaneApi for ControlPlaneClient { ) -> Result { match self { Self::ProxyV1(api) => api.get_role_secret(ctx, user_info).await, - Self::Neon(api) => api.get_role_secret(ctx, user_info).await, #[cfg(any(test, feature = "testing"))] Self::PostgresMock(api) => api.get_role_secret(ctx, user_info).await, #[cfg(test)] @@ -66,7 +62,6 @@ impl ControlPlaneApi for ControlPlaneClient { ) -> Result<(CachedAllowedIps, Option), errors::GetAuthInfoError> { match self { Self::ProxyV1(api) => api.get_allowed_ips_and_secret(ctx, user_info).await, - Self::Neon(api) => api.get_allowed_ips_and_secret(ctx, user_info).await, #[cfg(any(test, feature = "testing"))] Self::PostgresMock(api) => api.get_allowed_ips_and_secret(ctx, user_info).await, #[cfg(test)] @@ -81,7 +76,6 @@ impl ControlPlaneApi for ControlPlaneClient { ) -> Result, errors::GetEndpointJwksError> { match self { Self::ProxyV1(api) => api.get_endpoint_jwks(ctx, endpoint).await, - Self::Neon(api) => api.get_endpoint_jwks(ctx, endpoint).await, #[cfg(any(test, feature = "testing"))] Self::PostgresMock(api) => api.get_endpoint_jwks(ctx, endpoint).await, #[cfg(test)] @@ -96,7 +90,6 @@ impl ControlPlaneApi for ControlPlaneClient { ) -> Result { match self { Self::ProxyV1(api) => api.wake_compute(ctx, user_info).await, - Self::Neon(api) => api.wake_compute(ctx, user_info).await, #[cfg(any(test, feature = "testing"))] Self::PostgresMock(api) => api.wake_compute(ctx, user_info).await, #[cfg(test)] diff --git a/proxy/src/control_plane/client/neon.rs b/proxy/src/control_plane/client/neon.rs deleted file mode 100644 index bf62c0d6ab..0000000000 --- a/proxy/src/control_plane/client/neon.rs +++ /dev/null @@ -1,511 +0,0 @@ -//! Stale console backend, remove after migrating to Proxy V1 API (#15245). 
- -use std::sync::Arc; -use std::time::Duration; - -use ::http::header::AUTHORIZATION; -use ::http::HeaderName; -use futures::TryFutureExt; -use postgres_client::config::SslMode; -use tokio::time::Instant; -use tracing::{debug, info, info_span, warn, Instrument}; - -use super::super::messages::{ControlPlaneErrorMessage, GetRoleSecret, WakeCompute}; -use crate::auth::backend::jwt::AuthRule; -use crate::auth::backend::ComputeUserInfo; -use crate::cache::Cached; -use crate::context::RequestContext; -use crate::control_plane::caches::ApiCaches; -use crate::control_plane::errors::{ - ControlPlaneError, GetAuthInfoError, GetEndpointJwksError, WakeComputeError, -}; -use crate::control_plane::locks::ApiLocks; -use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, Reason}; -use crate::control_plane::{ - AuthInfo, AuthSecret, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, NodeInfo, -}; -use crate::metrics::{CacheOutcome, Metrics}; -use crate::rate_limiter::WakeComputeRateLimiter; -use crate::types::{EndpointCacheKey, EndpointId}; -use crate::{compute, http, scram}; - -const X_REQUEST_ID: HeaderName = HeaderName::from_static("x-request-id"); - -#[derive(Clone)] -pub struct NeonControlPlaneClient { - endpoint: http::Endpoint, - pub caches: &'static ApiCaches, - pub(crate) locks: &'static ApiLocks, - pub(crate) wake_compute_endpoint_rate_limiter: Arc, - // put in a shared ref so we don't copy secrets all over in memory - jwt: Arc, -} - -impl NeonControlPlaneClient { - /// Construct an API object containing the auth parameters. - pub fn new( - endpoint: http::Endpoint, - jwt: Arc, - caches: &'static ApiCaches, - locks: &'static ApiLocks, - wake_compute_endpoint_rate_limiter: Arc, - ) -> Self { - Self { - endpoint, - caches, - locks, - wake_compute_endpoint_rate_limiter, - jwt, - } - } - - pub(crate) fn url(&self) -> &str { - self.endpoint.url().as_str() - } - - async fn do_get_auth_info( - &self, - ctx: &RequestContext, - user_info: &ComputeUserInfo, - ) -> Result { - if !self - .caches - .endpoints_cache - .is_valid(ctx, &user_info.endpoint.normalize()) - { - // TODO: refactor this because it's weird - // this is a failure to authenticate but we return Ok. - info!("endpoint is not valid, skipping the request"); - return Ok(AuthInfo::default()); - } - let request_id = ctx.session_id().to_string(); - let application_name = ctx.console_application_name(); - async { - let request = self - .endpoint - .get_path("proxy_get_role_secret") - .header(X_REQUEST_ID, &request_id) - .header(AUTHORIZATION, format!("Bearer {}", &self.jwt)) - .query(&[("session_id", ctx.session_id())]) - .query(&[ - ("application_name", application_name.as_str()), - ("project", user_info.endpoint.as_str()), - ("role", user_info.user.as_str()), - ]) - .build()?; - - debug!(url = request.url().as_str(), "sending http request"); - let start = Instant::now(); - let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane); - let response = self.endpoint.execute(request).await?; - drop(pause); - info!(duration = ?start.elapsed(), "received http response"); - let body = match parse_body::(response).await { - Ok(body) => body, - // Error 404 is special: it's ok not to have a secret. - // TODO(anna): retry - Err(e) => { - return if e.get_reason().is_not_found() { - // TODO: refactor this because it's weird - // this is a failure to authenticate but we return Ok. 
- Ok(AuthInfo::default()) - } else { - Err(e.into()) - }; - } - }; - - let secret = if body.role_secret.is_empty() { - None - } else { - let secret = scram::ServerSecret::parse(&body.role_secret) - .map(AuthSecret::Scram) - .ok_or(GetAuthInfoError::BadSecret)?; - Some(secret) - }; - let allowed_ips = body.allowed_ips.unwrap_or_default(); - Metrics::get() - .proxy - .allowed_ips_number - .observe(allowed_ips.len() as f64); - Ok(AuthInfo { - secret, - allowed_ips, - project_id: body.project_id, - }) - } - .inspect_err(|e| tracing::debug!(error = ?e)) - .instrument(info_span!("do_get_auth_info")) - .await - } - - async fn do_get_endpoint_jwks( - &self, - ctx: &RequestContext, - endpoint: EndpointId, - ) -> Result, GetEndpointJwksError> { - if !self - .caches - .endpoints_cache - .is_valid(ctx, &endpoint.normalize()) - { - return Err(GetEndpointJwksError::EndpointNotFound); - } - let request_id = ctx.session_id().to_string(); - async { - let request = self - .endpoint - .get_with_url(|url| { - url.path_segments_mut() - .push("endpoints") - .push(endpoint.as_str()) - .push("jwks"); - }) - .header(X_REQUEST_ID, &request_id) - .header(AUTHORIZATION, format!("Bearer {}", &self.jwt)) - .query(&[("session_id", ctx.session_id())]) - .build() - .map_err(GetEndpointJwksError::RequestBuild)?; - - debug!(url = request.url().as_str(), "sending http request"); - let start = Instant::now(); - let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane); - let response = self - .endpoint - .execute(request) - .await - .map_err(GetEndpointJwksError::RequestExecute)?; - drop(pause); - info!(duration = ?start.elapsed(), "received http response"); - - let body = parse_body::(response).await?; - - let rules = body - .jwks - .into_iter() - .map(|jwks| AuthRule { - id: jwks.id, - jwks_url: jwks.jwks_url, - audience: jwks.jwt_audience, - role_names: jwks.role_names, - }) - .collect(); - - Ok(rules) - } - .inspect_err(|e| tracing::debug!(error = ?e)) - .instrument(info_span!("do_get_endpoint_jwks")) - .await - } - - async fn do_wake_compute( - &self, - ctx: &RequestContext, - user_info: &ComputeUserInfo, - ) -> Result { - let request_id = ctx.session_id().to_string(); - let application_name = ctx.console_application_name(); - async { - let mut request_builder = self - .endpoint - .get_path("proxy_wake_compute") - .header("X-Request-ID", &request_id) - .header("Authorization", format!("Bearer {}", &self.jwt)) - .query(&[("session_id", ctx.session_id())]) - .query(&[ - ("application_name", application_name.as_str()), - ("project", user_info.endpoint.as_str()), - ]); - - let options = user_info.options.to_deep_object(); - if !options.is_empty() { - request_builder = request_builder.query(&options); - } - - let request = request_builder.build()?; - - debug!(url = request.url().as_str(), "sending http request"); - let start = Instant::now(); - let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane); - let response = self.endpoint.execute(request).await?; - drop(pause); - info!(duration = ?start.elapsed(), "received http response"); - let body = parse_body::(response).await?; - - // Unfortunately, ownership won't let us use `Option::ok_or` here. - let (host, port) = match parse_host_port(&body.address) { - None => return Err(WakeComputeError::BadComputeAddress(body.address)), - Some(x) => x, - }; - - // Don't set anything but host and port! This config will be cached. - // We'll set username and such later using the startup message. - // TODO: add more type safety (in progress). 
- let mut config = compute::ConnCfg::new(host.to_owned(), port); - config.ssl_mode(SslMode::Disable); // TLS is not configured on compute nodes. - - let node = NodeInfo { - config, - aux: body.aux, - allow_self_signed_compute: false, - }; - - Ok(node) - } - .inspect_err(|e| tracing::debug!(error = ?e)) - .instrument(info_span!("do_wake_compute")) - .await - } -} - -impl super::ControlPlaneApi for NeonControlPlaneClient { - #[tracing::instrument(skip_all)] - async fn get_role_secret( - &self, - ctx: &RequestContext, - user_info: &ComputeUserInfo, - ) -> Result { - let normalized_ep = &user_info.endpoint.normalize(); - let user = &user_info.user; - if let Some(role_secret) = self - .caches - .project_info - .get_role_secret(normalized_ep, user) - { - return Ok(role_secret); - } - let auth_info = self.do_get_auth_info(ctx, user_info).await?; - if let Some(project_id) = auth_info.project_id { - let normalized_ep_int = normalized_ep.into(); - self.caches.project_info.insert_role_secret( - project_id, - normalized_ep_int, - user.into(), - auth_info.secret.clone(), - ); - self.caches.project_info.insert_allowed_ips( - project_id, - normalized_ep_int, - Arc::new(auth_info.allowed_ips), - ); - ctx.set_project_id(project_id); - } - // When we just got a secret, we don't need to invalidate it. - Ok(Cached::new_uncached(auth_info.secret)) - } - - async fn get_allowed_ips_and_secret( - &self, - ctx: &RequestContext, - user_info: &ComputeUserInfo, - ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { - let normalized_ep = &user_info.endpoint.normalize(); - if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(normalized_ep) { - Metrics::get() - .proxy - .allowed_ips_cache_misses - .inc(CacheOutcome::Hit); - return Ok((allowed_ips, None)); - } - Metrics::get() - .proxy - .allowed_ips_cache_misses - .inc(CacheOutcome::Miss); - let auth_info = self.do_get_auth_info(ctx, user_info).await?; - let allowed_ips = Arc::new(auth_info.allowed_ips); - let user = &user_info.user; - if let Some(project_id) = auth_info.project_id { - let normalized_ep_int = normalized_ep.into(); - self.caches.project_info.insert_role_secret( - project_id, - normalized_ep_int, - user.into(), - auth_info.secret.clone(), - ); - self.caches.project_info.insert_allowed_ips( - project_id, - normalized_ep_int, - allowed_ips.clone(), - ); - ctx.set_project_id(project_id); - } - Ok(( - Cached::new_uncached(allowed_ips), - Some(Cached::new_uncached(auth_info.secret)), - )) - } - - #[tracing::instrument(skip_all)] - async fn get_endpoint_jwks( - &self, - ctx: &RequestContext, - endpoint: EndpointId, - ) -> Result, GetEndpointJwksError> { - self.do_get_endpoint_jwks(ctx, endpoint).await - } - - #[tracing::instrument(skip_all)] - async fn wake_compute( - &self, - ctx: &RequestContext, - user_info: &ComputeUserInfo, - ) -> Result { - let key = user_info.endpoint_cache_key(); - - macro_rules! 
check_cache { - () => { - if let Some(cached) = self.caches.node_info.get(&key) { - let (cached, info) = cached.take_value(); - let info = info.map_err(|c| { - info!(key = &*key, "found cached wake_compute error"); - WakeComputeError::ControlPlane(ControlPlaneError::Message(Box::new(*c))) - })?; - - debug!(key = &*key, "found cached compute node info"); - ctx.set_project(info.aux.clone()); - return Ok(cached.map(|()| info)); - } - }; - } - - // Every time we do a wakeup http request, the compute node will stay up - // for some time (highly depends on the console's scale-to-zero policy); - // The connection info remains the same during that period of time, - // which means that we might cache it to reduce the load and latency. - check_cache!(); - - let permit = self.locks.get_permit(&key).await?; - - // after getting back a permit - it's possible the cache was filled - // double check - if permit.should_check_cache() { - // TODO: if there is something in the cache, mark the permit as success. - check_cache!(); - } - - // check rate limit - if !self - .wake_compute_endpoint_rate_limiter - .check(user_info.endpoint.normalize_intern(), 1) - { - return Err(WakeComputeError::TooManyConnections); - } - - let node = permit.release_result(self.do_wake_compute(ctx, user_info).await); - match node { - Ok(node) => { - ctx.set_project(node.aux.clone()); - debug!(key = &*key, "created a cache entry for woken compute node"); - - let mut stored_node = node.clone(); - // store the cached node as 'warm_cached' - stored_node.aux.cold_start_info = ColdStartInfo::WarmCached; - - let (_, cached) = self.caches.node_info.insert_unit(key, Ok(stored_node)); - - Ok(cached.map(|()| node)) - } - Err(err) => match err { - WakeComputeError::ControlPlane(ControlPlaneError::Message(err)) => { - let Some(status) = &err.status else { - return Err(WakeComputeError::ControlPlane(ControlPlaneError::Message( - err, - ))); - }; - - let reason = status - .details - .error_info - .map_or(Reason::Unknown, |x| x.reason); - - // if we can retry this error, do not cache it. - if reason.can_retry() { - return Err(WakeComputeError::ControlPlane(ControlPlaneError::Message( - err, - ))); - } - - // at this point, we should only have quota errors. - debug!( - key = &*key, - "created a cache entry for the wake compute error" - ); - - self.caches.node_info.insert_ttl( - key, - Err(err.clone()), - Duration::from_secs(30), - ); - - Err(WakeComputeError::ControlPlane(ControlPlaneError::Message( - err, - ))) - } - err => return Err(err), - }, - } - } -} - -/// Parse http response body, taking status code into account. -async fn parse_body serde::Deserialize<'a>>( - response: http::Response, -) -> Result { - let status = response.status(); - if status.is_success() { - // We shouldn't log raw body because it may contain secrets. - info!("request succeeded, processing the body"); - return Ok(response.json().await?); - } - let s = response.bytes().await?; - // Log plaintext to be able to detect, whether there are some cases not covered by the error struct. - info!("response_error plaintext: {:?}", s); - - // Don't throw an error here because it's not as important - // as the fact that the request itself has failed. 
- let mut body = serde_json::from_slice(&s).unwrap_or_else(|e| { - warn!("failed to parse error body: {e}"); - ControlPlaneErrorMessage { - error: "reason unclear (malformed error message)".into(), - http_status_code: status, - status: None, - } - }); - body.http_status_code = status; - - warn!("console responded with an error ({status}): {body:?}"); - Err(ControlPlaneError::Message(Box::new(body))) -} - -fn parse_host_port(input: &str) -> Option<(&str, u16)> { - let (host, port) = input.rsplit_once(':')?; - let ipv6_brackets: &[_] = &['[', ']']; - Some((host.trim_matches(ipv6_brackets), port.parse().ok()?)) -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_parse_host_port_v4() { - let (host, port) = parse_host_port("127.0.0.1:5432").expect("failed to parse"); - assert_eq!(host, "127.0.0.1"); - assert_eq!(port, 5432); - } - - #[test] - fn test_parse_host_port_v6() { - let (host, port) = parse_host_port("[2001:db8::1]:5432").expect("failed to parse"); - assert_eq!(host, "2001:db8::1"); - assert_eq!(port, 5432); - } - - #[test] - fn test_parse_host_port_url() { - let (host, port) = parse_host_port("compute-foo-bar-1234.default.svc.cluster.local:5432") - .expect("failed to parse"); - assert_eq!(host, "compute-foo-bar-1234.default.svc.cluster.local"); - assert_eq!(port, 5432); - } -} diff --git a/proxy/src/control_plane/messages.rs b/proxy/src/control_plane/messages.rs index 2662ab85f9..d068614b24 100644 --- a/proxy/src/control_plane/messages.rs +++ b/proxy/src/control_plane/messages.rs @@ -221,15 +221,6 @@ pub(crate) struct UserFacingMessage { pub(crate) message: Box, } -/// Response which holds client's auth secret, e.g. [`crate::scram::ServerSecret`]. -/// Returned by the `/proxy_get_role_secret` API method. -#[derive(Deserialize)] -pub(crate) struct GetRoleSecret { - pub(crate) role_secret: Box, - pub(crate) allowed_ips: Option>, - pub(crate) project_id: Option, -} - /// Response which holds client's auth secret, e.g. [`crate::scram::ServerSecret`]. /// Returned by the `/get_endpoint_access_control` API method. #[derive(Deserialize)] @@ -240,13 +231,6 @@ pub(crate) struct GetEndpointAccessControl { pub(crate) allowed_vpc_endpoint_ids: Option>, } -// Manually implement debug to omit sensitive info. -impl fmt::Debug for GetRoleSecret { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("GetRoleSecret").finish_non_exhaustive() - } -} - /// Response which holds compute node's `host:port` pair. /// Returned by the `/proxy_wake_compute` API method. 
#[derive(Debug, Deserialize)] @@ -477,18 +461,18 @@ mod tests { let json = json!({ "role_secret": "secret", }); - serde_json::from_str::(&json.to_string())?; + serde_json::from_str::(&json.to_string())?; let json = json!({ "role_secret": "secret", "allowed_ips": ["8.8.8.8"], }); - serde_json::from_str::(&json.to_string())?; + serde_json::from_str::(&json.to_string())?; let json = json!({ "role_secret": "secret", "allowed_ips": ["8.8.8.8"], "project_id": "project", }); - serde_json::from_str::(&json.to_string())?; + serde_json::from_str::(&json.to_string())?; Ok(()) } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 0b7cdec50d..13ada1361e 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3222,7 +3222,7 @@ class NeonProxy(PgProtocol): *["--allow-self-signed-compute", "true"], ] - class Console(AuthBackend): + class ProxyV1(AuthBackend): def __init__(self, endpoint: str, fixed_rate_limit: int | None = None): self.endpoint = endpoint self.fixed_rate_limit = fixed_rate_limit @@ -3230,7 +3230,7 @@ class NeonProxy(PgProtocol): def extra_args(self) -> list[str]: args = [ # Console auth backend params - *["--auth-backend", "console"], + *["--auth-backend", "cplane-v1"], *["--auth-endpoint", self.endpoint], *["--sql-over-http-pool-opt-in", "false"], ] @@ -3478,13 +3478,13 @@ class NeonProxy(PgProtocol): class NeonAuthBroker: - class ControlPlane: + class ProxyV1: def __init__(self, endpoint: str): self.endpoint = endpoint def extra_args(self) -> list[str]: args = [ - *["--auth-backend", "console"], + *["--auth-backend", "cplane-v1"], *["--auth-endpoint", self.endpoint], ] return args @@ -3496,7 +3496,7 @@ class NeonAuthBroker: http_port: int, mgmt_port: int, external_http_port: int, - auth_backend: NeonAuthBroker.ControlPlane, + auth_backend: NeonAuthBroker.ProxyV1, ): self.domain = "apiauth.localtest.me" # resolves to 127.0.0.1 self.host = "127.0.0.1" @@ -3682,7 +3682,7 @@ def static_auth_broker( local_proxy_addr = f"{http2_echoserver.host}:{http2_echoserver.port}" # return local_proxy addr on ProxyWakeCompute. - httpserver.expect_request("/cplane/proxy_wake_compute").respond_with_json( + httpserver.expect_request("/cplane/wake_compute").respond_with_json( { "address": local_proxy_addr, "aux": { @@ -3722,7 +3722,7 @@ def static_auth_broker( http_port=http_port, mgmt_port=mgmt_port, external_http_port=external_http_port, - auth_backend=NeonAuthBroker.ControlPlane(httpserver.url_for("/cplane")), + auth_backend=NeonAuthBroker.ProxyV1(httpserver.url_for("/cplane")), ) as proxy: proxy.start() yield proxy From 58d45c6e86b88b9693e8ab4dd1e8d402b51f1a4d Mon Sep 17 00:00:00 2001 From: Rahul Patil Date: Thu, 12 Dec 2024 16:13:08 +0100 Subject: [PATCH 024/583] ci(fix): Use OIDC auth to login on ECR (#10055) ## Problem CI currently uses static credentials in some places. These are less secure and hard to maintain, so we are going to deprecate them and use OIDC auth. 
## Summary of changes - ci(fix): Use OIDC auth to upload artifact on s3 - ci(fix): Use OIDC auth to login on ECR --- .github/actions/download/action.yml | 11 +++ .../actions/run-python-test-set/action.yml | 5 ++ .github/actions/save-coverage-data/action.yml | 2 + .github/actions/upload/action.yml | 11 +++ .../workflows/_benchmarking_preparation.yml | 1 + .github/workflows/_build-and-test-locally.yml | 18 +++- .github/workflows/benchmarking.yml | 6 ++ .github/workflows/build_and_test.yml | 87 ++++++++++++++----- .github/workflows/cloud-regress.yml | 8 +- .github/workflows/ingest_benchmark.yml | 1 + .github/workflows/neon_extra_builds.yml | 13 ++- .github/workflows/periodic_pagebench.yml | 8 +- .github/workflows/pg-clients.yml | 10 ++- .github/workflows/pin-build-tools-image.yml | 14 +-- .github/workflows/pre-merge-checks.yml | 1 + control_plane/src/background_process.rs | 1 + test_runner/fixtures/remote_storage.py | 23 ++++- 17 files changed, 180 insertions(+), 40 deletions(-) diff --git a/.github/actions/download/action.yml b/.github/actions/download/action.yml index 01c216b1ac..d6b1fac9f7 100644 --- a/.github/actions/download/action.yml +++ b/.github/actions/download/action.yml @@ -15,10 +15,21 @@ inputs: prefix: description: "S3 prefix. Default is '${GITHUB_RUN_ID}/${GITHUB_RUN_ATTEMPT}'" required: false + aws_oicd_role_arn: + description: "the OIDC role arn for aws auth" + required: false + default: "" runs: using: "composite" steps: + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + role-to-assume: ${{ inputs.aws_oicd_role_arn }} + role-duration-seconds: 3600 + - name: Download artifact id: download-artifact shell: bash -euxo pipefail {0} diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 1159627302..dd5c890f5b 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -62,6 +62,7 @@ runs: with: name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}-artifact path: /tmp/neon + aws_oicd_role_arn: ${{ inputs.aws_oicd_role_arn }} - name: Download Neon binaries for the previous release if: inputs.build_type != 'remote' @@ -70,6 +71,7 @@ runs: name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}-artifact path: /tmp/neon-previous prefix: latest + aws_oicd_role_arn: ${{ inputs.aws_oicd_role_arn }} - name: Download compatibility snapshot if: inputs.build_type != 'remote' @@ -81,6 +83,7 @@ runs: # The lack of compatibility snapshot (for example, for the new Postgres version) # shouldn't fail the whole job. Only relevant test should fail. 
skip-if-does-not-exist: true + aws_oicd_role_arn: ${{ inputs.aws_oicd_role_arn }} - name: Checkout if: inputs.needs_postgres_source == 'true' @@ -218,6 +221,7 @@ runs: # The lack of compatibility snapshot shouldn't fail the job # (for example if we didn't run the test for non build-and-test workflow) skip-if-does-not-exist: true + aws_oicd_role_arn: ${{ inputs.aws_oicd_role_arn }} - name: (Re-)configure AWS credentials # necessary to upload reports to S3 after a long-running test if: ${{ !cancelled() && (inputs.aws_oicd_role_arn != '') }} @@ -232,3 +236,4 @@ runs: with: report-dir: /tmp/test_output/allure/results unique-key: ${{ inputs.build_type }}-${{ inputs.pg_version }} + aws_oicd_role_arn: ${{ inputs.aws_oicd_role_arn }} diff --git a/.github/actions/save-coverage-data/action.yml b/.github/actions/save-coverage-data/action.yml index 6fbe19a96e..9e3a7cba24 100644 --- a/.github/actions/save-coverage-data/action.yml +++ b/.github/actions/save-coverage-data/action.yml @@ -14,9 +14,11 @@ runs: name: coverage-data-artifact path: /tmp/coverage skip-if-does-not-exist: true # skip if there's no previous coverage to download + aws_oicd_role_arn: ${{ inputs.aws_oicd_role_arn }} - name: Upload coverage data uses: ./.github/actions/upload with: name: coverage-data-artifact path: /tmp/coverage + aws_oicd_role_arn: ${{ inputs.aws_oicd_role_arn }} diff --git a/.github/actions/upload/action.yml b/.github/actions/upload/action.yml index 8a4cfe2eff..6616d08899 100644 --- a/.github/actions/upload/action.yml +++ b/.github/actions/upload/action.yml @@ -14,6 +14,10 @@ inputs: prefix: description: "S3 prefix. Default is '${GITHUB_SHA}/${GITHUB_RUN_ID}/${GITHUB_RUN_ATTEMPT}'" required: false + aws_oicd_role_arn: + description: "the OIDC role arn for aws auth" + required: false + default: "" runs: using: "composite" @@ -53,6 +57,13 @@ runs: echo 'SKIPPED=false' >> $GITHUB_OUTPUT + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + role-to-assume: ${{ inputs.aws_oicd_role_arn }} + role-duration-seconds: 3600 + - name: Upload artifact if: ${{ steps.prepare-artifact.outputs.SKIPPED == 'false' }} shell: bash -euxo pipefail {0} diff --git a/.github/workflows/_benchmarking_preparation.yml b/.github/workflows/_benchmarking_preparation.yml index 5cdc16f248..371d815fc8 100644 --- a/.github/workflows/_benchmarking_preparation.yml +++ b/.github/workflows/_benchmarking_preparation.yml @@ -70,6 +70,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest + aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} # we create a table that has one row for each database that we want to restore with the status whether the restore is done - name: Create benchmark_restore_status table if it does not exist diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 7d47f78d6b..456399f3c3 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -31,12 +31,13 @@ defaults: env: RUST_BACKTRACE: 1 COPT: '-Werror' - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} jobs: build-neon: runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large')) }} + permissions: + id-token: write # aws-actions/configure-aws-credentials + contents: read container: image: ${{ inputs.build-tools-image }} credentials: @@ -205,6 
+206,13 @@ jobs: done fi + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 18000 # 5 hours + - name: Run rust tests env: NEXTEST_RETRIES: 3 @@ -256,6 +264,7 @@ jobs: with: name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-artifact path: /tmp/neon + aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} # XXX: keep this after the binaries.list is formed, so the coverage can properly work later - name: Merge and upload coverage data @@ -265,6 +274,10 @@ jobs: regress-tests: # Don't run regression tests on debug arm64 builds if: inputs.build-type != 'debug' || inputs.arch != 'arm64' + permissions: + id-token: write # aws-actions/configure-aws-credentials + contents: read + statuses: write needs: [ build-neon ] runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large')) }} container: @@ -295,6 +308,7 @@ jobs: real_s3_region: eu-central-1 rerun_failed: true pg_version: ${{ matrix.pg_version }} + aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} CHECK_ONDISK_DATA_COMPATIBILITY: nonempty diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 7621d72f64..2d37be8837 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -105,6 +105,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest + aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create Neon Project id: create-neon-project @@ -204,6 +205,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest + aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Run Logical Replication benchmarks uses: ./.github/actions/run-python-test-set @@ -405,6 +407,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest + aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create Neon Project if: contains(fromJson('["neonvm-captest-new", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform) @@ -708,6 +711,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest + aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Set up Connection String id: set-up-connstr @@ -818,6 +822,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest + aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Get Connstring Secret Name run: | @@ -926,6 +931,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest + aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Set up Connection String id: set-up-connstr diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 67d59c7da1..62f190a0c2 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -21,8 +21,6 @@ concurrency: env: RUST_BACKTRACE: 1 COPT: '-Werror' - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} # A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix 
E2E_CONCURRENCY_GROUP: ${{ github.repository }}-e2e-tests-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }} @@ -362,6 +360,10 @@ jobs: create-test-report: needs: [ check-permissions, build-and-test-locally, coverage-report, build-build-tools-image, benchmarks ] if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }} + permissions: + id-token: write # aws-actions/configure-aws-credentials + statuses: write + contents: write outputs: report-url: ${{ steps.create-allure-report.outputs.report-url }} @@ -382,6 +384,7 @@ jobs: uses: ./.github/actions/allure-report-generate with: store-test-results-into-db: true + aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} @@ -413,6 +416,10 @@ jobs: coverage-report: if: ${{ !startsWith(github.ref_name, 'release') }} needs: [ check-permissions, build-build-tools-image, build-and-test-locally ] + permissions: + id-token: write # aws-actions/configure-aws-credentials + statuses: write + contents: write runs-on: [ self-hosted, small ] container: image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm @@ -439,12 +446,14 @@ jobs: with: name: neon-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-artifact path: /tmp/neon + aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Get coverage artifact uses: ./.github/actions/download with: name: coverage-data-artifact path: /tmp/coverage + aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Merge coverage data run: scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage merge @@ -575,6 +584,10 @@ jobs: neon-image: needs: [ neon-image-arch, tag ] runs-on: ubuntu-22.04 + permissions: + id-token: write # aws-actions/configure-aws-credentials + statuses: write + contents: read steps: - uses: docker/login-action@v3 @@ -589,11 +602,15 @@ jobs: neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-x64 \ neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-arm64 - - uses: docker/login-action@v3 + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 with: - registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com - username: ${{ secrets.AWS_ACCESS_KEY_DEV }} - password: ${{ secrets.AWS_SECRET_KEY_DEV }} + aws-region: eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 3600 + + - name: Login to Amazon Dev ECR + uses: aws-actions/amazon-ecr-login@v2 - name: Push multi-arch image to ECR run: | @@ -602,6 +619,10 @@ jobs: compute-node-image-arch: needs: [ check-permissions, build-build-tools-image, tag ] + permissions: + id-token: write # aws-actions/configure-aws-credentials + statuses: write + contents: read strategy: fail-fast: false matrix: @@ -642,11 +663,15 @@ jobs: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - - uses: docker/login-action@v3 + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 with: - registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com - username: ${{ secrets.AWS_ACCESS_KEY_DEV }} - password: ${{ secrets.AWS_SECRET_KEY_DEV }} + aws-region: eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 3600 + + - name: Login to Amazon Dev ECR + uses: aws-actions/amazon-ecr-login@v2 - uses: docker/login-action@v3 with: @@ -719,6 +744,10 @@ jobs: compute-node-image: needs: [ 
compute-node-image-arch, tag ] + permissions: + id-token: write # aws-actions/configure-aws-credentials + statuses: write + contents: read runs-on: ubuntu-22.04 strategy: @@ -763,11 +792,15 @@ jobs: neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \ neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64 - - uses: docker/login-action@v3 + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 with: - registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com - username: ${{ secrets.AWS_ACCESS_KEY_DEV }} - password: ${{ secrets.AWS_SECRET_KEY_DEV }} + aws-region: eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 3600 + + - name: Login to Amazon Dev ECR + uses: aws-actions/amazon-ecr-login@v2 - name: Push multi-arch compute-node-${{ matrix.version.pg }} image to ECR run: | @@ -892,7 +925,9 @@ jobs: runs-on: ubuntu-22.04 permissions: - id-token: write # for `aws-actions/configure-aws-credentials` + id-token: write # aws-actions/configure-aws-credentials + statuses: write + contents: read env: VERSIONS: v14 v15 v16 v17 @@ -903,12 +938,15 @@ jobs: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - - name: Login to dev ECR - uses: docker/login-action@v3 + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 with: - registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com - username: ${{ secrets.AWS_ACCESS_KEY_DEV }} - password: ${{ secrets.AWS_SECRET_KEY_DEV }} + aws-region: eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 3600 + + - name: Login to Amazon Dev ECR + uses: aws-actions/amazon-ecr-login@v2 - name: Copy vm-compute-node images to ECR run: | @@ -1062,7 +1100,10 @@ jobs: needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait, push-to-acr-dev, push-to-acr-prod ] # `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `push-to-acr-dev` and `push-to-acr-prod` if: (github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute') && !failure() && !cancelled() - + permissions: + id-token: write # aws-actions/configure-aws-credentials + statuses: write + contents: write runs-on: [ self-hosted, small ] container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest steps: @@ -1184,6 +1225,10 @@ jobs: # The job runs on `release` branch and copies compatibility data and Neon artifact from the last *release PR* to the latest directory promote-compatibility-data: needs: [ deploy ] + permissions: + id-token: write # aws-actions/configure-aws-credentials + statuses: write + contents: read # `!failure() && !cancelled()` is required because the workflow transitively depends on the job that can be skipped: `push-to-acr-dev` and `push-to-acr-prod` if: github.ref_name == 'release' && !failure() && !cancelled() diff --git a/.github/workflows/cloud-regress.yml b/.github/workflows/cloud-regress.yml index 57194090cf..457634ddad 100644 --- a/.github/workflows/cloud-regress.yml +++ b/.github/workflows/cloud-regress.yml @@ -19,14 +19,15 @@ concurrency: group: ${{ github.workflow }} cancel-in-progress: true +permissions: + id-token: write # aws-actions/configure-aws-credentials + jobs: regress: env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install TEST_OUTPUT: 
/tmp/test_output BUILD_TYPE: remote - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} strategy: fail-fast: false matrix: @@ -78,6 +79,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest + aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create a new branch id: create-branch @@ -107,6 +109,8 @@ jobs: id: create-allure-report if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate + with: + aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} diff --git a/.github/workflows/ingest_benchmark.yml b/.github/workflows/ingest_benchmark.yml index a5810e91a4..6773032263 100644 --- a/.github/workflows/ingest_benchmark.yml +++ b/.github/workflows/ingest_benchmark.yml @@ -64,6 +64,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest + aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create Neon Project if: ${{ matrix.target_project == 'new_empty_project' }} diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 092831adb9..1f85c2e102 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -143,6 +143,10 @@ jobs: gather-rust-build-stats: needs: [ check-permissions, build-build-tools-image ] + permissions: + id-token: write # aws-actions/configure-aws-credentials + statuses: write + contents: write if: | contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') || contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || @@ -177,13 +181,18 @@ jobs: - name: Produce the build stats run: PQ_LIB_DIR=$(pwd)/pg_install/v17/lib cargo build --all --release --timings -j$(nproc) + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 3600 + - name: Upload the build stats id: upload-stats env: BUCKET: neon-github-public-dev SHA: ${{ github.event.pull_request.head.sha || github.sha }} - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} run: | REPORT_URL=https://${BUCKET}.s3.amazonaws.com/build-stats/${SHA}/${GITHUB_RUN_ID}/cargo-timing.html aws s3 cp --only-show-errors ./target/cargo-timings/cargo-timing.html "s3://${BUCKET}/build-stats/${SHA}/${GITHUB_RUN_ID}/" diff --git a/.github/workflows/periodic_pagebench.yml b/.github/workflows/periodic_pagebench.yml index 6b98bc873f..a04ceb4a24 100644 --- a/.github/workflows/periodic_pagebench.yml +++ b/.github/workflows/periodic_pagebench.yml @@ -21,6 +21,9 @@ defaults: run: shell: bash -euo pipefail {0} +permissions: + id-token: write # aws-actions/configure-aws-credentials + concurrency: group: ${{ github.workflow }} cancel-in-progress: false @@ -124,11 +127,10 @@ jobs: cat "test_log_${GITHUB_RUN_ID}" - name: Create Allure report - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate + with: + aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} diff --git a/.github/workflows/pg-clients.yml b/.github/workflows/pg-clients.yml index 4f5495cbe2..5c999d3810 
100644 --- a/.github/workflows/pg-clients.yml +++ b/.github/workflows/pg-clients.yml @@ -25,11 +25,13 @@ defaults: run: shell: bash -euxo pipefail {0} +permissions: + id-token: write # aws-actions/configure-aws-credentials + statuses: write # require for posting a status update + env: DEFAULT_PG_VERSION: 16 PLATFORM: neon-captest-new - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} AWS_DEFAULT_REGION: eu-central-1 jobs: @@ -94,6 +96,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest + aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create Neon Project id: create-neon-project @@ -126,6 +129,7 @@ jobs: uses: ./.github/actions/allure-report-generate with: store-test-results-into-db: true + aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} @@ -159,6 +163,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest + aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create Neon Project id: create-neon-project @@ -191,6 +196,7 @@ jobs: uses: ./.github/actions/allure-report-generate with: store-test-results-into-db: true + aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} diff --git a/.github/workflows/pin-build-tools-image.yml b/.github/workflows/pin-build-tools-image.yml index 5b43d97de6..626de2b0e0 100644 --- a/.github/workflows/pin-build-tools-image.yml +++ b/.github/workflows/pin-build-tools-image.yml @@ -67,7 +67,7 @@ jobs: runs-on: ubuntu-22.04 permissions: - id-token: write # for `azure/login` + id-token: write # for `azure/login` and aws auth steps: - uses: docker/login-action@v3 @@ -75,11 +75,15 @@ jobs: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - - uses: docker/login-action@v3 + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 with: - registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com - username: ${{ secrets.AWS_ACCESS_KEY_DEV }} - password: ${{ secrets.AWS_SECRET_KEY_DEV }} + aws-region: eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 3600 + + - name: Login to Amazon Dev ECR + uses: aws-actions/amazon-ecr-login@v2 - name: Azure login uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1 diff --git a/.github/workflows/pre-merge-checks.yml b/.github/workflows/pre-merge-checks.yml index d2f9d8a666..b2e00d94f7 100644 --- a/.github/workflows/pre-merge-checks.yml +++ b/.github/workflows/pre-merge-checks.yml @@ -63,6 +63,7 @@ jobs: if: always() permissions: statuses: write # for `github.repos.createCommitStatus(...)` + contents: write needs: - get-changed-files - check-codestyle-python diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs index 94a072e394..af312d73a7 100644 --- a/control_plane/src/background_process.rs +++ b/control_plane/src/background_process.rs @@ -274,6 +274,7 @@ fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command { for env_key in [ "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", + "AWS_SESSION_TOKEN", "AWS_PROFILE", // HOME is needed in combination with `AWS_PROFILE` to pick up the SSO sessions. 
"HOME", diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py index 4e1e8a884f..d969971a35 100644 --- a/test_runner/fixtures/remote_storage.py +++ b/test_runner/fixtures/remote_storage.py @@ -70,6 +70,9 @@ class MockS3Server: def secret_key(self) -> str: return "test" + def session_token(self) -> str: + return "test" + def kill(self): self.server.stop() @@ -161,6 +164,7 @@ class S3Storage: bucket_region: str access_key: str | None secret_key: str | None + session_token: str | None aws_profile: str | None prefix_in_bucket: str client: S3Client @@ -181,13 +185,18 @@ class S3Storage: if home is not None: env["HOME"] = home return env - if self.access_key is not None and self.secret_key is not None: + if ( + self.access_key is not None + and self.secret_key is not None + and self.session_token is not None + ): return { "AWS_ACCESS_KEY_ID": self.access_key, "AWS_SECRET_ACCESS_KEY": self.secret_key, + "AWS_SESSION_TOKEN": self.session_token, } raise RuntimeError( - "Either AWS_PROFILE or (AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY) have to be set for S3Storage" + "Either AWS_PROFILE or (AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY and AWS_SESSION_TOKEN) have to be set for S3Storage" ) def to_string(self) -> str: @@ -352,6 +361,7 @@ class RemoteStorageKind(StrEnum): mock_region = mock_s3_server.region() access_key, secret_key = mock_s3_server.access_key(), mock_s3_server.secret_key() + session_token = mock_s3_server.session_token() client = boto3.client( "s3", @@ -359,6 +369,7 @@ class RemoteStorageKind(StrEnum): region_name=mock_region, aws_access_key_id=access_key, aws_secret_access_key=secret_key, + aws_session_token=session_token, ) bucket_name = to_bucket_name(user, test_name) @@ -372,6 +383,7 @@ class RemoteStorageKind(StrEnum): bucket_region=mock_region, access_key=access_key, secret_key=secret_key, + session_token=session_token, aws_profile=None, prefix_in_bucket="", client=client, @@ -383,9 +395,10 @@ class RemoteStorageKind(StrEnum): env_access_key = os.getenv("AWS_ACCESS_KEY_ID") env_secret_key = os.getenv("AWS_SECRET_ACCESS_KEY") + env_access_token = os.getenv("AWS_SESSION_TOKEN") env_profile = os.getenv("AWS_PROFILE") assert ( - env_access_key and env_secret_key + env_access_key and env_secret_key and env_access_token ) or env_profile, "need to specify either access key and secret access key or profile" bucket_name = bucket_name or os.getenv("REMOTE_STORAGE_S3_BUCKET") @@ -398,6 +411,9 @@ class RemoteStorageKind(StrEnum): client = boto3.client( "s3", region_name=bucket_region, + aws_access_key_id=env_access_key, + aws_secret_access_key=env_secret_key, + aws_session_token=env_access_token, ) return S3Storage( @@ -405,6 +421,7 @@ class RemoteStorageKind(StrEnum): bucket_region=bucket_region, access_key=env_access_key, secret_key=env_secret_key, + session_token=env_access_token, aws_profile=env_profile, prefix_in_bucket=prefix_in_bucket, client=client, From 2f3433876fdce12093ee0c706caf2e04b4de2f51 Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Thu, 12 Dec 2024 17:34:07 +0100 Subject: [PATCH 025/583] Change the channel for notification. (#10112) ## Problem Now notifications about failures in `pg_regress` tests run on the staging cloud instance, reach the channel `on-call-staging-stream`, while they should reach `on-call-qa-staging-stream` ## Summary of changes The channel changed. 
--- .github/actionlint.yml | 1 + .github/workflows/cloud-regress.yml | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/actionlint.yml b/.github/actionlint.yml index 27c8fb3c23..9d8389faa5 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -23,3 +23,4 @@ config-variables: - BENCHMARK_INGEST_TARGET_PROJECTID - PGREGRESS_PG16_PROJECT_ID - PGREGRESS_PG17_PROJECT_ID + - SLACK_ON_CALL_QA_STAGING_STREAM diff --git a/.github/workflows/cloud-regress.yml b/.github/workflows/cloud-regress.yml index 457634ddad..2fc26baa21 100644 --- a/.github/workflows/cloud-regress.yml +++ b/.github/workflows/cloud-regress.yml @@ -116,7 +116,7 @@ jobs: if: ${{ github.event.schedule && failure() }} uses: slackapi/slack-github-action@v1 with: - channel-id: "C033QLM5P7D" # on-call-staging-stream + channel-id: ${{ vars.SLACK_ON_CALL_QA_STAGING_STREAM }} slack-message: | Periodic pg_regress on staging: ${{ job.status }} <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> From 53721266f1cf6f835541a8b64bbd0b46c60761e2 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 12 Dec 2024 19:05:58 +0200 Subject: [PATCH 026/583] Disable connection logging in pgbouncer by default (#10118) It can produce a lot of logs, making pgbouncer itself consume all CPU in extreme cases. We saw that happen in stress testing. --- compute/etc/pgbouncer.ini | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/compute/etc/pgbouncer.ini b/compute/etc/pgbouncer.ini index abcd165636..604b4e41ea 100644 --- a/compute/etc/pgbouncer.ini +++ b/compute/etc/pgbouncer.ini @@ -19,3 +19,10 @@ max_prepared_statements=0 admin_users=postgres unix_socket_dir=/tmp/ unix_socket_mode=0777 + +;; Disable connection logging. It produces a lot of logs that no one looks at, +;; and we can get similar log entries from the proxy too. We had incidents in +;; the past where the logging significantly stressed the log device or pgbouncer +;; itself. +log_connections=0 +log_disconnections=0 From 6d5687521b6eede3166884c0247405ebd59c6f8a Mon Sep 17 00:00:00 2001 From: Rahul Patil Date: Thu, 12 Dec 2024 19:53:35 +0100 Subject: [PATCH 027/583] fix(ci): Allow github-script to post test reports (#10120) Allow github-script to post test reports --- .github/workflows/build_and_test.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 62f190a0c2..4bfb5077c6 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -364,6 +364,7 @@ jobs: id-token: write # aws-actions/configure-aws-credentials statuses: write contents: write + pull-requests: write outputs: report-url: ${{ steps.create-allure-report.outputs.report-url }} From a93e3d31cc5152c279480ded76ea886867fbfea3 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 12 Dec 2024 19:35:38 +0000 Subject: [PATCH 028/583] storcon: refine logic for choosing AZ on tenant creation (#10054) ## Problem When we update our scheduler/optimization code to respect AZs properly (https://github.com/neondatabase/neon/pull/9916), the choice of AZ becomes a much higher-stakes decision. 
We will pretty much always run a tenant in its preferred AZ, and that AZ is fixed for the lifetime of the tenant (unless a human intervenes) Eventually, when we do auto-balancing based on utilization, I anticipate that part of that will be to automatically change the AZ of tenants if our original scheduling decisions have caused imbalance, but as an interim measure, we can at least avoid making this scheduling decision based purely on which AZ contains the emptiest node. This is a precursor to https://github.com/neondatabase/neon/pull/9947 ## Summary of changes - When creating a tenant, instead of scheduling a shard and then reading its preferred AZ back, make the AZ decision first. - Instead of choosing AZ based on which node is emptiest, use the median utilization of nodes in each AZ to pick the AZ to use. This avoids bad AZ decisions during periods when some node has very low utilization (such as after replacing a dead node) I considered also making the selection a weighted pseudo-random choice based on utilization, but wanted to avoid destabilising tests with that for now. --- libs/pageserver_api/src/controller_api.rs | 2 +- storage_controller/src/scheduler.rs | 93 +++++++++++++++++++++++ storage_controller/src/service.rs | 62 +++++---------- storage_controller/src/tenant_shard.rs | 15 ++-- 4 files changed, 118 insertions(+), 54 deletions(-) diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index 6839ef69f5..ec7b81423a 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -75,7 +75,7 @@ pub struct TenantPolicyRequest { pub scheduling: Option, } -#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Hash, Debug)] +#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Hash, Debug, PartialOrd, Ord)] pub struct AvailabilityZone(pub String); impl Display for AvailabilityZone { diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs index ecc6b11e47..51a4cf35be 100644 --- a/storage_controller/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -742,6 +742,50 @@ impl Scheduler { self.schedule_shard::(&[], &None, &ScheduleContext::default()) } + /// For choosing which AZ to schedule a new shard into, use this. It will return the + /// AZ with the lowest median utilization. + /// + /// We use an AZ-wide measure rather than simply selecting the AZ of the least-loaded + /// node, because while tenants start out single sharded, when they grow and undergo + /// shard-split, they will occupy space on many nodes within an AZ. + /// + /// We use median rather than total free space or mean utilization, because + /// we wish to avoid preferring AZs that have low-load nodes resulting from + /// recent replacements. + /// + /// The practical result is that we will pick an AZ based on its median node, and + /// then actually _schedule_ the new shard onto the lowest-loaded node in that AZ. + pub(crate) fn get_az_for_new_tenant(&self) -> Option { + if self.nodes.is_empty() { + return None; + } + + let mut scores_by_az = HashMap::new(); + for (node_id, node) in &self.nodes { + let az_scores = scores_by_az.entry(&node.az).or_insert_with(Vec::new); + let score = match &node.may_schedule { + MaySchedule::Yes(utilization) => utilization.score(), + MaySchedule::No => PageserverUtilization::full().score(), + }; + az_scores.push((node_id, node, score)); + } + + // Sort by utilization. Also include the node ID to break ties. 
+ for scores in scores_by_az.values_mut() { + scores.sort_by_key(|i| (i.2, i.0)); + } + + let mut median_by_az = scores_by_az + .iter() + .map(|(az, nodes)| (*az, nodes.get(nodes.len() / 2).unwrap().2)) + .collect::>(); + // Sort by utilization. Also include the AZ to break ties. + median_by_az.sort_by_key(|i| (i.1, i.0)); + + // Return the AZ with the lowest median utilization + Some(median_by_az.first().unwrap().0.clone()) + } + /// Unit test access to internal state #[cfg(test)] pub(crate) fn get_node_shard_count(&self, node_id: NodeId) -> usize { @@ -1087,4 +1131,53 @@ mod tests { intent.clear(&mut scheduler); } } + + #[test] + fn az_scheduling_for_new_tenant() { + let az_a_tag = AvailabilityZone("az-a".to_string()); + let az_b_tag = AvailabilityZone("az-b".to_string()); + let nodes = test_utils::make_test_nodes( + 6, + &[ + az_a_tag.clone(), + az_a_tag.clone(), + az_a_tag.clone(), + az_b_tag.clone(), + az_b_tag.clone(), + az_b_tag.clone(), + ], + ); + + let mut scheduler = Scheduler::new(nodes.values()); + + /// Force the utilization of a node in Scheduler's state to a particular + /// number of bytes used. + fn set_utilization(scheduler: &mut Scheduler, node_id: NodeId, shard_count: u32) { + let mut node = Node::new( + node_id, + "".to_string(), + 0, + "".to_string(), + 0, + scheduler.nodes.get(&node_id).unwrap().az.clone(), + ); + node.set_availability(NodeAvailability::Active(test_utilization::simple( + shard_count, + 0, + ))); + scheduler.node_upsert(&node); + } + + // Initial empty state. Scores are tied, scheduler prefers lower AZ ID. + assert_eq!(scheduler.get_az_for_new_tenant(), Some(az_a_tag.clone())); + + // Put some utilization on one node in AZ A: this should change nothing, as the median hasn't changed + set_utilization(&mut scheduler, NodeId(1), 1000000); + assert_eq!(scheduler.get_az_for_new_tenant(), Some(az_a_tag.clone())); + + // Put some utilization on a second node in AZ A: now the median has changed, so the scheduler + // should prefer the other AZ. + set_utilization(&mut scheduler, NodeId(2), 1000000); + assert_eq!(scheduler.get_az_for_new_tenant(), Some(az_b_tag.clone())); + } } diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 894b67fdc6..746177c089 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -1582,6 +1582,7 @@ impl Service { attach_req.tenant_shard_id, ShardIdentity::unsharded(), PlacementPolicy::Attached(0), + None, ), ); tracing::info!("Inserted shard {} in memory", attach_req.tenant_shard_id); @@ -2109,6 +2110,16 @@ impl Service { ) }; + let preferred_az_id = { + let locked = self.inner.read().unwrap(); + // Idempotency: take the existing value if the tenant already exists + if let Some(shard) = locked.tenants.get(create_ids.first().unwrap()) { + shard.preferred_az().cloned() + } else { + locked.scheduler.get_az_for_new_tenant() + } + }; + // Ordering: we persist tenant shards before creating them on the pageserver. This enables a caller // to clean up after themselves by issuing a tenant deletion if something goes wrong and we restart // during the creation, rather than risking leaving orphan objects in S3. 
@@ -2128,7 +2139,7 @@ impl Service { splitting: SplitState::default(), scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default()) .unwrap(), - preferred_az_id: None, + preferred_az_id: preferred_az_id.as_ref().map(|az| az.to_string()), }) .collect(); @@ -2164,6 +2175,7 @@ impl Service { &create_req.shard_parameters, create_req.config.clone(), placement_policy.clone(), + preferred_az_id.as_ref(), &mut schedule_context, ) .await; @@ -2177,44 +2189,6 @@ impl Service { } } - let preferred_azs = { - let locked = self.inner.read().unwrap(); - response_shards - .iter() - .filter_map(|resp| { - let az_id = locked - .nodes - .get(&resp.node_id) - .map(|n| n.get_availability_zone_id().clone())?; - - Some((resp.shard_id, az_id)) - }) - .collect::>() - }; - - // Note that we persist the preferred AZ for the new shards separately. - // In theory, we could "peek" the scheduler to determine where the shard will - // land, but the subsequent "real" call into the scheduler might select a different - // node. Hence, we do this awkward update to keep things consistent. - let updated = self - .persistence - .set_tenant_shard_preferred_azs(preferred_azs) - .await - .map_err(|err| { - ApiError::InternalServerError(anyhow::anyhow!( - "Failed to persist preferred az ids: {err}" - )) - })?; - - { - let mut locked = self.inner.write().unwrap(); - for (tid, az_id) in updated { - if let Some(shard) = locked.tenants.get_mut(&tid) { - shard.set_preferred_az(az_id); - } - } - } - // If we failed to schedule shards, then they are still created in the controller, // but we return an error to the requester to avoid a silent failure when someone // tries to e.g. create a tenant whose placement policy requires more nodes than @@ -2245,6 +2219,7 @@ impl Service { /// Helper for tenant creation that does the scheduling for an individual shard. Covers both the /// case of a new tenant and a pre-existing one. + #[allow(clippy::too_many_arguments)] async fn do_initial_shard_scheduling( &self, tenant_shard_id: TenantShardId, @@ -2252,6 +2227,7 @@ impl Service { shard_params: &ShardParameters, config: TenantConfig, placement_policy: PlacementPolicy, + preferred_az_id: Option<&AvailabilityZone>, schedule_context: &mut ScheduleContext, ) -> InitialShardScheduleOutcome { let mut locked = self.inner.write().unwrap(); @@ -2262,10 +2238,6 @@ impl Service { Entry::Occupied(mut entry) => { tracing::info!("Tenant shard {tenant_shard_id} already exists while creating"); - // TODO: schedule() should take an anti-affinity expression that pushes - // attached and secondary locations (independently) away frorm those - // pageservers also holding a shard for this tenant. 
- if let Err(err) = entry.get_mut().schedule(scheduler, schedule_context) { return InitialShardScheduleOutcome::ShardScheduleError(err); } @@ -2289,6 +2261,7 @@ impl Service { tenant_shard_id, ShardIdentity::from_params(tenant_shard_id.shard_number, shard_params), placement_policy, + preferred_az_id.cloned(), )); state.generation = initial_generation; @@ -4256,7 +4229,8 @@ impl Service { }, ); - let mut child_state = TenantShard::new(child, child_shard, policy.clone()); + let mut child_state = + TenantShard::new(child, child_shard, policy.clone(), preferred_az.clone()); child_state.intent = IntentState::single(scheduler, Some(pageserver)); child_state.observed = ObservedState { locations: child_observed, diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index 2eb98ee825..f1b921646f 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -472,6 +472,7 @@ impl TenantShard { tenant_shard_id: TenantShardId, shard: ShardIdentity, policy: PlacementPolicy, + preferred_az_id: Option, ) -> Self { metrics::METRICS_REGISTRY .metrics_group @@ -495,7 +496,7 @@ impl TenantShard { last_error: Arc::default(), pending_compute_notification: false, scheduling_policy: ShardSchedulingPolicy::default(), - preferred_az_id: None, + preferred_az_id, } } @@ -1571,6 +1572,7 @@ pub(crate) mod tests { ) .unwrap(), policy, + None, ) } @@ -1597,7 +1599,7 @@ pub(crate) mod tests { shard_number, shard_count, }; - let mut ts = TenantShard::new( + TenantShard::new( tenant_shard_id, ShardIdentity::new( shard_number, @@ -1606,13 +1608,8 @@ pub(crate) mod tests { ) .unwrap(), policy.clone(), - ); - - if let Some(az) = &preferred_az { - ts.set_preferred_az(az.clone()); - } - - ts + preferred_az.clone(), + ) }) .collect() } From 5ff4b991c74141505927a2498310b20c617a8786 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Thu, 12 Dec 2024 15:23:24 -0500 Subject: [PATCH 029/583] feat(pageserver): gc-compaction split over LSN (#9900) ## Problem part of https://github.com/neondatabase/neon/issues/9114, stacked PR over https://github.com/neondatabase/neon/pull/9897, partially refactored to help with https://github.com/neondatabase/neon/issues/10031 ## Summary of changes * gc-compaction takes `above_lsn` parameter. We only compact the layers above this LSN, and all data below the LSN are treated as if they are on the ancestor branch. * refactored gc-compaction to take `GcCompactJob` that describes the rectangular range to be compacted. * Added unit test for this case. 
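For readers following the series, here is a minimal sketch of how the rectangle described by `GcCompactJob` is driven through `CompactOptions` (it mirrors the unit tests added in the diff below; `tline`, `cancel`, `ctx` and `get_key` are the fixtures used by those tests, not new API):

    // Restrict gc-compaction to keys in [get_key(0), get_key(2)) and to layers at or
    // above LSN 0x28; the exact layer set (and therefore the true min/max LSN of the
    // job) is only determined once compact_with_gc builds the GcCompactionJobDescription.
    tline
        .compact_with_gc(
            &cancel,
            CompactOptions {
                compact_key_range: Some((get_key(0)..get_key(2)).into()),
                compact_lsn_range: Some(CompactLsnRange::above(Lsn(0x28))),
                ..Default::default()
            },
            &ctx,
        )
        .await
        .unwrap();

Data below the lower LSN bound is served through the normal read path rather than being k-merged, which is what makes the "above_lsn" form of the job safe on branches.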
--------- Signed-off-by: Alex Chi Z Co-authored-by: Christian Schwarz --- pageserver/src/http/routes.rs | 13 +- pageserver/src/tenant.rs | 661 ++++++++++++++++++- pageserver/src/tenant/timeline.rs | 71 +- pageserver/src/tenant/timeline/compaction.rs | 233 ++++--- test_runner/regress/test_compaction.py | 4 +- 5 files changed, 879 insertions(+), 103 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 6e9ee976f4..db7d293856 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -2081,13 +2081,20 @@ async fn timeline_compact_handler( .as_ref() .map(|r| r.sub_compaction) .unwrap_or(false); + let sub_compaction_max_job_size_mb = compact_request + .as_ref() + .and_then(|r| r.sub_compaction_max_job_size_mb); + let options = CompactOptions { - compact_range: compact_request + compact_key_range: compact_request .as_ref() - .and_then(|r| r.compact_range.clone()), - compact_below_lsn: compact_request.as_ref().and_then(|r| r.compact_below_lsn), + .and_then(|r| r.compact_key_range.clone()), + compact_lsn_range: compact_request + .as_ref() + .and_then(|r| r.compact_lsn_range.clone()), flags, sub_compaction, + sub_compaction_max_job_size_mb, }; let scheduled = compact_request diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 92078e4b08..99289d5f15 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -44,6 +44,7 @@ use std::sync::atomic::AtomicBool; use std::sync::Weak; use std::time::SystemTime; use storage_broker::BrokerClientChannel; +use timeline::compaction::GcCompactJob; use timeline::compaction::ScheduledCompactionTask; use timeline::import_pgdata; use timeline::offload::offload_timeline; @@ -3017,8 +3018,15 @@ impl Tenant { warn!("ignoring scheduled compaction task: scheduled task must be gc compaction: {:?}", next_scheduled_compaction_task.options); } else if next_scheduled_compaction_task.options.sub_compaction { info!("running scheduled enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs"); - let jobs = timeline - .gc_compaction_split_jobs(next_scheduled_compaction_task.options) + let jobs: Vec = timeline + .gc_compaction_split_jobs( + GcCompactJob::from_compact_options( + next_scheduled_compaction_task.options.clone(), + ), + next_scheduled_compaction_task + .options + .sub_compaction_max_job_size_mb, + ) .await .map_err(CompactionError::Other)?; if jobs.is_empty() { @@ -3029,9 +3037,23 @@ impl Tenant { let mut guard = self.scheduled_compaction_tasks.lock().unwrap(); let tline_pending_tasks = guard.entry(*timeline_id).or_default(); for (idx, job) in jobs.into_iter().enumerate() { + // Unfortunately we need to convert the `GcCompactJob` back to `CompactionOptions` + // until we do further refactors to allow directly call `compact_with_gc`. 
+ let mut flags: EnumSet = EnumSet::default(); + flags |= CompactFlags::EnhancedGcBottomMostCompaction; + if job.dry_run { + flags |= CompactFlags::DryRun; + } + let options = CompactOptions { + flags, + sub_compaction: false, + compact_key_range: Some(job.compact_key_range.into()), + compact_lsn_range: Some(job.compact_lsn_range.into()), + sub_compaction_max_job_size_mb: None, + }; tline_pending_tasks.push_back(if idx == jobs_len - 1 { ScheduledCompactionTask { - options: job, + options, // The last job in the queue sends the signal and releases the gc guard result_tx: next_scheduled_compaction_task .result_tx @@ -3042,7 +3064,7 @@ impl Tenant { } } else { ScheduledCompactionTask { - options: job, + options, result_tx: None, gc_block: None, } @@ -5742,6 +5764,8 @@ mod tests { #[cfg(feature = "testing")] use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn}; #[cfg(feature = "testing")] + use timeline::CompactLsnRange; + #[cfg(feature = "testing")] use timeline::GcInfo; static TEST_KEY: Lazy = @@ -9333,7 +9357,6 @@ mod tests { &cancel, CompactOptions { flags: dryrun_flags, - compact_range: None, ..Default::default() }, &ctx, @@ -9582,7 +9605,6 @@ mod tests { &cancel, CompactOptions { flags: dryrun_flags, - compact_range: None, ..Default::default() }, &ctx, @@ -9612,6 +9634,8 @@ mod tests { #[cfg(feature = "testing")] #[tokio::test] async fn test_simple_bottom_most_compaction_on_branch() -> anyhow::Result<()> { + use timeline::CompactLsnRange; + let harness = TenantHarness::create("test_simple_bottom_most_compaction_on_branch").await?; let (tenant, ctx) = harness.load().await; @@ -9804,6 +9828,22 @@ mod tests { verify_result().await; + // Piggyback a compaction with above_lsn. Ensure it works correctly when the specified LSN intersects with the layer files. + // Now we already have a single large delta layer, so the compaction min_layer_lsn should be the same as ancestor LSN (0x18). 
+ branch_tline + .compact_with_gc( + &cancel, + CompactOptions { + compact_lsn_range: Some(CompactLsnRange::above(Lsn(0x40))), + ..Default::default() + }, + &ctx, + ) + .await + .unwrap(); + + verify_result().await; + Ok(()) } @@ -10092,7 +10132,7 @@ mod tests { &cancel, CompactOptions { flags: EnumSet::new(), - compact_range: Some((get_key(0)..get_key(2)).into()), + compact_key_range: Some((get_key(0)..get_key(2)).into()), ..Default::default() }, &ctx, @@ -10139,7 +10179,7 @@ mod tests { &cancel, CompactOptions { flags: EnumSet::new(), - compact_range: Some((get_key(2)..get_key(4)).into()), + compact_key_range: Some((get_key(2)..get_key(4)).into()), ..Default::default() }, &ctx, @@ -10191,7 +10231,7 @@ mod tests { &cancel, CompactOptions { flags: EnumSet::new(), - compact_range: Some((get_key(4)..get_key(9)).into()), + compact_key_range: Some((get_key(4)..get_key(9)).into()), ..Default::default() }, &ctx, @@ -10242,7 +10282,7 @@ mod tests { &cancel, CompactOptions { flags: EnumSet::new(), - compact_range: Some((get_key(9)..get_key(10)).into()), + compact_key_range: Some((get_key(9)..get_key(10)).into()), ..Default::default() }, &ctx, @@ -10298,7 +10338,7 @@ mod tests { &cancel, CompactOptions { flags: EnumSet::new(), - compact_range: Some((get_key(0)..get_key(10)).into()), + compact_key_range: Some((get_key(0)..get_key(10)).into()), ..Default::default() }, &ctx, @@ -10327,7 +10367,6 @@ mod tests { }, ], ); - Ok(()) } @@ -10380,4 +10419,602 @@ mod tests { Ok(()) } + + #[cfg(feature = "testing")] + #[tokio::test] + async fn test_simple_bottom_most_compaction_above_lsn() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_simple_bottom_most_compaction_above_lsn").await?; + let (tenant, ctx) = harness.load().await; + + fn get_key(id: u32) -> Key { + // using aux key here b/c they are guaranteed to be inside `collect_keyspace`. 
+ let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + + let img_layer = (0..10) + .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10")))) + .collect_vec(); + + let delta1 = vec![( + get_key(1), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append("@0x20")), + )]; + let delta4 = vec![( + get_key(1), + Lsn(0x28), + Value::WalRecord(NeonWalRecord::wal_append("@0x28")), + )]; + let delta2 = vec![ + ( + get_key(1), + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append("@0x30")), + ), + ( + get_key(1), + Lsn(0x38), + Value::WalRecord(NeonWalRecord::wal_append("@0x38")), + ), + ]; + let delta3 = vec![ + ( + get_key(8), + Lsn(0x48), + Value::WalRecord(NeonWalRecord::wal_append("@0x48")), + ), + ( + get_key(9), + Lsn(0x48), + Value::WalRecord(NeonWalRecord::wal_append("@0x48")), + ), + ]; + + let tline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + vec![ + // delta1/2/4 only contain a single key but multiple updates + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x28), delta1), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x30)..Lsn(0x50), delta2), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x28)..Lsn(0x30), delta4), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x30)..Lsn(0x50), delta3), + ], // delta layers + vec![(Lsn(0x10), img_layer)], // image layers + Lsn(0x50), + ) + .await?; + { + tline + .latest_gc_cutoff_lsn + .lock_for_write() + .store_and_unlock(Lsn(0x30)) + .wait() + .await; + // Update GC info + let mut guard = tline.gc_info.write().unwrap(); + *guard = GcInfo { + retain_lsns: vec![ + (Lsn(0x10), tline.timeline_id, MaybeOffloaded::No), + (Lsn(0x20), tline.timeline_id, MaybeOffloaded::No), + ], + cutoffs: GcCutoffs { + time: Lsn(0x30), + space: Lsn(0x30), + }, + leases: Default::default(), + within_ancestor_pitr: false, + }; + } + + let expected_result = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x10@0x20@0x28@0x30@0x38"), + Bytes::from_static(b"value 2@0x10"), + Bytes::from_static(b"value 3@0x10"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x10"), + Bytes::from_static(b"value 6@0x10"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10@0x48"), + Bytes::from_static(b"value 9@0x10@0x48"), + ]; + + let expected_result_at_gc_horizon = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x10@0x20@0x28@0x30"), + Bytes::from_static(b"value 2@0x10"), + Bytes::from_static(b"value 3@0x10"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x10"), + Bytes::from_static(b"value 6@0x10"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10"), + Bytes::from_static(b"value 9@0x10"), + ]; + + let expected_result_at_lsn_20 = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x10@0x20"), + Bytes::from_static(b"value 2@0x10"), + Bytes::from_static(b"value 3@0x10"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x10"), + Bytes::from_static(b"value 6@0x10"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10"), + Bytes::from_static(b"value 9@0x10"), + ]; + + let expected_result_at_lsn_10 = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x10"), + Bytes::from_static(b"value 2@0x10"), + Bytes::from_static(b"value 3@0x10"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 
5@0x10"), + Bytes::from_static(b"value 6@0x10"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10"), + Bytes::from_static(b"value 9@0x10"), + ]; + + let verify_result = || async { + let gc_horizon = { + let gc_info = tline.gc_info.read().unwrap(); + gc_info.cutoffs.time + }; + for idx in 0..10 { + assert_eq!( + tline + .get(get_key(idx as u32), Lsn(0x50), &ctx) + .await + .unwrap(), + &expected_result[idx] + ); + assert_eq!( + tline + .get(get_key(idx as u32), gc_horizon, &ctx) + .await + .unwrap(), + &expected_result_at_gc_horizon[idx] + ); + assert_eq!( + tline + .get(get_key(idx as u32), Lsn(0x20), &ctx) + .await + .unwrap(), + &expected_result_at_lsn_20[idx] + ); + assert_eq!( + tline + .get(get_key(idx as u32), Lsn(0x10), &ctx) + .await + .unwrap(), + &expected_result_at_lsn_10[idx] + ); + } + }; + + verify_result().await; + + let cancel = CancellationToken::new(); + tline + .compact_with_gc( + &cancel, + CompactOptions { + compact_lsn_range: Some(CompactLsnRange::above(Lsn(0x28))), + ..Default::default() + }, + &ctx, + ) + .await + .unwrap(); + verify_result().await; + + let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await; + check_layer_map_key_eq( + all_layers, + vec![ + // The original image layer, not compacted + PersistentLayerKey { + key_range: get_key(0)..get_key(10), + lsn_range: Lsn(0x10)..Lsn(0x11), + is_delta: false, + }, + // Delta layer below the specified above_lsn not compacted + PersistentLayerKey { + key_range: get_key(1)..get_key(2), + lsn_range: Lsn(0x20)..Lsn(0x28), + is_delta: true, + }, + // Delta layer compacted above the LSN + PersistentLayerKey { + key_range: get_key(1)..get_key(10), + lsn_range: Lsn(0x28)..Lsn(0x50), + is_delta: true, + }, + ], + ); + + // compact again + tline + .compact_with_gc(&cancel, CompactOptions::default(), &ctx) + .await + .unwrap(); + verify_result().await; + + let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await; + check_layer_map_key_eq( + all_layers, + vec![ + // The compacted image layer (full key range) + PersistentLayerKey { + key_range: Key::MIN..Key::MAX, + lsn_range: Lsn(0x10)..Lsn(0x11), + is_delta: false, + }, + // All other data in the delta layer + PersistentLayerKey { + key_range: get_key(1)..get_key(10), + lsn_range: Lsn(0x10)..Lsn(0x50), + is_delta: true, + }, + ], + ); + + Ok(()) + } + + #[cfg(feature = "testing")] + #[tokio::test] + async fn test_simple_bottom_most_compaction_rectangle() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_simple_bottom_most_compaction_rectangle").await?; + let (tenant, ctx) = harness.load().await; + + fn get_key(id: u32) -> Key { + // using aux key here b/c they are guaranteed to be inside `collect_keyspace`. 
+ let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + + let img_layer = (0..10) + .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10")))) + .collect_vec(); + + let delta1 = vec![( + get_key(1), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append("@0x20")), + )]; + let delta4 = vec![( + get_key(1), + Lsn(0x28), + Value::WalRecord(NeonWalRecord::wal_append("@0x28")), + )]; + let delta2 = vec![ + ( + get_key(1), + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append("@0x30")), + ), + ( + get_key(1), + Lsn(0x38), + Value::WalRecord(NeonWalRecord::wal_append("@0x38")), + ), + ]; + let delta3 = vec![ + ( + get_key(8), + Lsn(0x48), + Value::WalRecord(NeonWalRecord::wal_append("@0x48")), + ), + ( + get_key(9), + Lsn(0x48), + Value::WalRecord(NeonWalRecord::wal_append("@0x48")), + ), + ]; + + let tline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + vec![ + // delta1/2/4 only contain a single key but multiple updates + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x28), delta1), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x30)..Lsn(0x50), delta2), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x28)..Lsn(0x30), delta4), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x30)..Lsn(0x50), delta3), + ], // delta layers + vec![(Lsn(0x10), img_layer)], // image layers + Lsn(0x50), + ) + .await?; + { + tline + .latest_gc_cutoff_lsn + .lock_for_write() + .store_and_unlock(Lsn(0x30)) + .wait() + .await; + // Update GC info + let mut guard = tline.gc_info.write().unwrap(); + *guard = GcInfo { + retain_lsns: vec![ + (Lsn(0x10), tline.timeline_id, MaybeOffloaded::No), + (Lsn(0x20), tline.timeline_id, MaybeOffloaded::No), + ], + cutoffs: GcCutoffs { + time: Lsn(0x30), + space: Lsn(0x30), + }, + leases: Default::default(), + within_ancestor_pitr: false, + }; + } + + let expected_result = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x10@0x20@0x28@0x30@0x38"), + Bytes::from_static(b"value 2@0x10"), + Bytes::from_static(b"value 3@0x10"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x10"), + Bytes::from_static(b"value 6@0x10"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10@0x48"), + Bytes::from_static(b"value 9@0x10@0x48"), + ]; + + let expected_result_at_gc_horizon = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x10@0x20@0x28@0x30"), + Bytes::from_static(b"value 2@0x10"), + Bytes::from_static(b"value 3@0x10"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x10"), + Bytes::from_static(b"value 6@0x10"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10"), + Bytes::from_static(b"value 9@0x10"), + ]; + + let expected_result_at_lsn_20 = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x10@0x20"), + Bytes::from_static(b"value 2@0x10"), + Bytes::from_static(b"value 3@0x10"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x10"), + Bytes::from_static(b"value 6@0x10"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10"), + Bytes::from_static(b"value 9@0x10"), + ]; + + let expected_result_at_lsn_10 = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x10"), + Bytes::from_static(b"value 2@0x10"), + Bytes::from_static(b"value 3@0x10"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 
5@0x10"), + Bytes::from_static(b"value 6@0x10"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10"), + Bytes::from_static(b"value 9@0x10"), + ]; + + let verify_result = || async { + let gc_horizon = { + let gc_info = tline.gc_info.read().unwrap(); + gc_info.cutoffs.time + }; + for idx in 0..10 { + assert_eq!( + tline + .get(get_key(idx as u32), Lsn(0x50), &ctx) + .await + .unwrap(), + &expected_result[idx] + ); + assert_eq!( + tline + .get(get_key(idx as u32), gc_horizon, &ctx) + .await + .unwrap(), + &expected_result_at_gc_horizon[idx] + ); + assert_eq!( + tline + .get(get_key(idx as u32), Lsn(0x20), &ctx) + .await + .unwrap(), + &expected_result_at_lsn_20[idx] + ); + assert_eq!( + tline + .get(get_key(idx as u32), Lsn(0x10), &ctx) + .await + .unwrap(), + &expected_result_at_lsn_10[idx] + ); + } + }; + + verify_result().await; + + let cancel = CancellationToken::new(); + + tline + .compact_with_gc( + &cancel, + CompactOptions { + compact_key_range: Some((get_key(0)..get_key(2)).into()), + compact_lsn_range: Some((Lsn(0x20)..Lsn(0x28)).into()), + ..Default::default() + }, + &ctx, + ) + .await + .unwrap(); + verify_result().await; + + let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await; + check_layer_map_key_eq( + all_layers, + vec![ + // The original image layer, not compacted + PersistentLayerKey { + key_range: get_key(0)..get_key(10), + lsn_range: Lsn(0x10)..Lsn(0x11), + is_delta: false, + }, + // According the selection logic, we select all layers with start key <= 0x28, so we would merge the layer 0x20-0x28 and + // the layer 0x28-0x30 into one. + PersistentLayerKey { + key_range: get_key(1)..get_key(2), + lsn_range: Lsn(0x20)..Lsn(0x30), + is_delta: true, + }, + // Above the upper bound and untouched + PersistentLayerKey { + key_range: get_key(1)..get_key(2), + lsn_range: Lsn(0x30)..Lsn(0x50), + is_delta: true, + }, + // This layer is untouched + PersistentLayerKey { + key_range: get_key(8)..get_key(10), + lsn_range: Lsn(0x30)..Lsn(0x50), + is_delta: true, + }, + ], + ); + + tline + .compact_with_gc( + &cancel, + CompactOptions { + compact_key_range: Some((get_key(3)..get_key(8)).into()), + compact_lsn_range: Some((Lsn(0x28)..Lsn(0x40)).into()), + ..Default::default() + }, + &ctx, + ) + .await + .unwrap(); + verify_result().await; + + let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await; + check_layer_map_key_eq( + all_layers, + vec![ + // The original image layer, not compacted + PersistentLayerKey { + key_range: get_key(0)..get_key(10), + lsn_range: Lsn(0x10)..Lsn(0x11), + is_delta: false, + }, + // Not in the compaction key range, uncompacted + PersistentLayerKey { + key_range: get_key(1)..get_key(2), + lsn_range: Lsn(0x20)..Lsn(0x30), + is_delta: true, + }, + // Not in the compaction key range, uncompacted but need rewrite because the delta layer overlaps with the range + PersistentLayerKey { + key_range: get_key(1)..get_key(2), + lsn_range: Lsn(0x30)..Lsn(0x50), + is_delta: true, + }, + // Note that when we specify the LSN upper bound to be 0x40, the compaction algorithm will not try to cut the layer + // horizontally in half. Instead, it will include all LSNs that overlap with 0x40. So the real max_lsn of the compaction + // becomes 0x50. 
+ PersistentLayerKey { + key_range: get_key(8)..get_key(10), + lsn_range: Lsn(0x30)..Lsn(0x50), + is_delta: true, + }, + ], + ); + + // compact again + tline + .compact_with_gc( + &cancel, + CompactOptions { + compact_key_range: Some((get_key(0)..get_key(5)).into()), + compact_lsn_range: Some((Lsn(0x20)..Lsn(0x50)).into()), + ..Default::default() + }, + &ctx, + ) + .await + .unwrap(); + verify_result().await; + + let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await; + check_layer_map_key_eq( + all_layers, + vec![ + // The original image layer, not compacted + PersistentLayerKey { + key_range: get_key(0)..get_key(10), + lsn_range: Lsn(0x10)..Lsn(0x11), + is_delta: false, + }, + // The range gets compacted + PersistentLayerKey { + key_range: get_key(1)..get_key(2), + lsn_range: Lsn(0x20)..Lsn(0x50), + is_delta: true, + }, + // Not touched during this iteration of compaction + PersistentLayerKey { + key_range: get_key(8)..get_key(10), + lsn_range: Lsn(0x30)..Lsn(0x50), + is_delta: true, + }, + ], + ); + + // final full compaction + tline + .compact_with_gc(&cancel, CompactOptions::default(), &ctx) + .await + .unwrap(); + verify_result().await; + + let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await; + check_layer_map_key_eq( + all_layers, + vec![ + // The compacted image layer (full key range) + PersistentLayerKey { + key_range: Key::MIN..Key::MAX, + lsn_range: Lsn(0x10)..Lsn(0x11), + is_delta: false, + }, + // All other data in the delta layer + PersistentLayerKey { + key_range: get_key(1)..get_key(10), + lsn_range: Lsn(0x10)..Lsn(0x50), + is_delta: true, + }, + ], + ); + + Ok(()) + } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 8f1d5f6577..b5c7079226 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -780,46 +780,90 @@ pub(crate) enum CompactFlags { #[serde_with::serde_as] #[derive(Debug, Clone, serde::Deserialize)] pub(crate) struct CompactRequest { - pub compact_range: Option, - pub compact_below_lsn: Option, + pub compact_key_range: Option, + pub compact_lsn_range: Option, /// Whether the compaction job should be scheduled. #[serde(default)] pub scheduled: bool, /// Whether the compaction job should be split across key ranges. #[serde(default)] pub sub_compaction: bool, + /// Max job size for each subcompaction job. 
+ pub sub_compaction_max_job_size_mb: Option, } #[serde_with::serde_as] #[derive(Debug, Clone, serde::Deserialize)] -pub(crate) struct CompactRange { +pub(crate) struct CompactLsnRange { + pub start: Lsn, + pub end: Lsn, +} + +#[serde_with::serde_as] +#[derive(Debug, Clone, serde::Deserialize)] +pub(crate) struct CompactKeyRange { #[serde_as(as = "serde_with::DisplayFromStr")] pub start: Key, #[serde_as(as = "serde_with::DisplayFromStr")] pub end: Key, } -impl From> for CompactRange { - fn from(range: Range) -> Self { - CompactRange { +impl From> for CompactLsnRange { + fn from(range: Range) -> Self { + Self { start: range.start, end: range.end, } } } +impl From> for CompactKeyRange { + fn from(range: Range) -> Self { + Self { + start: range.start, + end: range.end, + } + } +} + +impl From for Range { + fn from(range: CompactLsnRange) -> Self { + range.start..range.end + } +} + +impl From for Range { + fn from(range: CompactKeyRange) -> Self { + range.start..range.end + } +} + +impl CompactLsnRange { + #[cfg(test)] + #[cfg(feature = "testing")] + pub fn above(lsn: Lsn) -> Self { + Self { + start: lsn, + end: Lsn::MAX, + } + } +} + #[derive(Debug, Clone, Default)] pub(crate) struct CompactOptions { pub flags: EnumSet, /// If set, the compaction will only compact the key range specified by this option. - /// This option is only used by GC compaction. - pub compact_range: Option, - /// If set, the compaction will only compact the LSN below this value. - /// This option is only used by GC compaction. - pub compact_below_lsn: Option, + /// This option is only used by GC compaction. For the full explanation, see [`compaction::GcCompactJob`]. + pub compact_key_range: Option, + /// If set, the compaction will only compact the LSN within this value. + /// This option is only used by GC compaction. For the full explanation, see [`compaction::GcCompactJob`]. + pub compact_lsn_range: Option, /// Enable sub-compaction (split compaction job across key ranges). /// This option is only used by GC compaction. pub sub_compaction: bool, + /// Set job size for the GC compaction. + /// This option is only used by GC compaction. + pub sub_compaction_max_job_size_mb: Option, } impl std::fmt::Debug for Timeline { @@ -1641,9 +1685,10 @@ impl Timeline { cancel, CompactOptions { flags, - compact_range: None, - compact_below_lsn: None, + compact_key_range: None, + compact_lsn_range: None, sub_compaction: false, + sub_compaction_max_job_size_mb: None, }, ctx, ) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index fa924d23b0..701247194b 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -10,8 +10,8 @@ use std::sync::Arc; use super::layer_manager::LayerManager; use super::{ - CompactFlags, CompactOptions, CompactRange, CreateImageLayersError, DurationRecorder, - ImageLayerCreationMode, RecordedDuration, Timeline, + CompactFlags, CompactOptions, CreateImageLayersError, DurationRecorder, ImageLayerCreationMode, + RecordedDuration, Timeline, }; use anyhow::{anyhow, bail, Context}; @@ -64,6 +64,9 @@ const COMPACTION_DELTA_THRESHOLD: usize = 5; /// A scheduled compaction task. pub(crate) struct ScheduledCompactionTask { + /// It's unfortunate that we need to store a compact options struct here because the only outer + /// API we can call here is `compact_with_options` which does a few setup calls before starting the + /// actual compaction job... 
We should refactor this to store `GcCompactionJob` in the future. pub options: CompactOptions, /// The channel to send the compaction result. If this is a subcompaction, the last compaction job holds the sender. pub result_tx: Option>, @@ -71,16 +74,57 @@ pub(crate) struct ScheduledCompactionTask { pub gc_block: Option, } +/// A job description for the gc-compaction job. This structure describes the rectangle range that the job will +/// process. The exact layers that need to be compacted/rewritten will be generated when `compact_with_gc` gets +/// called. +#[derive(Debug, Clone)] +pub(crate) struct GcCompactJob { + pub dry_run: bool, + /// The key range to be compacted. The compaction algorithm will only regenerate key-value pairs within this range + /// [left inclusive, right exclusive), and other pairs will be rewritten into new files if necessary. + pub compact_key_range: Range, + /// The LSN range to be compacted. The compaction algorithm will use this range to determine the layers to be + /// selected for the compaction, and it does not guarantee the generated layers will have exactly the same LSN range + /// as specified here. The true range being compacted is `min_lsn/max_lsn` in [`GcCompactionJobDescription`]. + /// min_lsn will always <= the lower bound specified here, and max_lsn will always >= the upper bound specified here. + pub compact_lsn_range: Range, +} + +impl GcCompactJob { + pub fn from_compact_options(options: CompactOptions) -> Self { + GcCompactJob { + dry_run: options.flags.contains(CompactFlags::DryRun), + compact_key_range: options + .compact_key_range + .map(|x| x.into()) + .unwrap_or(Key::MIN..Key::MAX), + compact_lsn_range: options + .compact_lsn_range + .map(|x| x.into()) + .unwrap_or(Lsn::INVALID..Lsn::MAX), + } + } +} + +/// A job description for the gc-compaction job. This structure is generated when `compact_with_gc` is called +/// and contains the exact layers we want to compact. pub struct GcCompactionJobDescription { /// All layers to read in the compaction job selected_layers: Vec, - /// GC cutoff of the job + /// GC cutoff of the job. This is the lowest LSN that will be accessed by the read/GC path and we need to + /// keep all deltas <= this LSN or generate an image == this LSN. gc_cutoff: Lsn, - /// LSNs to retain for the job + /// LSNs to retain for the job. Read path will use this LSN so we need to keep deltas <= this LSN or + /// generate an image == this LSN. retain_lsns_below_horizon: Vec, - /// Maximum layer LSN processed in this compaction + /// Maximum layer LSN processed in this compaction, that is max(end_lsn of layers). Exclusive. All data + /// \>= this LSN will be kept and will not be rewritten. max_layer_lsn: Lsn, - /// Only compact layers overlapping with this range + /// Minimum layer LSN processed in this compaction, that is min(start_lsn of layers). Inclusive. + /// All access below (strict lower than `<`) this LSN will be routed through the normal read path instead of + /// k-merge within gc-compaction. + min_layer_lsn: Lsn, + /// Only compact layers overlapping with this range. compaction_key_range: Range, /// When partial compaction is enabled, these layers need to be rewritten to ensure no overlap. /// This field is here solely for debugging. The field will not be read once the compaction @@ -299,7 +343,7 @@ impl Timeline { ))); } - if options.compact_range.is_some() { + if options.compact_key_range.is_some() || options.compact_lsn_range.is_some() { // maybe useful in the future? 
could implement this at some point return Err(CompactionError::Other(anyhow!( "compaction range is not supported for legacy compaction for now" @@ -1754,25 +1798,26 @@ impl Timeline { Ok(()) } - /// Split a gc-compaction job into multiple compaction jobs. Optimally, this function should return a vector of - /// `GcCompactionJobDesc`. But we want to keep it simple on the tenant scheduling side without exposing too much - /// ad-hoc information about gc compaction itself. + /// Split a gc-compaction job into multiple compaction jobs. The split is based on the key range and the estimated size of the compaction job. + /// The function returns a list of compaction jobs that can be executed separately. If the upper bound of the compact LSN + /// range is not specified, we will use the latest gc_cutoff as the upper bound, so that all jobs in the jobset acts + /// like a full compaction of the specified keyspace. pub(crate) async fn gc_compaction_split_jobs( self: &Arc, - options: CompactOptions, - ) -> anyhow::Result> { - if !options.sub_compaction { - return Ok(vec![options]); - } - let compact_range = options.compact_range.clone().unwrap_or(CompactRange { - start: Key::MIN, - end: Key::MAX, - }); - let compact_below_lsn = if let Some(compact_below_lsn) = options.compact_below_lsn { - compact_below_lsn + job: GcCompactJob, + sub_compaction_max_job_size_mb: Option, + ) -> anyhow::Result> { + let compact_below_lsn = if job.compact_lsn_range.end != Lsn::MAX { + job.compact_lsn_range.end } else { *self.get_latest_gc_cutoff_lsn() // use the real gc cutoff }; + + // Split compaction job to about 4GB each + const GC_COMPACT_MAX_SIZE_MB: u64 = 4 * 1024; + let sub_compaction_max_job_size_mb = + sub_compaction_max_job_size_mb.unwrap_or(GC_COMPACT_MAX_SIZE_MB); + let mut compact_jobs = Vec::new(); // For now, we simply use the key partitioning information; we should do a more fine-grained partitioning // by estimating the amount of files read for a compaction job. We should also partition on LSN. @@ -1808,8 +1853,8 @@ impl Timeline { let Some((start, end)) = truncate_to( &range.start, &range.end, - &compact_range.start, - &compact_range.end, + &job.compact_key_range.start, + &job.compact_key_range.end, ) else { continue; }; @@ -1819,8 +1864,6 @@ impl Timeline { let guard = self.layers.read().await; let layer_map = guard.layer_map()?; let mut current_start = None; - // Split compaction job to about 2GB each - const GC_COMPACT_MAX_SIZE_MB: u64 = 4 * 1024; // 4GB, TODO: should be configuration in the future let ranges_num = split_key_ranges.len(); for (idx, (start, end)) in split_key_ranges.into_iter().enumerate() { if current_start.is_none() { @@ -1833,8 +1876,7 @@ impl Timeline { } let res = layer_map.range_search(start..end, compact_below_lsn); let total_size = res.found.keys().map(|x| x.layer.file_size()).sum::(); - if total_size > GC_COMPACT_MAX_SIZE_MB * 1024 * 1024 || ranges_num == idx + 1 { - let mut compact_options = options.clone(); + if total_size > sub_compaction_max_job_size_mb * 1024 * 1024 || ranges_num == idx + 1 { // Try to extend the compaction range so that we include at least one full layer file. 
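For orientation, the splitting above boils down to a size-budget sweep over the partitioned key ranges: keep extending the current job until the estimated amount of layer data to read crosses the per-job budget, then cut a job and start the next one at that key. The sketch below is illustrative only — `u64` stands in for `Key`, precomputed per-partition size estimates replace the layer-map `range_search`, and the extension of the end key to whole layer files is omitted:

```rust
// Illustrative only: u64 keys, precomputed per-partition sizes, no layer map.
#[derive(Debug)]
struct SubJob {
    key_range: std::ops::Range<u64>,
    estimated_size: u64,
}

/// Walk the partitions in key order and emit a sub-job whenever the
/// accumulated estimate exceeds `max_job_size` (or we run out of partitions).
fn split_jobs(partitions: &[(std::ops::Range<u64>, u64)], max_job_size: u64) -> Vec<SubJob> {
    let mut jobs = Vec::new();
    let mut current_start: Option<u64> = None;
    let mut accumulated: u64 = 0;
    for (idx, (range, size)) in partitions.iter().enumerate() {
        let start = *current_start.get_or_insert(range.start);
        accumulated += size;
        if accumulated > max_job_size || idx + 1 == partitions.len() {
            jobs.push(SubJob {
                key_range: start..range.end,
                estimated_size: accumulated,
            });
            current_start = Some(range.end);
            accumulated = 0;
        }
    }
    jobs
}

fn main() {
    // Four key partitions with estimated sizes (think MB); budget of 5 per job.
    let partitions = vec![(0..10, 3), (10..20, 2), (20..30, 4), (30..40, 1)];
    for job in split_jobs(&partitions, 5) {
        println!("{job:?}");
    }
}
```

In the patch itself the budget is `sub_compaction_max_job_size_mb` (about 4 GB each by default, per the comment above), and the size estimate comes from the layer map rather than a precomputed list.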
let extended_end = res .found @@ -1852,10 +1894,11 @@ impl Timeline { "splitting compaction job: {}..{}, estimated_size={}", start, end, total_size ); - compact_options.compact_range = Some(CompactRange { start, end }); - compact_options.compact_below_lsn = Some(compact_below_lsn); - compact_options.sub_compaction = false; - compact_jobs.push(compact_options); + compact_jobs.push(GcCompactJob { + dry_run: job.dry_run, + compact_key_range: start..end, + compact_lsn_range: job.compact_lsn_range.start..compact_below_lsn, + }); current_start = Some(end); } } @@ -1877,7 +1920,7 @@ impl Timeline { /// Key::MIN..Key..MAX to the function indicates a full compaction, though technically, `Key::MAX` is not /// part of the range. /// - /// If `options.compact_below_lsn` is provided, the compaction will only compact layers below or intersect with + /// If `options.compact_lsn_range.end` is provided, the compaction will only compact layers below or intersect with /// the LSN. Otherwise, it will use the gc cutoff by default. pub(crate) async fn compact_with_gc( self: &Arc, @@ -1885,9 +1928,13 @@ impl Timeline { options: CompactOptions, ctx: &RequestContext, ) -> anyhow::Result<()> { - if options.sub_compaction { + let sub_compaction = options.sub_compaction; + let job = GcCompactJob::from_compact_options(options.clone()); + if sub_compaction { info!("running enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs"); - let jobs = self.gc_compaction_split_jobs(options).await?; + let jobs = self + .gc_compaction_split_jobs(job, options.sub_compaction_max_job_size_mb) + .await?; let jobs_len = jobs.len(); for (idx, job) in jobs.into_iter().enumerate() { info!( @@ -1902,19 +1949,15 @@ impl Timeline { } return Ok(()); } - self.compact_with_gc_inner(cancel, options, ctx).await + self.compact_with_gc_inner(cancel, job, ctx).await } async fn compact_with_gc_inner( self: &Arc, cancel: &CancellationToken, - options: CompactOptions, + job: GcCompactJob, ctx: &RequestContext, ) -> anyhow::Result<()> { - assert!( - !options.sub_compaction, - "sub-compaction should be handled by the outer function" - ); // Block other compaction/GC tasks from running for now. GC-compaction could run along // with legacy compaction tasks in the future. Always ensure the lock order is compaction -> gc. // Note that we already acquired the compaction lock when the outer `compact` function gets called. @@ -1934,19 +1977,11 @@ impl Timeline { ) .await?; - let flags = options.flags; - let compaction_key_range = options - .compact_range - .map(|range| range.start..range.end) - .unwrap_or_else(|| Key::MIN..Key::MAX); + let dry_run = job.dry_run; + let compact_key_range = job.compact_key_range; + let compact_lsn_range = job.compact_lsn_range; - let dry_run = flags.contains(CompactFlags::DryRun); - - if compaction_key_range == (Key::MIN..Key::MAX) { - info!("running enhanced gc bottom-most compaction, dry_run={dry_run}, compaction_key_range={}..{}", compaction_key_range.start, compaction_key_range.end); - } else { - info!("running enhanced gc bottom-most compaction, dry_run={dry_run}"); - } + info!("running enhanced gc bottom-most compaction, dry_run={dry_run}, compact_key_range={}..{}, compact_lsn_range={}..{}", compact_key_range.start, compact_key_range.end, compact_lsn_range.start, compact_lsn_range.end); scopeguard::defer! { info!("done enhanced gc bottom-most compaction"); @@ -1970,11 +2005,15 @@ impl Timeline { // to get the truth data. 
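The job construction above follows an "unset means widest range" rule: a missing key range widens to the whole keyspace, and a missing LSN range widens to `Lsn::INVALID..Lsn::MAX`, which later resolves to "everything below the gc cutoff". A minimal sketch of that defaulting, assuming `u64` in place of both `Key` and `Lsn`, and `0`/`u64::MAX` in place of `Lsn::INVALID`/`Lsn::MAX`:

```rust
// Illustrative only: u64 stands in for both Key and Lsn.
#[derive(Default)]
struct Options {
    compact_key_range: Option<std::ops::Range<u64>>,
    compact_lsn_range: Option<std::ops::Range<u64>>,
    dry_run: bool,
}

#[derive(Debug)]
struct Job {
    dry_run: bool,
    compact_key_range: std::ops::Range<u64>,
    compact_lsn_range: std::ops::Range<u64>,
}

impl Job {
    /// Unset ranges widen to "everything", matching the idea behind
    /// `GcCompactJob::from_compact_options`.
    fn from_options(options: Options) -> Self {
        Job {
            dry_run: options.dry_run,
            compact_key_range: options.compact_key_range.unwrap_or(0..u64::MAX),
            compact_lsn_range: options.compact_lsn_range.unwrap_or(0..u64::MAX),
        }
    }
}

fn main() {
    // Only an LSN range is given; the key range defaults to the full keyspace.
    let job = Job::from_options(Options {
        compact_lsn_range: Some(0x20..0x50),
        ..Default::default()
    });
    println!("{job:?}");
}
```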
let real_gc_cutoff = *self.get_latest_gc_cutoff_lsn(); // The compaction algorithm will keep all keys above the gc_cutoff while keeping only necessary keys below the gc_cutoff for - // each of the retain_lsn. Therefore, if the user-provided `compact_below_lsn` is larger than the real gc cutoff, we will use + // each of the retain_lsn. Therefore, if the user-provided `compact_lsn_range.end` is larger than the real gc cutoff, we will use // the real cutoff. - let mut gc_cutoff = options.compact_below_lsn.unwrap_or(real_gc_cutoff); + let mut gc_cutoff = if compact_lsn_range.end == Lsn::MAX { + real_gc_cutoff + } else { + compact_lsn_range.end + }; if gc_cutoff > real_gc_cutoff { - warn!("provided compact_below_lsn={} is larger than the real_gc_cutoff={}, using the real gc cutoff", gc_cutoff, real_gc_cutoff); + warn!("provided compact_lsn_range.end={} is larger than the real_gc_cutoff={}, using the real gc cutoff", gc_cutoff, real_gc_cutoff); gc_cutoff = real_gc_cutoff; } gc_cutoff @@ -1991,7 +2030,7 @@ impl Timeline { } let mut selected_layers: Vec = Vec::new(); drop(gc_info); - // Pick all the layers intersect or below the gc_cutoff, get the largest LSN in the selected layers. + // Firstly, pick all the layers intersect or below the gc_cutoff, get the largest LSN in the selected layers. let Some(max_layer_lsn) = layers .iter_historic_layers() .filter(|desc| desc.get_lsn_range().start <= gc_cutoff) @@ -2001,27 +2040,45 @@ impl Timeline { info!("no layers to compact with gc: no historic layers below gc_cutoff, gc_cutoff={}", gc_cutoff); return Ok(()); }; + // Next, if the user specifies compact_lsn_range.start, we need to filter some layers out. All the layers (strictly) below + // the min_layer_lsn computed as below will be filtered out and the data will be accessed using the normal read path, as if + // it is a branch. + let Some(min_layer_lsn) = layers + .iter_historic_layers() + .filter(|desc| { + if compact_lsn_range.start == Lsn::INVALID { + true // select all layers below if start == Lsn(0) + } else { + desc.get_lsn_range().end > compact_lsn_range.start // strictly larger than compact_above_lsn + } + }) + .map(|desc| desc.get_lsn_range().start) + .min() + else { + info!("no layers to compact with gc: no historic layers above compact_above_lsn, compact_above_lsn={}", compact_lsn_range.end); + return Ok(()); + }; // Then, pick all the layers that are below the max_layer_lsn. This is to ensure we can pick all single-key // layers to compact. 
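The two bounds just computed can be pictured with nothing more than a list of per-layer LSN ranges: the upper bound is the largest `end_lsn` among layers that start at or below the gc cutoff, and the lower bound is the smallest `start_lsn` among layers that end above the requested `compact_lsn_range.start` (with `Lsn::INVALID` meaning "no lower bound"). A self-contained sketch, with `u64` standing in for `Lsn` and `0` for `Lsn::INVALID`:

```rust
// Illustrative only: each layer is represented by just its [start_lsn, end_lsn) range.
fn layer_lsn_bounds(
    layers: &[std::ops::Range<u64>],
    gc_cutoff: u64,
    compact_lsn_start: u64,
) -> Option<(u64, u64)> {
    // Highest end_lsn among layers that start at or below the gc cutoff.
    let max_layer_lsn = layers
        .iter()
        .filter(|l| l.start <= gc_cutoff)
        .map(|l| l.end)
        .max()?;
    // Lowest start_lsn among layers that end above the requested lower bound;
    // a lower bound of 0 ("invalid") keeps every layer, as in the patch.
    let min_layer_lsn = layers
        .iter()
        .filter(|l| compact_lsn_start == 0 || l.end > compact_lsn_start)
        .map(|l| l.start)
        .min()?;
    Some((min_layer_lsn, max_layer_lsn))
}

fn main() {
    let layers = vec![0x10..0x20, 0x20..0x30, 0x30..0x50];
    // Full history below the cutoff: the bounds span all three layers.
    assert_eq!(layer_lsn_bounds(&layers, 0x30, 0), Some((0x10, 0x50)));
    // With compact_lsn_range.start = 0x20, the bottom layer is served through
    // the normal read path instead, so the lower bound moves up.
    assert_eq!(layer_lsn_bounds(&layers, 0x30, 0x20), Some((0x20, 0x50)));
    println!("ok");
}
```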
let mut rewrite_layers = Vec::new(); for desc in layers.iter_historic_layers() { if desc.get_lsn_range().end <= max_layer_lsn - && overlaps_with(&desc.get_key_range(), &compaction_key_range) + && desc.get_lsn_range().start >= min_layer_lsn + && overlaps_with(&desc.get_key_range(), &compact_key_range) { // If the layer overlaps with the compaction key range, we need to read it to obtain all keys within the range, // even if it might contain extra keys selected_layers.push(guard.get_from_desc(&desc)); // If the layer is not fully contained within the key range, we need to rewrite it if it's a delta layer (it's fine // to overlap image layers) - if desc.is_delta() - && !fully_contains(&compaction_key_range, &desc.get_key_range()) + if desc.is_delta() && !fully_contains(&compact_key_range, &desc.get_key_range()) { rewrite_layers.push(desc); } } } if selected_layers.is_empty() { - info!("no layers to compact with gc: no layers within the key range, gc_cutoff={}, key_range={}..{}", gc_cutoff, compaction_key_range.start, compaction_key_range.end); + info!("no layers to compact with gc: no layers within the key range, gc_cutoff={}, key_range={}..{}", gc_cutoff, compact_key_range.start, compact_key_range.end); return Ok(()); } retain_lsns_below_horizon.sort(); @@ -2029,13 +2086,20 @@ impl Timeline { selected_layers, gc_cutoff, retain_lsns_below_horizon, + min_layer_lsn, max_layer_lsn, - compaction_key_range, + compaction_key_range: compact_key_range, rewrite_layers, } }; - let lowest_retain_lsn = if self.ancestor_timeline.is_some() { - Lsn(self.ancestor_lsn.0 + 1) + let (has_data_below, lowest_retain_lsn) = if compact_lsn_range.start != Lsn::INVALID { + // If we only compact above some LSN, we should get the history from the current branch below the specified LSN. + // We use job_desc.min_layer_lsn as if it's the lowest branch point. + (true, job_desc.min_layer_lsn) + } else if self.ancestor_timeline.is_some() { + // In theory, we can also use min_layer_lsn here, but using ancestor LSN makes sure the delta layers cover the + // LSN ranges all the way to the ancestor timeline. 
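The selection loop above leans on two range predicates, `overlaps_with` and `fully_contains`. The sketch below shows the half-open-range semantics they are assumed to have (plain integer keys for illustration): a layer is read when its key range overlaps the compaction key range, and a delta layer is additionally queued for rewriting when it sticks out of that range:

```rust
use std::ops::Range;

/// Half-open ranges [start, end) overlap iff each one starts before the other ends.
fn overlaps_with(a: &Range<u64>, b: &Range<u64>) -> bool {
    a.start < b.end && b.start < a.end
}

/// `outer` fully contains `inner` iff both of inner's bounds lie inside outer.
fn fully_contains(outer: &Range<u64>, inner: &Range<u64>) -> bool {
    outer.start <= inner.start && inner.end <= outer.end
}

fn main() {
    let compact_key_range = 10..20;
    for layer in [5..12, 12..18, 18..25, 30..40] {
        // Read the layer if it overlaps the compaction range; rewrite a delta
        // layer if, in addition, it sticks out of the range on either side.
        let read = overlaps_with(&layer, &compact_key_range);
        let rewrite = read && !fully_contains(&compact_key_range, &layer);
        println!("{layer:?}: read={read}, rewrite={rewrite}");
    }
}
```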
+ (true, self.ancestor_lsn) } else { let res = job_desc .retain_lsns_below_horizon @@ -2053,17 +2117,19 @@ impl Timeline { .unwrap_or(job_desc.gc_cutoff) ); } - res + (false, res) }; info!( - "picked {} layers for compaction ({} layers need rewriting) with max_layer_lsn={} gc_cutoff={} lowest_retain_lsn={}, key_range={}..{}", + "picked {} layers for compaction ({} layers need rewriting) with max_layer_lsn={} min_layer_lsn={} gc_cutoff={} lowest_retain_lsn={}, key_range={}..{}, has_data_below={}", job_desc.selected_layers.len(), job_desc.rewrite_layers.len(), job_desc.max_layer_lsn, + job_desc.min_layer_lsn, job_desc.gc_cutoff, lowest_retain_lsn, job_desc.compaction_key_range.start, - job_desc.compaction_key_range.end + job_desc.compaction_key_range.end, + has_data_below, ); for layer in &job_desc.selected_layers { @@ -2107,10 +2173,22 @@ impl Timeline { let mut delta_layers = Vec::new(); let mut image_layers = Vec::new(); let mut downloaded_layers = Vec::new(); + let mut total_downloaded_size = 0; + let mut total_layer_size = 0; for layer in &job_desc.selected_layers { + if layer.needs_download().await?.is_some() { + total_downloaded_size += layer.layer_desc().file_size; + } + total_layer_size += layer.layer_desc().file_size; let resident_layer = layer.download_and_keep_resident().await?; downloaded_layers.push(resident_layer); } + info!( + "finish downloading layers, downloaded={}, total={}, ratio={:.2}", + total_downloaded_size, + total_layer_size, + total_downloaded_size as f64 / total_layer_size as f64 + ); for resident_layer in &downloaded_layers { if resident_layer.layer_desc().is_delta() { let layer = resident_layer.get_as_delta(ctx).await?; @@ -2133,7 +2211,7 @@ impl Timeline { // Only create image layers when there is no ancestor branches. TODO: create covering image layer // when some condition meet. - let mut image_layer_writer = if self.ancestor_timeline.is_none() { + let mut image_layer_writer = if !has_data_below { Some( SplitImageLayerWriter::new( self.conf, @@ -2166,7 +2244,11 @@ impl Timeline { } let mut delta_layer_rewriters = HashMap::, RewritingLayers>::new(); - /// Returns None if there is no ancestor branch. Throw an error when the key is not found. + /// When compacting not at a bottom range (=`[0,X)`) of the root branch, we "have data below" (`has_data_below=true`). + /// The two cases are compaction in ancestor branches and when `compact_lsn_range.start` is set. + /// In those cases, we need to pull up data from below the LSN range we're compaction. + /// + /// This function unifies the cases so that later code doesn't have to think about it. /// /// Currently, we always get the ancestor image for each key in the child branch no matter whether the image /// is needed for reconstruction. This should be fixed in the future. @@ -2174,17 +2256,19 @@ impl Timeline { /// Furthermore, we should do vectored get instead of a single get, or better, use k-merge for ancestor /// images. async fn get_ancestor_image( - tline: &Arc, + this_tline: &Arc, key: Key, ctx: &RequestContext, + has_data_below: bool, + history_lsn_point: Lsn, ) -> anyhow::Result> { - if tline.ancestor_timeline.is_none() { + if !has_data_below { return Ok(None); }; // This function is implemented as a get of the current timeline at ancestor LSN, therefore reusing // as much existing code as possible. 
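The helper introduced above generalizes the old ancestor-only behaviour: a base image is pulled through the normal read path at the boundary LSN whenever there is history below the range being compacted, whether that history lives on an ancestor branch or below a user-supplied `compact_lsn_range.start`. A simplified sketch of the pattern — the read callback, integer keys/LSNs, and byte payloads are stand-ins, not the real `Timeline::get`:

```rust
// Illustrative only: u64 keys/LSNs, Vec<u8> pages, and a closure in place of
// the timeline read path.
fn base_image_for_key(
    key: u64,
    has_data_below: bool,
    history_lsn_point: u64,
    read_page_at: impl Fn(u64, u64) -> Vec<u8>,
) -> Option<(u64, u64, Vec<u8>)> {
    if !has_data_below {
        // Bottom of the root branch: everything needed is in the selected layers.
        return None;
    }
    // There is history below the compacted range (ancestor branch or a
    // user-supplied lower LSN bound): fetch a starting image at that boundary.
    let img = read_page_at(key, history_lsn_point);
    Some((key, history_lsn_point, img))
}

fn main() {
    let fake_read = |key: u64, lsn: u64| format!("page {key}@{lsn:#x}").into_bytes();
    assert!(base_image_for_key(1, false, 0x10, fake_read).is_none());
    let (_, lsn, img) = base_image_for_key(1, true, 0x10, fake_read).unwrap();
    println!("base image at {lsn:#x}, {} bytes", img.len());
}
```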
- let img = tline.get(key, tline.ancestor_lsn, ctx).await?; - Ok(Some((key, tline.ancestor_lsn, img))) + let img = this_tline.get(key, history_lsn_point, ctx).await?; + Ok(Some((key, history_lsn_point, img))) } // Actually, we can decide not to write to the image layer at all at this point because @@ -2268,7 +2352,8 @@ impl Timeline { job_desc.gc_cutoff, &job_desc.retain_lsns_below_horizon, COMPACTION_DELTA_THRESHOLD, - get_ancestor_image(self, *last_key, ctx).await?, + get_ancestor_image(self, *last_key, ctx, has_data_below, lowest_retain_lsn) + .await?, ) .await?; retention @@ -2297,7 +2382,7 @@ impl Timeline { job_desc.gc_cutoff, &job_desc.retain_lsns_below_horizon, COMPACTION_DELTA_THRESHOLD, - get_ancestor_image(self, last_key, ctx).await?, + get_ancestor_image(self, last_key, ctx, has_data_below, lowest_retain_lsn).await?, ) .await?; retention diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index 810a9723e0..88873c63c2 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -153,6 +153,7 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder): if i % 10 == 0: log.info(f"Running churn round {i}/{churn_rounds} ...") + if (i - 1) % 10 == 0: # Run gc-compaction every 10 rounds to ensure the test doesn't take too long time. ps_http.timeline_compact( tenant_id, @@ -161,10 +162,11 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder): body={ "scheduled": True, "sub_compaction": True, - "compact_range": { + "compact_key_range": { "start": "000000000000000000000000000000000000", "end": "030000000000000000000000000000000000", }, + "sub_compaction_max_job_size_mb": 16, }, ) From 2f3f98a3190a787b8420b8e969a7c5f6930de0b7 Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Thu, 12 Dec 2024 21:25:39 +0100 Subject: [PATCH 030/583] use OIDC role instead of AWS access keys for managing test runner (#10117) in periodic pagebench workflow ## Problem for background see https://github.com/neondatabase/cloud/issues/21545 ## Summary of changes use OIDC role to manage runners instead of AWS access key which needs to be periodically rotated ## logs seems to work in https://github.com/neondatabase/neon/actions/runs/12298575888/job/34322306127#step:6:1 --- .github/actionlint.yml | 1 + .github/workflows/periodic_pagebench.yml | 17 +++++++++++++++-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/.github/actionlint.yml b/.github/actionlint.yml index 9d8389faa5..7a97e2ae55 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -24,3 +24,4 @@ config-variables: - PGREGRESS_PG16_PROJECT_ID - PGREGRESS_PG17_PROJECT_ID - SLACK_ON_CALL_QA_STAGING_STREAM + - DEV_AWS_OIDC_ROLE_MANAGE_BENCHMARK_EC2_VMS_ARN diff --git a/.github/workflows/periodic_pagebench.yml b/.github/workflows/periodic_pagebench.yml index a04ceb4a24..9f5a16feca 100644 --- a/.github/workflows/periodic_pagebench.yml +++ b/.github/workflows/periodic_pagebench.yml @@ -41,8 +41,6 @@ jobs: env: API_KEY: ${{ secrets.PERIODIC_PAGEBENCH_EC2_RUNNER_API_KEY }} RUN_ID: ${{ github.run_id }} - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_EC2_US_TEST_RUNNER_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY : ${{ secrets.AWS_EC2_US_TEST_RUNNER_ACCESS_KEY_SECRET }} AWS_DEFAULT_REGION : "eu-central-1" AWS_INSTANCE_ID : "i-02a59a3bf86bc7e74" steps: @@ -53,6 +51,13 @@ jobs: - name: Show my own (github runner) external IP address - usefull for IP allowlisting run: curl https://ifconfig.me + - name: Assume AWS OIDC role that allows to 
manage (start/stop/describe... EC machine) + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_MANAGE_BENCHMARK_EC2_VMS_ARN }} + role-duration-seconds: 3600 + - name: Start EC2 instance and wait for the instance to boot up run: | aws ec2 start-instances --instance-ids $AWS_INSTANCE_ID @@ -150,6 +155,14 @@ jobs: -H "Authorization: Bearer $API_KEY" \ -d '' + - name: Assume AWS OIDC role that allows to manage (start/stop/describe... EC machine) + if: always() && steps.poll_step.outputs.too_many_runs != 'true' + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_MANAGE_BENCHMARK_EC2_VMS_ARN }} + role-duration-seconds: 3600 + - name: Stop EC2 instance and wait for the instance to be stopped if: always() && steps.poll_step.outputs.too_many_runs != 'true' run: | From ac04bad45751bd60f19e6ac51935f7de01e6b87d Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 12 Dec 2024 22:55:38 +0000 Subject: [PATCH 031/583] CI: don't run debug builds with LFC (#10123) ## Problem I've noticed that debug builds with LFC fail more frequently and for some reason ,their failure do block merging (but it should not) ## Summary of changes - Do not run Debug builds with LFC --- .github/workflows/build_and_test.yml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 4bfb5077c6..1bae23f7ef 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -254,16 +254,14 @@ jobs: build-tag: ${{ needs.tag.outputs.build-tag }} build-type: ${{ matrix.build-type }} # Run tests on all Postgres versions in release builds and only on the latest version in debug builds. - # Run without LFC on v17 release and debug builds only. For all the other cases LFC is enabled. Failure on the - # debug build with LFC enabled doesn't block merging. + # Run without LFC on v17 release and debug builds only. For all the other cases LFC is enabled. test-cfg: | ${{ matrix.build-type == 'release' && '[{"pg_version":"v14", "lfc_state": "with-lfc"}, {"pg_version":"v15", "lfc_state": "with-lfc"}, {"pg_version":"v16", "lfc_state": "with-lfc"}, {"pg_version":"v17", "lfc_state": "with-lfc"}, {"pg_version":"v17", "lfc_state": "without-lfc"}]' - || '[{"pg_version":"v17", "lfc_state": "without-lfc"}, - {"pg_version":"v17", "lfc_state": "with-lfc" }]' }} + || '[{"pg_version":"v17", "lfc_state": "without-lfc" }]' }} secrets: inherit # Keep `benchmarks` job outside of `build-and-test-locally` workflow to make job failures non-blocking From 59ef70192591c0fc2a5c2a69eea0d47f7bd8e033 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?JC=20Gr=C3=BCnhage?= Date: Fri, 13 Dec 2024 00:38:20 +0100 Subject: [PATCH 032/583] CI(deploy): fix git tag/release creation (#10119) ## Problem When moving the comment on proxy-releases from the yaml doc into a javascript code block, I missed converting the comment marker from `#` to `//`. ## Summary of changes Correctly convert comment marker. 
--- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 1bae23f7ef..a3943cba91 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -1143,7 +1143,7 @@ jobs: console.log(`Tag ${tag} created successfully.`); } - # TODO: check how GitHub releases looks for proxy/compute releases and enable them if they're ok + // TODO: check how GitHub releases looks for proxy/compute releases and enable them if they're ok if (context.ref !== 'refs/heads/release') { console.log(`GitHub release skipped for ${context.ref}.`); return; From 2451969d5c533ab4ca085cdce12fdd79d55cca8b Mon Sep 17 00:00:00 2001 From: Rahul Patil Date: Fri, 13 Dec 2024 13:22:15 +0100 Subject: [PATCH 033/583] fix(ci): Allow github-action-script to post reports (#10136) Allow github-action-script to post reports. Failed CI: https://github.com/neondatabase/neon/actions/runs/12304655364/job/34342554049#step:13:514 --- .github/workflows/build_and_test.yml | 15 +++++++++++++++ .github/workflows/periodic_pagebench.yml | 8 +++++--- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index a3943cba91..b3556debe3 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -303,6 +303,11 @@ jobs: benchmarks: if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') needs: [ check-permissions, build-and-test-locally, build-build-tools-image, get-benchmarks-durations ] + permissions: + id-token: write # aws-actions/configure-aws-credentials + statuses: write + contents: write + pull-requests: write runs-on: [ self-hosted, small ] container: image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm @@ -343,6 +348,11 @@ jobs: report-benchmarks-failures: needs: [ benchmarks, create-test-report ] if: github.ref_name == 'main' && failure() && needs.benchmarks.result == 'failure' + permissions: + id-token: write # aws-actions/configure-aws-credentials + statuses: write + contents: write + pull-requests: write runs-on: ubuntu-22.04 steps: @@ -1024,6 +1034,11 @@ jobs: trigger-custom-extensions-build-and-wait: needs: [ check-permissions, tag ] runs-on: ubuntu-22.04 + permissions: + id-token: write # aws-actions/configure-aws-credentials + statuses: write + contents: write + pull-requests: write steps: - name: Set PR's status to pending and request a remote CI test run: | diff --git a/.github/workflows/periodic_pagebench.yml b/.github/workflows/periodic_pagebench.yml index 9f5a16feca..049990f17b 100644 --- a/.github/workflows/periodic_pagebench.yml +++ b/.github/workflows/periodic_pagebench.yml @@ -21,15 +21,17 @@ defaults: run: shell: bash -euo pipefail {0} -permissions: - id-token: write # aws-actions/configure-aws-credentials - concurrency: group: ${{ github.workflow }} cancel-in-progress: false jobs: trigger_bench_on_ec2_machine_in_eu_central_1: + permissions: + id-token: write # aws-actions/configure-aws-credentials + statuses: write + contents: write + pull-requests: write runs-on: [ self-hosted, small ] container: image: neondatabase/build-tools:pinned-bookworm From 7dc382601cb9f2c508f11acdac897a8a0bbbdce3 Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Fri, 13 Dec 2024 14:59:04 +0100 Subject: [PATCH 034/583] Fix pg_regress tests on a cloud staging instance 
(#10134) ## Problem pg_regress tests start failing due to unique ids added to Neon error messages ## Summary of changes Patches updated --- .github/workflows/cloud-regress.yml | 1 + compute/patches/cloud_regress_pg16.patch | 173 ++++++++++++---------- compute/patches/cloud_regress_pg17.patch | 179 ++++++++++++----------- 3 files changed, 190 insertions(+), 163 deletions(-) diff --git a/.github/workflows/cloud-regress.yml b/.github/workflows/cloud-regress.yml index 2fc26baa21..7b9e434ec3 100644 --- a/.github/workflows/cloud-regress.yml +++ b/.github/workflows/cloud-regress.yml @@ -99,6 +99,7 @@ jobs: BENCHMARK_CONNSTR: ${{steps.create-branch.outputs.dsn}} - name: Delete branch + if: always() uses: ./.github/actions/neon-branch-delete with: api_key: ${{ secrets.NEON_STAGING_API_KEY }} diff --git a/compute/patches/cloud_regress_pg16.patch b/compute/patches/cloud_regress_pg16.patch index a4b93d0260..3f0bb84ae7 100644 --- a/compute/patches/cloud_regress_pg16.patch +++ b/compute/patches/cloud_regress_pg16.patch @@ -981,7 +981,7 @@ index fc42d418bf..e38f517574 100644 CREATE SCHEMA addr_nsp; SET search_path TO 'addr_nsp'; diff --git a/src/test/regress/expected/password.out b/src/test/regress/expected/password.out -index 8475231735..1afae5395f 100644 +index 8475231735..0653946337 100644 --- a/src/test/regress/expected/password.out +++ b/src/test/regress/expected/password.out @@ -12,11 +12,11 @@ SET password_encryption = 'md5'; -- ok @@ -1006,65 +1006,63 @@ index 8475231735..1afae5395f 100644 -----------------+--------------------------------------------------- - regress_passwd1 | md5783277baca28003b33453252be4dbb34 - regress_passwd2 | md54044304ba511dd062133eb5b4b84a2a3 -+ regress_passwd1 | NEON_MD5_PLACEHOLDER_regress_passwd1 -+ regress_passwd2 | NEON_MD5_PLACEHOLDER_regress_passwd2 ++ regress_passwd1 | NEON_MD5_PLACEHOLDER:regress_passwd1 ++ regress_passwd2 | NEON_MD5_PLACEHOLDER:regress_passwd2 regress_passwd3 | SCRAM-SHA-256$4096:$: - regress_passwd4 | + regress_passwd4 | SCRAM-SHA-256$4096:$: (4 rows) -- Rename a role -@@ -54,24 +54,30 @@ ALTER ROLE regress_passwd2_new RENAME TO regress_passwd2; +@@ -54,24 +54,16 @@ ALTER ROLE regress_passwd2_new RENAME TO regress_passwd2; -- passwords. SET password_encryption = 'md5'; -- encrypt with MD5 -ALTER ROLE regress_passwd2 PASSWORD 'foo'; +--- already encrypted, use as they are +-ALTER ROLE regress_passwd1 PASSWORD 'md5cd3578025fe2c3d7ed1b9a9b26238b70'; +-ALTER ROLE regress_passwd3 PASSWORD 'SCRAM-SHA-256$4096:VLK4RMaQLCvNtQ==$6YtlR4t69SguDiwFvbVgVZtuz6gpJQQqUMZ7IQJK5yI=:ps75jrHeYU4lXCcXI4O8oIdJ3eO8o2jirjruw9phBTo='; +ALTER ROLE regress_passwd2 PASSWORD NEON_PASSWORD_PLACEHOLDER; - -- already encrypted, use as they are - ALTER ROLE regress_passwd1 PASSWORD 'md5cd3578025fe2c3d7ed1b9a9b26238b70'; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} - ALTER ROLE regress_passwd3 PASSWORD 'SCRAM-SHA-256$4096:VLK4RMaQLCvNtQ==$6YtlR4t69SguDiwFvbVgVZtuz6gpJQQqUMZ7IQJK5yI=:ps75jrHeYU4lXCcXI4O8oIdJ3eO8o2jirjruw9phBTo='; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} SET password_encryption = 'scram-sha-256'; -- create SCRAM secret -ALTER ROLE regress_passwd4 PASSWORD 'foo'; +--- already encrypted with MD5, use as it is +-CREATE ROLE regress_passwd5 PASSWORD 'md5e73a4b11df52a6068f8b39f90be36023'; +--- This looks like a valid SCRAM-SHA-256 secret, but it is not +--- so it should be hashed with SCRAM-SHA-256. 
+-CREATE ROLE regress_passwd6 PASSWORD 'SCRAM-SHA-256$1234'; +--- These may look like valid MD5 secrets, but they are not, so they +--- should be hashed with SCRAM-SHA-256. +--- trailing garbage at the end +-CREATE ROLE regress_passwd7 PASSWORD 'md5012345678901234567890123456789zz'; +--- invalid length +-CREATE ROLE regress_passwd8 PASSWORD 'md501234567890123456789012345678901zz'; +ALTER ROLE regress_passwd4 PASSWORD NEON_PASSWORD_PLACEHOLDER; - -- already encrypted with MD5, use as it is - CREATE ROLE regress_passwd5 PASSWORD 'md5e73a4b11df52a6068f8b39f90be36023'; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} - -- This looks like a valid SCRAM-SHA-256 secret, but it is not - -- so it should be hashed with SCRAM-SHA-256. - CREATE ROLE regress_passwd6 PASSWORD 'SCRAM-SHA-256$1234'; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} - -- These may look like valid MD5 secrets, but they are not, so they - -- should be hashed with SCRAM-SHA-256. - -- trailing garbage at the end - CREATE ROLE regress_passwd7 PASSWORD 'md5012345678901234567890123456789zz'; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} - -- invalid length - CREATE ROLE regress_passwd8 PASSWORD 'md501234567890123456789012345678901zz'; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} ++-- Neon does not support encrypted passwords, use unencrypted instead ++CREATE ROLE regress_passwd5 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++-- Neon does not support encrypted passwords, use unencrypted instead ++CREATE ROLE regress_passwd6 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd7 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd8 PASSWORD NEON_PASSWORD_PLACEHOLDER; -- Changing the SCRAM iteration count SET scram_iterations = 1024; CREATE ROLE regress_passwd9 PASSWORD 'alterediterationcount'; -@@ -81,63 +87,67 @@ SELECT rolname, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+ +@@ -81,11 +73,11 @@ SELECT rolname, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+ ORDER BY rolname, rolpassword; rolname | rolpassword_masked -----------------+--------------------------------------------------- - regress_passwd1 | md5cd3578025fe2c3d7ed1b9a9b26238b70 - regress_passwd2 | md5dfa155cadd5f4ad57860162f3fab9cdb -+ regress_passwd1 | NEON_MD5_PLACEHOLDER_regress_passwd1 -+ regress_passwd2 | NEON_MD5_PLACEHOLDER_regress_passwd2 ++ regress_passwd1 | NEON_MD5_PLACEHOLDER:regress_passwd1 ++ regress_passwd2 | NEON_MD5_PLACEHOLDER:regress_passwd2 regress_passwd3 | SCRAM-SHA-256$4096:$: regress_passwd4 | SCRAM-SHA-256$4096:$: - regress_passwd5 | md5e73a4b11df52a6068f8b39f90be36023 -- regress_passwd6 | SCRAM-SHA-256$4096:$: -- regress_passwd7 | SCRAM-SHA-256$4096:$: -- regress_passwd8 | SCRAM-SHA-256$4096:$: - regress_passwd9 | SCRAM-SHA-256$1024:$: --(9 rows) -+(5 rows) - ++ regress_passwd5 | SCRAM-SHA-256$4096:$: + regress_passwd6 | SCRAM-SHA-256$4096:$: + regress_passwd7 | SCRAM-SHA-256$4096:$: + regress_passwd8 | SCRAM-SHA-256$4096:$: +@@ -95,23 +87,20 @@ SELECT rolname, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+ -- An empty password is not allowed, in any form CREATE ROLE regress_passwd_empty PASSWORD ''; NOTICE: empty string is not a valid password, clearing password @@ -1082,56 +1080,37 @@ index 
8475231735..1afae5395f 100644 -(1 row) +(0 rows) - -- Test with invalid stored and server keys. - -- - -- The first is valid, to act as a control. The others have too long - -- stored/server keys. They will be re-hashed. - CREATE ROLE regress_passwd_sha_len0 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI='; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} - CREATE ROLE regress_passwd_sha_len1 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96RqwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI='; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} - CREATE ROLE regress_passwd_sha_len2 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA='; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} +--- Test with invalid stored and server keys. +--- +--- The first is valid, to act as a control. The others have too long +--- stored/server keys. They will be re-hashed. +-CREATE ROLE regress_passwd_sha_len0 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI='; +-CREATE ROLE regress_passwd_sha_len1 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96RqwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI='; +-CREATE ROLE regress_passwd_sha_len2 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA='; ++-- Neon does not support encrypted passwords, use unencrypted instead ++CREATE ROLE regress_passwd_sha_len0 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd_sha_len1 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd_sha_len2 PASSWORD NEON_PASSWORD_PLACEHOLDER; -- Check that the invalid secrets were re-hashed. A re-hashed secret -- should not contain the original salt. 
SELECT rolname, rolpassword not like '%A6xHKoH/494E941doaPOYg==%' as is_rolpassword_rehashed - FROM pg_authid - WHERE rolname LIKE 'regress_passwd_sha_len%' +@@ -120,7 +109,7 @@ SELECT rolname, rolpassword not like '%A6xHKoH/494E941doaPOYg==%' as is_rolpassw ORDER BY rolname; -- rolname | is_rolpassword_rehashed ---------------------------+------------------------- + rolname | is_rolpassword_rehashed + -------------------------+------------------------- - regress_passwd_sha_len0 | f -- regress_passwd_sha_len1 | t -- regress_passwd_sha_len2 | t --(3 rows) -+ rolname | is_rolpassword_rehashed -+---------+------------------------- -+(0 rows) - - DROP ROLE regress_passwd1; - DROP ROLE regress_passwd2; - DROP ROLE regress_passwd3; - DROP ROLE regress_passwd4; - DROP ROLE regress_passwd5; -+ERROR: role "regress_passwd5" does not exist - DROP ROLE regress_passwd6; -+ERROR: role "regress_passwd6" does not exist - DROP ROLE regress_passwd7; -+ERROR: role "regress_passwd7" does not exist ++ regress_passwd_sha_len0 | t + regress_passwd_sha_len1 | t + regress_passwd_sha_len2 | t + (3 rows) +@@ -135,6 +124,7 @@ DROP ROLE regress_passwd7; DROP ROLE regress_passwd8; -+ERROR: role "regress_passwd8" does not exist DROP ROLE regress_passwd9; DROP ROLE regress_passwd_empty; +ERROR: role "regress_passwd_empty" does not exist DROP ROLE regress_passwd_sha_len0; -+ERROR: role "regress_passwd_sha_len0" does not exist DROP ROLE regress_passwd_sha_len1; -+ERROR: role "regress_passwd_sha_len1" does not exist DROP ROLE regress_passwd_sha_len2; -+ERROR: role "regress_passwd_sha_len2" does not exist - -- all entries should have been removed - SELECT rolname, rolpassword - FROM pg_authid diff --git a/src/test/regress/expected/privileges.out b/src/test/regress/expected/privileges.out index 5b9dba7b32..cc408dad42 100644 --- a/src/test/regress/expected/privileges.out @@ -3194,7 +3173,7 @@ index 1a6c61f49d..1c31ac6a53 100644 -- Test generic object addressing/identification functions CREATE SCHEMA addr_nsp; diff --git a/src/test/regress/sql/password.sql b/src/test/regress/sql/password.sql -index 53e86b0b6c..f07cf1ec54 100644 +index 53e86b0b6c..0303fdfe96 100644 --- a/src/test/regress/sql/password.sql +++ b/src/test/regress/sql/password.sql @@ -10,11 +10,11 @@ SET password_encryption = 'scram-sha-256'; -- ok @@ -3213,23 +3192,59 @@ index 53e86b0b6c..f07cf1ec54 100644 -- check list of created entries -- -@@ -42,14 +42,14 @@ ALTER ROLE regress_passwd2_new RENAME TO regress_passwd2; +@@ -42,26 +42,18 @@ ALTER ROLE regress_passwd2_new RENAME TO regress_passwd2; SET password_encryption = 'md5'; -- encrypt with MD5 -ALTER ROLE regress_passwd2 PASSWORD 'foo'; +--- already encrypted, use as they are +-ALTER ROLE regress_passwd1 PASSWORD 'md5cd3578025fe2c3d7ed1b9a9b26238b70'; +-ALTER ROLE regress_passwd3 PASSWORD 'SCRAM-SHA-256$4096:VLK4RMaQLCvNtQ==$6YtlR4t69SguDiwFvbVgVZtuz6gpJQQqUMZ7IQJK5yI=:ps75jrHeYU4lXCcXI4O8oIdJ3eO8o2jirjruw9phBTo='; +ALTER ROLE regress_passwd2 PASSWORD NEON_PASSWORD_PLACEHOLDER; - -- already encrypted, use as they are - ALTER ROLE regress_passwd1 PASSWORD 'md5cd3578025fe2c3d7ed1b9a9b26238b70'; - ALTER ROLE regress_passwd3 PASSWORD 'SCRAM-SHA-256$4096:VLK4RMaQLCvNtQ==$6YtlR4t69SguDiwFvbVgVZtuz6gpJQQqUMZ7IQJK5yI=:ps75jrHeYU4lXCcXI4O8oIdJ3eO8o2jirjruw9phBTo='; SET password_encryption = 'scram-sha-256'; -- create SCRAM secret -ALTER ROLE regress_passwd4 PASSWORD 'foo'; +--- already encrypted with MD5, use as it is +-CREATE ROLE regress_passwd5 PASSWORD 'md5e73a4b11df52a6068f8b39f90be36023'; +ALTER ROLE 
regress_passwd4 PASSWORD NEON_PASSWORD_PLACEHOLDER; - -- already encrypted with MD5, use as it is - CREATE ROLE regress_passwd5 PASSWORD 'md5e73a4b11df52a6068f8b39f90be36023'; ++-- Neon does not support encrypted passwords, use unencrypted instead ++CREATE ROLE regress_passwd5 PASSWORD NEON_PASSWORD_PLACEHOLDER; +--- This looks like a valid SCRAM-SHA-256 secret, but it is not +--- so it should be hashed with SCRAM-SHA-256. +-CREATE ROLE regress_passwd6 PASSWORD 'SCRAM-SHA-256$1234'; +--- These may look like valid MD5 secrets, but they are not, so they +--- should be hashed with SCRAM-SHA-256. +--- trailing garbage at the end +-CREATE ROLE regress_passwd7 PASSWORD 'md5012345678901234567890123456789zz'; +--- invalid length +-CREATE ROLE regress_passwd8 PASSWORD 'md501234567890123456789012345678901zz'; ++-- Neon does not support encrypted passwords, use unencrypted instead ++CREATE ROLE regress_passwd6 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd7 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd8 PASSWORD NEON_PASSWORD_PLACEHOLDER; + + -- Changing the SCRAM iteration count + SET scram_iterations = 1024; +@@ -78,13 +70,10 @@ ALTER ROLE regress_passwd_empty PASSWORD 'md585939a5ce845f1a1b620742e3c659e0a'; + ALTER ROLE regress_passwd_empty PASSWORD 'SCRAM-SHA-256$4096:hpFyHTUsSWcR7O9P$LgZFIt6Oqdo27ZFKbZ2nV+vtnYM995pDh9ca6WSi120=:qVV5NeluNfUPkwm7Vqat25RjSPLkGeoZBQs6wVv+um4='; + SELECT rolpassword FROM pg_authid WHERE rolname='regress_passwd_empty'; + +--- Test with invalid stored and server keys. +--- +--- The first is valid, to act as a control. The others have too long +--- stored/server keys. They will be re-hashed. +-CREATE ROLE regress_passwd_sha_len0 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI='; +-CREATE ROLE regress_passwd_sha_len1 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96RqwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI='; +-CREATE ROLE regress_passwd_sha_len2 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA='; ++-- Neon does not support encrypted passwords, use unencrypted instead ++CREATE ROLE regress_passwd_sha_len0 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd_sha_len1 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd_sha_len2 PASSWORD NEON_PASSWORD_PLACEHOLDER; + + -- Check that the invalid secrets were re-hashed. A re-hashed secret + -- should not contain the original salt. 
diff --git a/src/test/regress/sql/privileges.sql b/src/test/regress/sql/privileges.sql index 249df17a58..b258e7f26a 100644 --- a/src/test/regress/sql/privileges.sql diff --git a/compute/patches/cloud_regress_pg17.patch b/compute/patches/cloud_regress_pg17.patch index cbe84ef54b..e57447a2c6 100644 --- a/compute/patches/cloud_regress_pg17.patch +++ b/compute/patches/cloud_regress_pg17.patch @@ -1014,10 +1014,10 @@ index fc42d418bf..e38f517574 100644 CREATE SCHEMA addr_nsp; SET search_path TO 'addr_nsp'; diff --git a/src/test/regress/expected/password.out b/src/test/regress/expected/password.out -index 924d6e001d..5966531db6 100644 +index 924d6e001d..7fdda73439 100644 --- a/src/test/regress/expected/password.out +++ b/src/test/regress/expected/password.out -@@ -12,13 +12,13 @@ SET password_encryption = 'md5'; -- ok +@@ -12,13 +12,11 @@ SET password_encryption = 'md5'; -- ok SET password_encryption = 'scram-sha-256'; -- ok -- consistency of password entries SET password_encryption = 'md5'; @@ -1026,9 +1026,7 @@ index 924d6e001d..5966531db6 100644 -CREATE ROLE regress_passwd2; -ALTER ROLE regress_passwd2 PASSWORD 'role_pwd2'; +CREATE ROLE regress_passwd1 PASSWORD NEON_PASSWORD_PLACEHOLDER; -+ALTER ROLE regress_passwd1 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_passwd2 PASSWORD NEON_PASSWORD_PLACEHOLDER; -+ALTER ROLE regress_passwd2 PASSWORD NEON_PASSWORD_PLACEHOLDER; SET password_encryption = 'scram-sha-256'; -CREATE ROLE regress_passwd3 PASSWORD 'role_pwd3'; -CREATE ROLE regress_passwd4 PASSWORD NULL; @@ -1037,71 +1035,69 @@ index 924d6e001d..5966531db6 100644 -- check list of created entries -- -- The scram secret will look something like: -@@ -32,10 +32,10 @@ SELECT rolname, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+ +@@ -32,10 +30,10 @@ SELECT rolname, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+ ORDER BY rolname, rolpassword; rolname | rolpassword_masked -----------------+--------------------------------------------------- - regress_passwd1 | md5783277baca28003b33453252be4dbb34 - regress_passwd2 | md54044304ba511dd062133eb5b4b84a2a3 -+ regress_passwd1 | NEON_MD5_PLACEHOLDER_regress_passwd1 -+ regress_passwd2 | NEON_MD5_PLACEHOLDER_regress_passwd2 ++ regress_passwd1 | NEON_MD5_PLACEHOLDER:regress_passwd1 ++ regress_passwd2 | NEON_MD5_PLACEHOLDER:regress_passwd2 regress_passwd3 | SCRAM-SHA-256$4096:$: - regress_passwd4 | + regress_passwd4 | SCRAM-SHA-256$4096:$: (4 rows) -- Rename a role -@@ -56,24 +56,30 @@ ALTER ROLE regress_passwd2_new RENAME TO regress_passwd2; +@@ -56,24 +54,17 @@ ALTER ROLE regress_passwd2_new RENAME TO regress_passwd2; -- passwords. 
SET password_encryption = 'md5'; -- encrypt with MD5 -ALTER ROLE regress_passwd2 PASSWORD 'foo'; +--- already encrypted, use as they are +-ALTER ROLE regress_passwd1 PASSWORD 'md5cd3578025fe2c3d7ed1b9a9b26238b70'; +-ALTER ROLE regress_passwd3 PASSWORD 'SCRAM-SHA-256$4096:VLK4RMaQLCvNtQ==$6YtlR4t69SguDiwFvbVgVZtuz6gpJQQqUMZ7IQJK5yI=:ps75jrHeYU4lXCcXI4O8oIdJ3eO8o2jirjruw9phBTo='; +ALTER ROLE regress_passwd2 PASSWORD NEON_PASSWORD_PLACEHOLDER; - -- already encrypted, use as they are - ALTER ROLE regress_passwd1 PASSWORD 'md5cd3578025fe2c3d7ed1b9a9b26238b70'; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} - ALTER ROLE regress_passwd3 PASSWORD 'SCRAM-SHA-256$4096:VLK4RMaQLCvNtQ==$6YtlR4t69SguDiwFvbVgVZtuz6gpJQQqUMZ7IQJK5yI=:ps75jrHeYU4lXCcXI4O8oIdJ3eO8o2jirjruw9phBTo='; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} SET password_encryption = 'scram-sha-256'; -- create SCRAM secret -ALTER ROLE regress_passwd4 PASSWORD 'foo'; +ALTER ROLE regress_passwd4 PASSWORD NEON_PASSWORD_PLACEHOLDER; -- already encrypted with MD5, use as it is - CREATE ROLE regress_passwd5 PASSWORD 'md5e73a4b11df52a6068f8b39f90be36023'; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} - -- This looks like a valid SCRAM-SHA-256 secret, but it is not - -- so it should be hashed with SCRAM-SHA-256. - CREATE ROLE regress_passwd6 PASSWORD 'SCRAM-SHA-256$1234'; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} - -- These may look like valid MD5 secrets, but they are not, so they - -- should be hashed with SCRAM-SHA-256. - -- trailing garbage at the end - CREATE ROLE regress_passwd7 PASSWORD 'md5012345678901234567890123456789zz'; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} - -- invalid length - CREATE ROLE regress_passwd8 PASSWORD 'md501234567890123456789012345678901zz'; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} +-CREATE ROLE regress_passwd5 PASSWORD 'md5e73a4b11df52a6068f8b39f90be36023'; +--- This looks like a valid SCRAM-SHA-256 secret, but it is not +--- so it should be hashed with SCRAM-SHA-256. +-CREATE ROLE regress_passwd6 PASSWORD 'SCRAM-SHA-256$1234'; +--- These may look like valid MD5 secrets, but they are not, so they +--- should be hashed with SCRAM-SHA-256. 
+--- trailing garbage at the end +-CREATE ROLE regress_passwd7 PASSWORD 'md5012345678901234567890123456789zz'; +--- invalid length +-CREATE ROLE regress_passwd8 PASSWORD 'md501234567890123456789012345678901zz'; ++-- Neon does not support encrypted passwords, use unencrypted instead ++CREATE ROLE regress_passwd5 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++-- Neon does not support encrypted passwords, use unencrypted instead ++CREATE ROLE regress_passwd6 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd7 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd8 PASSWORD NEON_PASSWORD_PLACEHOLDER; -- Changing the SCRAM iteration count SET scram_iterations = 1024; CREATE ROLE regress_passwd9 PASSWORD 'alterediterationcount'; -@@ -83,63 +89,67 @@ SELECT rolname, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+ +@@ -83,11 +74,11 @@ SELECT rolname, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+ ORDER BY rolname, rolpassword; rolname | rolpassword_masked -----------------+--------------------------------------------------- - regress_passwd1 | md5cd3578025fe2c3d7ed1b9a9b26238b70 - regress_passwd2 | md5dfa155cadd5f4ad57860162f3fab9cdb -+ regress_passwd1 | NEON_MD5_PLACEHOLDER_regress_passwd1 -+ regress_passwd2 | NEON_MD5_PLACEHOLDER_regress_passwd2 ++ regress_passwd1 | NEON_MD5_PLACEHOLDER:regress_passwd1 ++ regress_passwd2 | NEON_MD5_PLACEHOLDER:regress_passwd2 regress_passwd3 | SCRAM-SHA-256$4096:$: regress_passwd4 | SCRAM-SHA-256$4096:$: - regress_passwd5 | md5e73a4b11df52a6068f8b39f90be36023 -- regress_passwd6 | SCRAM-SHA-256$4096:$: -- regress_passwd7 | SCRAM-SHA-256$4096:$: -- regress_passwd8 | SCRAM-SHA-256$4096:$: - regress_passwd9 | SCRAM-SHA-256$1024:$: --(9 rows) -+(5 rows) - ++ regress_passwd5 | SCRAM-SHA-256$4096:$: + regress_passwd6 | SCRAM-SHA-256$4096:$: + regress_passwd7 | SCRAM-SHA-256$4096:$: + regress_passwd8 | SCRAM-SHA-256$4096:$: +@@ -97,23 +88,20 @@ SELECT rolname, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+ -- An empty password is not allowed, in any form CREATE ROLE regress_passwd_empty PASSWORD ''; NOTICE: empty string is not a valid password, clearing password @@ -1119,56 +1115,37 @@ index 924d6e001d..5966531db6 100644 -(1 row) +(0 rows) - -- Test with invalid stored and server keys. - -- - -- The first is valid, to act as a control. The others have too long - -- stored/server keys. They will be re-hashed. - CREATE ROLE regress_passwd_sha_len0 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI='; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} - CREATE ROLE regress_passwd_sha_len1 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96RqwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI='; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} - CREATE ROLE regress_passwd_sha_len2 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA='; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} +--- Test with invalid stored and server keys. 
+--- +--- The first is valid, to act as a control. The others have too long +--- stored/server keys. They will be re-hashed. +-CREATE ROLE regress_passwd_sha_len0 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI='; +-CREATE ROLE regress_passwd_sha_len1 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96RqwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI='; +-CREATE ROLE regress_passwd_sha_len2 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA='; ++-- Neon does not support encrypted passwords, use unencrypted instead ++CREATE ROLE regress_passwd_sha_len0 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd_sha_len1 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd_sha_len2 PASSWORD NEON_PASSWORD_PLACEHOLDER; -- Check that the invalid secrets were re-hashed. A re-hashed secret -- should not contain the original salt. SELECT rolname, rolpassword not like '%A6xHKoH/494E941doaPOYg==%' as is_rolpassword_rehashed - FROM pg_authid - WHERE rolname LIKE 'regress_passwd_sha_len%' +@@ -122,7 +110,7 @@ SELECT rolname, rolpassword not like '%A6xHKoH/494E941doaPOYg==%' as is_rolpassw ORDER BY rolname; -- rolname | is_rolpassword_rehashed ---------------------------+------------------------- + rolname | is_rolpassword_rehashed + -------------------------+------------------------- - regress_passwd_sha_len0 | f -- regress_passwd_sha_len1 | t -- regress_passwd_sha_len2 | t --(3 rows) -+ rolname | is_rolpassword_rehashed -+---------+------------------------- -+(0 rows) - - DROP ROLE regress_passwd1; - DROP ROLE regress_passwd2; - DROP ROLE regress_passwd3; - DROP ROLE regress_passwd4; - DROP ROLE regress_passwd5; -+ERROR: role "regress_passwd5" does not exist - DROP ROLE regress_passwd6; -+ERROR: role "regress_passwd6" does not exist - DROP ROLE regress_passwd7; -+ERROR: role "regress_passwd7" does not exist ++ regress_passwd_sha_len0 | t + regress_passwd_sha_len1 | t + regress_passwd_sha_len2 | t + (3 rows) +@@ -137,6 +125,7 @@ DROP ROLE regress_passwd7; DROP ROLE regress_passwd8; -+ERROR: role "regress_passwd8" does not exist DROP ROLE regress_passwd9; DROP ROLE regress_passwd_empty; +ERROR: role "regress_passwd_empty" does not exist DROP ROLE regress_passwd_sha_len0; -+ERROR: role "regress_passwd_sha_len0" does not exist DROP ROLE regress_passwd_sha_len1; -+ERROR: role "regress_passwd_sha_len1" does not exist DROP ROLE regress_passwd_sha_len2; -+ERROR: role "regress_passwd_sha_len2" does not exist - -- all entries should have been removed - SELECT rolname, rolpassword - FROM pg_authid diff --git a/src/test/regress/expected/privileges.out b/src/test/regress/expected/privileges.out index 1296da0d57..f43fffa44c 100644 --- a/src/test/regress/expected/privileges.out @@ -3249,10 +3226,10 @@ index 1a6c61f49d..1c31ac6a53 100644 -- Test generic object addressing/identification functions CREATE SCHEMA addr_nsp; diff --git a/src/test/regress/sql/password.sql b/src/test/regress/sql/password.sql -index bb82aa4aa2..7424c91b10 100644 +index bb82aa4aa2..dd8a05e24d 100644 --- a/src/test/regress/sql/password.sql +++ b/src/test/regress/sql/password.sql -@@ -10,13 +10,13 @@ SET password_encryption = 'scram-sha-256'; -- ok +@@ -10,13 +10,11 @@ 
SET password_encryption = 'scram-sha-256'; -- ok -- consistency of password entries SET password_encryption = 'md5'; @@ -3261,9 +3238,7 @@ index bb82aa4aa2..7424c91b10 100644 -CREATE ROLE regress_passwd2; -ALTER ROLE regress_passwd2 PASSWORD 'role_pwd2'; +CREATE ROLE regress_passwd1 PASSWORD NEON_PASSWORD_PLACEHOLDER; -+ALTER ROLE regress_passwd1 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_passwd2 PASSWORD NEON_PASSWORD_PLACEHOLDER; -+ALTER ROLE regress_passwd2 PASSWORD NEON_PASSWORD_PLACEHOLDER; SET password_encryption = 'scram-sha-256'; -CREATE ROLE regress_passwd3 PASSWORD 'role_pwd3'; -CREATE ROLE regress_passwd4 PASSWORD NULL; @@ -3272,23 +3247,59 @@ index bb82aa4aa2..7424c91b10 100644 -- check list of created entries -- -@@ -44,14 +44,14 @@ ALTER ROLE regress_passwd2_new RENAME TO regress_passwd2; +@@ -44,26 +42,19 @@ ALTER ROLE regress_passwd2_new RENAME TO regress_passwd2; SET password_encryption = 'md5'; -- encrypt with MD5 -ALTER ROLE regress_passwd2 PASSWORD 'foo'; +--- already encrypted, use as they are +-ALTER ROLE regress_passwd1 PASSWORD 'md5cd3578025fe2c3d7ed1b9a9b26238b70'; +-ALTER ROLE regress_passwd3 PASSWORD 'SCRAM-SHA-256$4096:VLK4RMaQLCvNtQ==$6YtlR4t69SguDiwFvbVgVZtuz6gpJQQqUMZ7IQJK5yI=:ps75jrHeYU4lXCcXI4O8oIdJ3eO8o2jirjruw9phBTo='; +ALTER ROLE regress_passwd2 PASSWORD NEON_PASSWORD_PLACEHOLDER; - -- already encrypted, use as they are - ALTER ROLE regress_passwd1 PASSWORD 'md5cd3578025fe2c3d7ed1b9a9b26238b70'; - ALTER ROLE regress_passwd3 PASSWORD 'SCRAM-SHA-256$4096:VLK4RMaQLCvNtQ==$6YtlR4t69SguDiwFvbVgVZtuz6gpJQQqUMZ7IQJK5yI=:ps75jrHeYU4lXCcXI4O8oIdJ3eO8o2jirjruw9phBTo='; SET password_encryption = 'scram-sha-256'; -- create SCRAM secret -ALTER ROLE regress_passwd4 PASSWORD 'foo'; +ALTER ROLE regress_passwd4 PASSWORD NEON_PASSWORD_PLACEHOLDER; -- already encrypted with MD5, use as it is - CREATE ROLE regress_passwd5 PASSWORD 'md5e73a4b11df52a6068f8b39f90be36023'; +-CREATE ROLE regress_passwd5 PASSWORD 'md5e73a4b11df52a6068f8b39f90be36023'; ++-- Neon does not support encrypted passwords, use unencrypted instead ++CREATE ROLE regress_passwd5 PASSWORD NEON_PASSWORD_PLACEHOLDER; +--- This looks like a valid SCRAM-SHA-256 secret, but it is not +--- so it should be hashed with SCRAM-SHA-256. +-CREATE ROLE regress_passwd6 PASSWORD 'SCRAM-SHA-256$1234'; +--- These may look like valid MD5 secrets, but they are not, so they +--- should be hashed with SCRAM-SHA-256. +--- trailing garbage at the end +-CREATE ROLE regress_passwd7 PASSWORD 'md5012345678901234567890123456789zz'; +--- invalid length +-CREATE ROLE regress_passwd8 PASSWORD 'md501234567890123456789012345678901zz'; ++-- Neon does not support encrypted passwords, use unencrypted instead ++CREATE ROLE regress_passwd6 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd7 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd8 PASSWORD NEON_PASSWORD_PLACEHOLDER; + + -- Changing the SCRAM iteration count + SET scram_iterations = 1024; +@@ -80,13 +71,10 @@ ALTER ROLE regress_passwd_empty PASSWORD 'md585939a5ce845f1a1b620742e3c659e0a'; + ALTER ROLE regress_passwd_empty PASSWORD 'SCRAM-SHA-256$4096:hpFyHTUsSWcR7O9P$LgZFIt6Oqdo27ZFKbZ2nV+vtnYM995pDh9ca6WSi120=:qVV5NeluNfUPkwm7Vqat25RjSPLkGeoZBQs6wVv+um4='; + SELECT rolpassword FROM pg_authid WHERE rolname='regress_passwd_empty'; + +--- Test with invalid stored and server keys. +--- +--- The first is valid, to act as a control. The others have too long +--- stored/server keys. They will be re-hashed. 
+-CREATE ROLE regress_passwd_sha_len0 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI='; +-CREATE ROLE regress_passwd_sha_len1 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96RqwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI='; +-CREATE ROLE regress_passwd_sha_len2 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA='; ++-- Neon does not support encrypted passwords, use unencrypted instead ++CREATE ROLE regress_passwd_sha_len0 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd_sha_len1 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd_sha_len2 PASSWORD NEON_PASSWORD_PLACEHOLDER; + + -- Check that the invalid secrets were re-hashed. A re-hashed secret + -- should not contain the original salt. diff --git a/src/test/regress/sql/privileges.sql b/src/test/regress/sql/privileges.sql index 5880bc018d..27aa952b18 100644 --- a/src/test/regress/sql/privileges.sql From ce8eb089f3d002e5057f860454adeb0993431a19 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 13 Dec 2024 17:06:27 +0300 Subject: [PATCH 035/583] Extract public sk types to safekeeper_api (#10137) ## Problem We want to extract safekeeper http client to separate crate for use in storage controller and neon_local. However, many types used in the API are internal to safekeeper. ## Summary of changes Move them to safekeeper_api crate. No functional changes. ref https://github.com/neondatabase/neon/issues/9011 --- Cargo.lock | 3 + libs/safekeeper_api/Cargo.toml | 5 +- libs/safekeeper_api/src/lib.rs | 17 ++ libs/safekeeper_api/src/models.rs | 170 +++++++++++++++++- safekeeper/src/control_file_upgrade.rs | 3 +- safekeeper/src/debug_dump.rs | 2 +- safekeeper/src/handler.rs | 4 +- safekeeper/src/http/client.rs | 3 +- safekeeper/src/http/routes.rs | 72 ++------ safekeeper/src/json_ctrl.rs | 5 +- safekeeper/src/pull_timeline.rs | 7 +- safekeeper/src/receive_wal.rs | 21 +-- safekeeper/src/recovery.rs | 8 +- safekeeper/src/safekeeper.rs | 19 +- safekeeper/src/send_wal.rs | 89 +-------- safekeeper/src/state.rs | 7 +- safekeeper/src/timeline.rs | 55 ++---- safekeeper/src/timeline_manager.rs | 4 +- safekeeper/src/timelines_global_map.rs | 2 +- safekeeper/src/wal_backup.rs | 3 +- safekeeper/src/wal_backup_partial.rs | 2 +- safekeeper/src/wal_reader_stream.rs | 2 +- safekeeper/src/wal_service.rs | 3 +- .../tests/walproposer_sim/safekeeper.rs | 3 +- 24 files changed, 264 insertions(+), 245 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e2d5e03613..c4f80f63c9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5565,7 +5565,10 @@ name = "safekeeper_api" version = "0.1.0" dependencies = [ "const_format", + "postgres_ffi", + "pq_proto", "serde", + "tokio", "utils", ] diff --git a/libs/safekeeper_api/Cargo.toml b/libs/safekeeper_api/Cargo.toml index 14811232d3..4234ec6779 100644 --- a/libs/safekeeper_api/Cargo.toml +++ b/libs/safekeeper_api/Cargo.toml @@ -5,6 +5,9 @@ edition.workspace = true license.workspace = true [dependencies] -serde.workspace = true const_format.workspace = true +serde.workspace = true +postgres_ffi.workspace = true +pq_proto.workspace = true +tokio.workspace = true utils.workspace = true diff --git a/libs/safekeeper_api/src/lib.rs 
b/libs/safekeeper_api/src/lib.rs index 63c2c51188..be6923aca9 100644 --- a/libs/safekeeper_api/src/lib.rs +++ b/libs/safekeeper_api/src/lib.rs @@ -1,10 +1,27 @@ #![deny(unsafe_code)] #![deny(clippy::undocumented_unsafe_blocks)] use const_format::formatcp; +use pq_proto::SystemId; +use serde::{Deserialize, Serialize}; /// Public API types pub mod models; +/// Consensus logical timestamp. Note: it is a part of sk control file. +pub type Term = u64; +pub const INVALID_TERM: Term = 0; + +/// Information about Postgres. Safekeeper gets it once and then verifies all +/// further connections from computes match. Note: it is a part of sk control +/// file. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct ServerInfo { + /// Postgres server version + pub pg_version: u32, + pub system_id: SystemId, + pub wal_seg_size: u32, +} + pub const DEFAULT_PG_LISTEN_PORT: u16 = 5454; pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}"); diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs index 28666d197a..3e424a792c 100644 --- a/libs/safekeeper_api/src/models.rs +++ b/libs/safekeeper_api/src/models.rs @@ -1,10 +1,23 @@ +//! Types used in safekeeper http API. Many of them are also reused internally. + +use postgres_ffi::TimestampTz; use serde::{Deserialize, Serialize}; +use std::net::SocketAddr; +use tokio::time::Instant; use utils::{ - id::{NodeId, TenantId, TimelineId}, + id::{NodeId, TenantId, TenantTimelineId, TimelineId}, lsn::Lsn, + pageserver_feedback::PageserverFeedback, }; +use crate::{ServerInfo, Term}; + +#[derive(Debug, Serialize)] +pub struct SafekeeperStatus { + pub id: NodeId, +} + #[derive(Serialize, Deserialize)] pub struct TimelineCreateRequest { pub tenant_id: TenantId, @@ -18,6 +31,161 @@ pub struct TimelineCreateRequest { pub local_start_lsn: Option, } +/// Same as TermLsn, but serializes LSN using display serializer +/// in Postgres format, i.e. 0/FFFFFFFF. Used only for the API response. +#[derive(Debug, Clone, Copy, Serialize, Deserialize)] +pub struct TermSwitchApiEntry { + pub term: Term, + pub lsn: Lsn, +} + +/// Augment AcceptorState with last_log_term for convenience +#[derive(Debug, Serialize, Deserialize)] +pub struct AcceptorStateStatus { + pub term: Term, + pub epoch: Term, // aka last_log_term, old `epoch` name is left for compatibility + pub term_history: Vec, +} + +/// Things safekeeper should know about timeline state on peers. +/// Used as both model and internally. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct PeerInfo { + pub sk_id: NodeId, + pub term: Term, + /// Term of the last entry. + pub last_log_term: Term, + /// LSN of the last record. + pub flush_lsn: Lsn, + pub commit_lsn: Lsn, + /// Since which LSN safekeeper has WAL. + pub local_start_lsn: Lsn, + /// When info was received. Serde annotations are not very useful but make + /// the code compile -- we don't rely on this field externally. 
+ #[serde(skip)] + #[serde(default = "Instant::now")] + pub ts: Instant, + pub pg_connstr: String, + pub http_connstr: String, +} + +pub type FullTransactionId = u64; + +/// Hot standby feedback received from replica +#[derive(Debug, Clone, Copy, Serialize, Deserialize)] +pub struct HotStandbyFeedback { + pub ts: TimestampTz, + pub xmin: FullTransactionId, + pub catalog_xmin: FullTransactionId, +} + +pub const INVALID_FULL_TRANSACTION_ID: FullTransactionId = 0; + +impl HotStandbyFeedback { + pub fn empty() -> HotStandbyFeedback { + HotStandbyFeedback { + ts: 0, + xmin: 0, + catalog_xmin: 0, + } + } +} + +/// Standby status update +#[derive(Debug, Clone, Copy, Serialize, Deserialize)] +pub struct StandbyReply { + pub write_lsn: Lsn, // The location of the last WAL byte + 1 received and written to disk in the standby. + pub flush_lsn: Lsn, // The location of the last WAL byte + 1 flushed to disk in the standby. + pub apply_lsn: Lsn, // The location of the last WAL byte + 1 applied in the standby. + pub reply_ts: TimestampTz, // The client's system clock at the time of transmission, as microseconds since midnight on 2000-01-01. + pub reply_requested: bool, +} + +impl StandbyReply { + pub fn empty() -> Self { + StandbyReply { + write_lsn: Lsn::INVALID, + flush_lsn: Lsn::INVALID, + apply_lsn: Lsn::INVALID, + reply_ts: 0, + reply_requested: false, + } + } +} + +#[derive(Debug, Clone, Copy, Serialize, Deserialize)] +pub struct StandbyFeedback { + pub reply: StandbyReply, + pub hs_feedback: HotStandbyFeedback, +} + +impl StandbyFeedback { + pub fn empty() -> Self { + StandbyFeedback { + reply: StandbyReply::empty(), + hs_feedback: HotStandbyFeedback::empty(), + } + } +} + +/// Receiver is either pageserver or regular standby, which have different +/// feedbacks. +/// Used as both model and internally. +#[derive(Debug, Clone, Copy, Serialize, Deserialize)] +pub enum ReplicationFeedback { + Pageserver(PageserverFeedback), + Standby(StandbyFeedback), +} + +/// Uniquely identifies a WAL service connection. Logged in spans for +/// observability. +pub type ConnectionId = u32; + +/// Serialize is used only for json'ing in API response. Also used internally. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct WalSenderState { + pub ttid: TenantTimelineId, + pub addr: SocketAddr, + pub conn_id: ConnectionId, + // postgres application_name + pub appname: Option, + pub feedback: ReplicationFeedback, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct WalReceiverState { + /// None means it is recovery initiated by us (this safekeeper). + pub conn_id: Option, + pub status: WalReceiverStatus, +} + +/// Walreceiver status. Currently only whether it passed voting stage and +/// started receiving the stream, but it is easy to add more if needed. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub enum WalReceiverStatus { + Voting, + Streaming, +} + +/// Info about timeline on safekeeper ready for reporting. 
+#[derive(Debug, Serialize, Deserialize)] +pub struct TimelineStatus { + pub tenant_id: TenantId, + pub timeline_id: TimelineId, + pub acceptor_state: AcceptorStateStatus, + pub pg_info: ServerInfo, + pub flush_lsn: Lsn, + pub timeline_start_lsn: Lsn, + pub local_start_lsn: Lsn, + pub commit_lsn: Lsn, + pub backup_lsn: Lsn, + pub peer_horizon_lsn: Lsn, + pub remote_consistent_lsn: Lsn, + pub peers: Vec, + pub walsenders: Vec, + pub walreceivers: Vec, +} + fn lsn_invalid() -> Lsn { Lsn::INVALID } diff --git a/safekeeper/src/control_file_upgrade.rs b/safekeeper/src/control_file_upgrade.rs index a4b4670e42..dd152fd4cc 100644 --- a/safekeeper/src/control_file_upgrade.rs +++ b/safekeeper/src/control_file_upgrade.rs @@ -1,11 +1,12 @@ //! Code to deal with safekeeper control file upgrades use crate::{ - safekeeper::{AcceptorState, PgUuid, ServerInfo, Term, TermHistory, TermLsn}, + safekeeper::{AcceptorState, PgUuid, TermHistory, TermLsn}, state::{EvictionState, PersistedPeers, TimelinePersistentState}, wal_backup_partial, }; use anyhow::{bail, Result}; use pq_proto::SystemId; +use safekeeper_api::{ServerInfo, Term}; use serde::{Deserialize, Serialize}; use tracing::*; use utils::{ diff --git a/safekeeper/src/debug_dump.rs b/safekeeper/src/debug_dump.rs index 93011eddec..19362a0992 100644 --- a/safekeeper/src/debug_dump.rs +++ b/safekeeper/src/debug_dump.rs @@ -14,6 +14,7 @@ use camino::Utf8PathBuf; use chrono::{DateTime, Utc}; use postgres_ffi::XLogSegNo; use postgres_ffi::MAX_SEND_SIZE; +use safekeeper_api::models::WalSenderState; use serde::Deserialize; use serde::Serialize; @@ -25,7 +26,6 @@ use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; use crate::safekeeper::TermHistory; -use crate::send_wal::WalSenderState; use crate::state::TimelineMemState; use crate::state::TimelinePersistentState; use crate::timeline::get_timeline_dir; diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index 2ca6333ba8..bb639bfb32 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -4,6 +4,8 @@ use anyhow::Context; use pageserver_api::models::ShardParameters; use pageserver_api::shard::{ShardIdentity, ShardStripeSize}; +use safekeeper_api::models::ConnectionId; +use safekeeper_api::Term; use std::future::Future; use std::str::{self, FromStr}; use std::sync::Arc; @@ -16,9 +18,7 @@ use crate::auth::check_permission; use crate::json_ctrl::{handle_json_ctrl, AppendLogicalMessage}; use crate::metrics::{TrafficMetrics, PG_QUERIES_GAUGE}; -use crate::safekeeper::Term; use crate::timeline::TimelineError; -use crate::wal_service::ConnectionId; use crate::{GlobalTimelines, SafeKeeperConf}; use postgres_backend::PostgresBackend; use postgres_backend::QueryError; diff --git a/safekeeper/src/http/client.rs b/safekeeper/src/http/client.rs index a166fc1ab9..669a9c0ce9 100644 --- a/safekeeper/src/http/client.rs +++ b/safekeeper/src/http/client.rs @@ -8,6 +8,7 @@ //! etc. 
use reqwest::{IntoUrl, Method, StatusCode}; +use safekeeper_api::models::TimelineStatus; use std::error::Error as _; use utils::{ http::error::HttpErrorBody, @@ -15,8 +16,6 @@ use utils::{ logging::SecretString, }; -use super::routes::TimelineStatus; - #[derive(Debug, Clone)] pub struct Client { mgmt_api_endpoint: String, diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 71c36f1d46..9bc1bf3409 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -1,5 +1,9 @@ use hyper::{Body, Request, Response, StatusCode}; -use serde::{Deserialize, Serialize}; +use safekeeper_api::models::AcceptorStateStatus; +use safekeeper_api::models::SafekeeperStatus; +use safekeeper_api::models::TermSwitchApiEntry; +use safekeeper_api::models::TimelineStatus; +use safekeeper_api::ServerInfo; use std::collections::HashMap; use std::fmt; use std::io::Write as _; @@ -31,26 +35,17 @@ use utils::{ request::{ensure_no_body, parse_request_param}, RequestExt, RouterBuilder, }, - id::{NodeId, TenantId, TenantTimelineId, TimelineId}, + id::{TenantId, TenantTimelineId, TimelineId}, lsn::Lsn, }; use crate::debug_dump::TimelineDigestRequest; -use crate::receive_wal::WalReceiverState; -use crate::safekeeper::Term; -use crate::safekeeper::{ServerInfo, TermLsn}; -use crate::send_wal::WalSenderState; -use crate::timeline::PeerInfo; +use crate::safekeeper::TermLsn; use crate::timelines_global_map::TimelineDeleteForceResult; use crate::GlobalTimelines; use crate::SafeKeeperConf; use crate::{copy_timeline, debug_dump, patch_control_file, pull_timeline}; -#[derive(Debug, Serialize)] -struct SafekeeperStatus { - id: NodeId, -} - /// Healthcheck handler. async fn status_handler(request: Request) -> Result, ApiError> { check_permission(&request, None)?; @@ -73,50 +68,6 @@ fn get_global_timelines(request: &Request) -> Arc { .clone() } -/// Same as TermLsn, but serializes LSN using display serializer -/// in Postgres format, i.e. 0/FFFFFFFF. Used only for the API response. -#[derive(Debug, Clone, Copy, Serialize, Deserialize)] -pub struct TermSwitchApiEntry { - pub term: Term, - pub lsn: Lsn, -} - -impl From for TermLsn { - fn from(api_val: TermSwitchApiEntry) -> Self { - TermLsn { - term: api_val.term, - lsn: api_val.lsn, - } - } -} - -/// Augment AcceptorState with last_log_term for convenience -#[derive(Debug, Serialize, Deserialize)] -pub struct AcceptorStateStatus { - pub term: Term, - pub epoch: Term, // aka last_log_term - pub term_history: Vec, -} - -/// Info about timeline on safekeeper ready for reporting. -#[derive(Debug, Serialize, Deserialize)] -pub struct TimelineStatus { - pub tenant_id: TenantId, - pub timeline_id: TimelineId, - pub acceptor_state: AcceptorStateStatus, - pub pg_info: ServerInfo, - pub flush_lsn: Lsn, - pub timeline_start_lsn: Lsn, - pub local_start_lsn: Lsn, - pub commit_lsn: Lsn, - pub backup_lsn: Lsn, - pub peer_horizon_lsn: Lsn, - pub remote_consistent_lsn: Lsn, - pub peers: Vec, - pub walsenders: Vec, - pub walreceivers: Vec, -} - fn check_permission(request: &Request, tenant_id: Option) -> Result<(), ApiError> { check_permission_with(request, |claims| { crate::auth::check_permission(claims, tenant_id) @@ -187,6 +138,15 @@ async fn timeline_list_handler(request: Request) -> Result, json_response(StatusCode::OK, res) } +impl From for TermLsn { + fn from(api_val: TermSwitchApiEntry) -> Self { + TermLsn { + term: api_val.term, + lsn: api_val.lsn, + } + } +} + /// Report info about timeline. 
async fn timeline_status_handler(request: Request) -> Result, ApiError> { let ttid = TenantTimelineId::new( diff --git a/safekeeper/src/json_ctrl.rs b/safekeeper/src/json_ctrl.rs index dc4ad3706e..256e350ceb 100644 --- a/safekeeper/src/json_ctrl.rs +++ b/safekeeper/src/json_ctrl.rs @@ -8,16 +8,17 @@ use anyhow::Context; use postgres_backend::QueryError; +use safekeeper_api::{ServerInfo, Term}; use serde::{Deserialize, Serialize}; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::*; use crate::handler::SafekeeperPostgresHandler; -use crate::safekeeper::{AcceptorProposerMessage, AppendResponse, ServerInfo}; +use crate::safekeeper::{AcceptorProposerMessage, AppendResponse}; use crate::safekeeper::{ AppendRequest, AppendRequestHeader, ProposerAcceptorMessage, ProposerElected, }; -use crate::safekeeper::{Term, TermHistory, TermLsn}; +use crate::safekeeper::{TermHistory, TermLsn}; use crate::state::TimelinePersistentState; use crate::timeline::WalResidentTimeline; use postgres_backend::PostgresBackend; diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index f58a9dca1d..00777273cb 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -4,6 +4,7 @@ use camino::Utf8PathBuf; use chrono::{DateTime, Utc}; use futures::{SinkExt, StreamExt, TryStreamExt}; use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI}; +use safekeeper_api::{models::TimelineStatus, Term}; use serde::{Deserialize, Serialize}; use std::{ cmp::min, @@ -21,11 +22,7 @@ use tracing::{error, info, instrument}; use crate::{ control_file::CONTROL_FILE_NAME, debug_dump, - http::{ - client::{self, Client}, - routes::TimelineStatus, - }, - safekeeper::Term, + http::client::{self, Client}, state::{EvictionState, TimelinePersistentState}, timeline::{Timeline, WalResidentTimeline}, timelines_global_map::{create_temp_timeline_dir, validate_temp_timeline}, diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index 2a49890d61..08371177cd 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -9,9 +9,7 @@ use crate::metrics::{ }; use crate::safekeeper::AcceptorProposerMessage; use crate::safekeeper::ProposerAcceptorMessage; -use crate::safekeeper::ServerInfo; use crate::timeline::WalResidentTimeline; -use crate::wal_service::ConnectionId; use crate::GlobalTimelines; use anyhow::{anyhow, Context}; use bytes::BytesMut; @@ -23,8 +21,8 @@ use postgres_backend::PostgresBackend; use postgres_backend::PostgresBackendReader; use postgres_backend::QueryError; use pq_proto::BeMessage; -use serde::Deserialize; -use serde::Serialize; +use safekeeper_api::models::{ConnectionId, WalReceiverState, WalReceiverStatus}; +use safekeeper_api::ServerInfo; use std::future; use std::net::SocketAddr; use std::sync::Arc; @@ -171,21 +169,6 @@ impl WalReceiversShared { } } -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct WalReceiverState { - /// None means it is recovery initiated by us (this safekeeper). - pub conn_id: Option, - pub status: WalReceiverStatus, -} - -/// Walreceiver status. Currently only whether it passed voting stage and -/// started receiving the stream, but it is easy to add more if needed. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub enum WalReceiverStatus { - Voting, - Streaming, -} - /// Scope guard to access slot in WalReceivers registry and unregister from /// it in Drop. 
pub struct WalReceiverGuard { diff --git a/safekeeper/src/recovery.rs b/safekeeper/src/recovery.rs index 7b87166aa0..61647c16b0 100644 --- a/safekeeper/src/recovery.rs +++ b/safekeeper/src/recovery.rs @@ -7,6 +7,8 @@ use std::{fmt, pin::pin}; use anyhow::{bail, Context}; use futures::StreamExt; use postgres_protocol::message::backend::ReplicationMessage; +use safekeeper_api::models::{PeerInfo, TimelineStatus}; +use safekeeper_api::Term; use tokio::sync::mpsc::{channel, Receiver, Sender}; use tokio::time::timeout; use tokio::{ @@ -24,13 +26,11 @@ use crate::receive_wal::{WalAcceptor, REPLY_QUEUE_SIZE}; use crate::safekeeper::{AppendRequest, AppendRequestHeader}; use crate::timeline::WalResidentTimeline; use crate::{ - http::routes::TimelineStatus, receive_wal::MSG_QUEUE_SIZE, safekeeper::{ - AcceptorProposerMessage, ProposerAcceptorMessage, ProposerElected, Term, TermHistory, - TermLsn, VoteRequest, + AcceptorProposerMessage, ProposerAcceptorMessage, ProposerElected, TermHistory, TermLsn, + VoteRequest, }, - timeline::PeerInfo, SafeKeeperConf, }; diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 6eb69f0b7c..ccd7940c72 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -5,6 +5,9 @@ use byteorder::{LittleEndian, ReadBytesExt}; use bytes::{Buf, BufMut, Bytes, BytesMut}; use postgres_ffi::{TimeLineID, MAX_SEND_SIZE}; +use safekeeper_api::models::HotStandbyFeedback; +use safekeeper_api::Term; +use safekeeper_api::INVALID_TERM; use serde::{Deserialize, Serialize}; use std::cmp::max; use std::cmp::min; @@ -16,7 +19,6 @@ use tracing::*; use crate::control_file; use crate::metrics::MISC_OPERATION_SECONDS; -use crate::send_wal::HotStandbyFeedback; use crate::state::TimelineState; use crate::wal_storage; @@ -31,10 +33,6 @@ use utils::{ const SK_PROTOCOL_VERSION: u32 = 2; pub const UNKNOWN_SERVER_VERSION: u32 = 0; -/// Consensus logical timestamp. -pub type Term = u64; -pub const INVALID_TERM: Term = 0; - #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)] pub struct TermLsn { pub term: Term, @@ -198,16 +196,6 @@ impl AcceptorState { } } -/// Information about Postgres. Safekeeper gets it once and then verifies -/// all further connections from computes match. -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub struct ServerInfo { - /// Postgres server version - pub pg_version: u32, - pub system_id: SystemId, - pub wal_seg_size: u32, -} - #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] pub struct PersistedPeerInfo { /// LSN up to which safekeeper offloaded WAL to s3. 
@@ -1041,6 +1029,7 @@ where mod tests { use futures::future::BoxFuture; use postgres_ffi::{XLogSegNo, WAL_SEGMENT_SIZE}; + use safekeeper_api::ServerInfo; use super::*; use crate::state::{EvictionState, PersistedPeers, TimelinePersistentState}; diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 0887cf7264..8463221998 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -4,11 +4,10 @@ use crate::handler::SafekeeperPostgresHandler; use crate::metrics::RECEIVED_PS_FEEDBACKS; use crate::receive_wal::WalReceivers; -use crate::safekeeper::{Term, TermLsn}; +use crate::safekeeper::TermLsn; use crate::send_interpreted_wal::InterpretedWalSender; use crate::timeline::WalResidentTimeline; use crate::wal_reader_stream::WalReaderStreamBuilder; -use crate::wal_service::ConnectionId; use crate::wal_storage::WalReader; use anyhow::{bail, Context as AnyhowContext}; use bytes::Bytes; @@ -19,7 +18,11 @@ use postgres_backend::{CopyStreamHandlerEnd, PostgresBackendReader, QueryError}; use postgres_ffi::get_current_timestamp; use postgres_ffi::{TimestampTz, MAX_SEND_SIZE}; use pq_proto::{BeMessage, WalSndKeepAlive, XLogDataBody}; -use serde::{Deserialize, Serialize}; +use safekeeper_api::models::{ + ConnectionId, HotStandbyFeedback, ReplicationFeedback, StandbyFeedback, StandbyReply, + WalSenderState, INVALID_FULL_TRANSACTION_ID, +}; +use safekeeper_api::Term; use tokio::io::{AsyncRead, AsyncWrite}; use utils::failpoint_support; use utils::id::TenantTimelineId; @@ -28,7 +31,6 @@ use utils::postgres_client::PostgresClientProtocol; use std::cmp::{max, min}; use std::net::SocketAddr; -use std::str; use std::sync::Arc; use std::time::Duration; use tokio::sync::watch::Receiver; @@ -42,65 +44,6 @@ const STANDBY_STATUS_UPDATE_TAG_BYTE: u8 = b'r'; // neon extension of replication protocol const NEON_STATUS_UPDATE_TAG_BYTE: u8 = b'z'; -type FullTransactionId = u64; - -/// Hot standby feedback received from replica -#[derive(Debug, Clone, Copy, Serialize, Deserialize)] -pub struct HotStandbyFeedback { - pub ts: TimestampTz, - pub xmin: FullTransactionId, - pub catalog_xmin: FullTransactionId, -} - -const INVALID_FULL_TRANSACTION_ID: FullTransactionId = 0; - -impl HotStandbyFeedback { - pub fn empty() -> HotStandbyFeedback { - HotStandbyFeedback { - ts: 0, - xmin: 0, - catalog_xmin: 0, - } - } -} - -/// Standby status update -#[derive(Debug, Clone, Copy, Serialize, Deserialize)] -pub struct StandbyReply { - pub write_lsn: Lsn, // The location of the last WAL byte + 1 received and written to disk in the standby. - pub flush_lsn: Lsn, // The location of the last WAL byte + 1 flushed to disk in the standby. - pub apply_lsn: Lsn, // The location of the last WAL byte + 1 applied in the standby. - pub reply_ts: TimestampTz, // The client's system clock at the time of transmission, as microseconds since midnight on 2000-01-01. - pub reply_requested: bool, -} - -impl StandbyReply { - fn empty() -> Self { - StandbyReply { - write_lsn: Lsn::INVALID, - flush_lsn: Lsn::INVALID, - apply_lsn: Lsn::INVALID, - reply_ts: 0, - reply_requested: false, - } - } -} - -#[derive(Debug, Clone, Copy, Serialize, Deserialize)] -pub struct StandbyFeedback { - pub reply: StandbyReply, - pub hs_feedback: HotStandbyFeedback, -} - -impl StandbyFeedback { - pub fn empty() -> Self { - StandbyFeedback { - reply: StandbyReply::empty(), - hs_feedback: HotStandbyFeedback::empty(), - } - } -} - /// WalSenders registry. Timeline holds it (wrapped in Arc). 
pub struct WalSenders { mutex: Mutex, @@ -341,25 +284,6 @@ impl WalSendersShared { } } -// Serialized is used only for pretty printing in json. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct WalSenderState { - ttid: TenantTimelineId, - addr: SocketAddr, - conn_id: ConnectionId, - // postgres application_name - appname: Option, - feedback: ReplicationFeedback, -} - -// Receiver is either pageserver or regular standby, which have different -// feedbacks. -#[derive(Debug, Clone, Copy, Serialize, Deserialize)] -enum ReplicationFeedback { - Pageserver(PageserverFeedback), - Standby(StandbyFeedback), -} - // id of the occupied slot in WalSenders to access it (and save in the // WalSenderGuard). We could give Arc directly to the slot, but there is not // much sense in that as values aggregation which is performed on each feedback @@ -888,6 +812,7 @@ impl ReplyReader { #[cfg(test)] mod tests { + use safekeeper_api::models::FullTransactionId; use utils::id::{TenantId, TimelineId}; use super::*; diff --git a/safekeeper/src/state.rs b/safekeeper/src/state.rs index 941b7e67d0..c6ae6c1d2b 100644 --- a/safekeeper/src/state.rs +++ b/safekeeper/src/state.rs @@ -5,7 +5,7 @@ use std::{cmp::max, ops::Deref}; use anyhow::{bail, Result}; use postgres_ffi::WAL_SEGMENT_SIZE; -use safekeeper_api::models::TimelineTermBumpResponse; +use safekeeper_api::{models::TimelineTermBumpResponse, ServerInfo, Term}; use serde::{Deserialize, Serialize}; use utils::{ id::{NodeId, TenantId, TenantTimelineId, TimelineId}, @@ -14,10 +14,7 @@ use utils::{ use crate::{ control_file, - safekeeper::{ - AcceptorState, PersistedPeerInfo, PgUuid, ServerInfo, Term, TermHistory, - UNKNOWN_SERVER_VERSION, - }, + safekeeper::{AcceptorState, PersistedPeerInfo, PgUuid, TermHistory, UNKNOWN_SERVER_VERSION}, timeline::TimelineError, wal_backup_partial::{self}, }; diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 94d6ef1061..36860a0da2 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -4,8 +4,8 @@ use anyhow::{anyhow, bail, Result}; use camino::{Utf8Path, Utf8PathBuf}; use remote_storage::RemotePath; -use safekeeper_api::models::TimelineTermBumpResponse; -use serde::{Deserialize, Serialize}; +use safekeeper_api::models::{PeerInfo, TimelineTermBumpResponse}; +use safekeeper_api::Term; use tokio::fs::{self}; use tokio_util::sync::CancellationToken; use utils::id::TenantId; @@ -31,9 +31,7 @@ use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; use crate::control_file; use crate::rate_limit::RateLimiter; use crate::receive_wal::WalReceivers; -use crate::safekeeper::{ - AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, Term, TermLsn, -}; +use crate::safekeeper::{AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, TermLsn}; use crate::send_wal::WalSenders; use crate::state::{EvictionState, TimelineMemState, TimelinePersistentState, TimelineState}; use crate::timeline_guard::ResidenceGuard; @@ -47,40 +45,17 @@ use crate::wal_storage::{Storage as wal_storage_iface, WalReader}; use crate::SafeKeeperConf; use crate::{debug_dump, timeline_manager, wal_storage}; -/// Things safekeeper should know about timeline state on peers. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct PeerInfo { - pub sk_id: NodeId, - pub term: Term, - /// Term of the last entry. - pub last_log_term: Term, - /// LSN of the last record. - pub flush_lsn: Lsn, - pub commit_lsn: Lsn, - /// Since which LSN safekeeper has WAL. 
- pub local_start_lsn: Lsn, - /// When info was received. Serde annotations are not very useful but make - /// the code compile -- we don't rely on this field externally. - #[serde(skip)] - #[serde(default = "Instant::now")] - ts: Instant, - pub pg_connstr: String, - pub http_connstr: String, -} - -impl PeerInfo { - fn from_sk_info(sk_info: &SafekeeperTimelineInfo, ts: Instant) -> PeerInfo { - PeerInfo { - sk_id: NodeId(sk_info.safekeeper_id), - term: sk_info.term, - last_log_term: sk_info.last_log_term, - flush_lsn: Lsn(sk_info.flush_lsn), - commit_lsn: Lsn(sk_info.commit_lsn), - local_start_lsn: Lsn(sk_info.local_start_lsn), - pg_connstr: sk_info.safekeeper_connstr.clone(), - http_connstr: sk_info.http_connstr.clone(), - ts, - } +fn peer_info_from_sk_info(sk_info: &SafekeeperTimelineInfo, ts: Instant) -> PeerInfo { + PeerInfo { + sk_id: NodeId(sk_info.safekeeper_id), + term: sk_info.term, + last_log_term: sk_info.last_log_term, + flush_lsn: Lsn(sk_info.flush_lsn), + commit_lsn: Lsn(sk_info.commit_lsn), + local_start_lsn: Lsn(sk_info.local_start_lsn), + pg_connstr: sk_info.safekeeper_connstr.clone(), + http_connstr: sk_info.http_connstr.clone(), + ts, } } @@ -697,7 +672,7 @@ impl Timeline { { let mut shared_state = self.write_shared_state().await; shared_state.sk.record_safekeeper_info(&sk_info).await?; - let peer_info = PeerInfo::from_sk_info(&sk_info, Instant::now()); + let peer_info = peer_info_from_sk_info(&sk_info, Instant::now()); shared_state.peers_info.upsert(&peer_info); } Ok(()) diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs index c02fb904cf..a33994dcab 100644 --- a/safekeeper/src/timeline_manager.rs +++ b/safekeeper/src/timeline_manager.rs @@ -14,6 +14,7 @@ use std::{ use futures::channel::oneshot; use postgres_ffi::XLogSegNo; +use safekeeper_api::{models::PeerInfo, Term}; use serde::{Deserialize, Serialize}; use tokio::{ task::{JoinError, JoinHandle}, @@ -32,10 +33,9 @@ use crate::{ rate_limit::{rand_duration, RateLimiter}, recovery::recovery_main, remove_wal::calc_horizon_lsn, - safekeeper::Term, send_wal::WalSenders, state::TimelineState, - timeline::{ManagerTimeline, PeerInfo, ReadGuardSharedState, StateSK, WalResidentTimeline}, + timeline::{ManagerTimeline, ReadGuardSharedState, StateSK, WalResidentTimeline}, timeline_guard::{AccessService, GuardId, ResidenceGuard}, timelines_set::{TimelineSetGuard, TimelinesSet}, wal_backup::{self, WalBackupTaskHandle}, diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs index e1241ceb9b..ad29c9f66c 100644 --- a/safekeeper/src/timelines_global_map.rs +++ b/safekeeper/src/timelines_global_map.rs @@ -4,7 +4,6 @@ use crate::defaults::DEFAULT_EVICTION_CONCURRENCY; use crate::rate_limit::RateLimiter; -use crate::safekeeper::ServerInfo; use crate::state::TimelinePersistentState; use crate::timeline::{get_tenant_dir, get_timeline_dir, Timeline, TimelineError}; use crate::timelines_set::TimelinesSet; @@ -13,6 +12,7 @@ use crate::{control_file, wal_storage, SafeKeeperConf}; use anyhow::{bail, Context, Result}; use camino::Utf8PathBuf; use camino_tempfile::Utf8TempDir; +use safekeeper_api::ServerInfo; use serde::Serialize; use std::collections::HashMap; use std::str::FromStr; diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 34b5dbeaa1..8517fa0344 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -3,6 +3,7 @@ use anyhow::{Context, Result}; use camino::{Utf8Path, Utf8PathBuf}; use futures::stream::FuturesOrdered; 
use futures::StreamExt; +use safekeeper_api::models::PeerInfo; use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use utils::backoff; @@ -30,7 +31,7 @@ use tracing::*; use utils::{id::TenantTimelineId, lsn::Lsn}; use crate::metrics::{BACKED_UP_SEGMENTS, BACKUP_ERRORS, WAL_BACKUP_TASKS}; -use crate::timeline::{PeerInfo, WalResidentTimeline}; +use crate::timeline::WalResidentTimeline; use crate::timeline_manager::{Manager, StateSnapshot}; use crate::{SafeKeeperConf, WAL_BACKUP_RUNTIME}; diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs index bddfca50e4..4e5b34a9bf 100644 --- a/safekeeper/src/wal_backup_partial.rs +++ b/safekeeper/src/wal_backup_partial.rs @@ -22,6 +22,7 @@ use camino::Utf8PathBuf; use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI}; use remote_storage::RemotePath; +use safekeeper_api::Term; use serde::{Deserialize, Serialize}; use tokio_util::sync::CancellationToken; @@ -31,7 +32,6 @@ use utils::{id::NodeId, lsn::Lsn}; use crate::{ metrics::{MISC_OPERATION_SECONDS, PARTIAL_BACKUP_UPLOADED_BYTES, PARTIAL_BACKUP_UPLOADS}, rate_limit::{rand_duration, RateLimiter}, - safekeeper::Term, timeline::WalResidentTimeline, timeline_manager::StateSnapshot, wal_backup::{self}, diff --git a/safekeeper/src/wal_reader_stream.rs b/safekeeper/src/wal_reader_stream.rs index f8c0c502cd..aea628c208 100644 --- a/safekeeper/src/wal_reader_stream.rs +++ b/safekeeper/src/wal_reader_stream.rs @@ -4,12 +4,12 @@ use async_stream::try_stream; use bytes::Bytes; use futures::Stream; use postgres_backend::CopyStreamHandlerEnd; +use safekeeper_api::Term; use std::time::Duration; use tokio::time::timeout; use utils::lsn::Lsn; use crate::{ - safekeeper::Term, send_wal::{EndWatch, WalSenderGuard}, timeline::WalResidentTimeline, }; diff --git a/safekeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs index 1ff83918a7..1ebcb060e7 100644 --- a/safekeeper/src/wal_service.rs +++ b/safekeeper/src/wal_service.rs @@ -4,6 +4,7 @@ //! use anyhow::{Context, Result}; use postgres_backend::QueryError; +use safekeeper_api::models::ConnectionId; use std::sync::Arc; use std::time::Duration; use tokio::net::TcpStream; @@ -114,8 +115,6 @@ async fn handle_socket( .await } -/// Unique WAL service connection ids are logged in spans for observability. 
-pub type ConnectionId = u32; pub type ConnectionCount = u32; pub fn issue_connection_id(count: &mut ConnectionCount) -> ConnectionId { diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs index 12aa025771..efcdd89e7d 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper.rs @@ -15,12 +15,13 @@ use desim::{ }; use http::Uri; use safekeeper::{ - safekeeper::{ProposerAcceptorMessage, SafeKeeper, ServerInfo, UNKNOWN_SERVER_VERSION}, + safekeeper::{ProposerAcceptorMessage, SafeKeeper, UNKNOWN_SERVER_VERSION}, state::{TimelinePersistentState, TimelineState}, timeline::TimelineError, wal_storage::Storage, SafeKeeperConf, }; +use safekeeper_api::ServerInfo; use tracing::{debug, info_span, warn}; use utils::{ id::{NodeId, TenantId, TenantTimelineId, TimelineId}, From 2c910628288a29435f102471c6927d3cd24266ba Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 13 Dec 2024 14:52:54 +0000 Subject: [PATCH 036/583] test_prefetch: reduce timeout to default 5m from 10m (#10105) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem `test_prefetch` is flaky (https://github.com/neondatabase/neon/issues/9961), but if it passes, the run time is less than 30 seconds — we don't need an extended timeout for it. ## Summary of changes - Remove extended test timeout for `test_prefetch` --- test_runner/regress/test_prefetch_buffer_resize.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test_runner/regress/test_prefetch_buffer_resize.py b/test_runner/regress/test_prefetch_buffer_resize.py index 7676b78b0e..99fe80e621 100644 --- a/test_runner/regress/test_prefetch_buffer_resize.py +++ b/test_runner/regress/test_prefetch_buffer_resize.py @@ -7,7 +7,6 @@ from fixtures.neon_fixtures import NeonEnvBuilder @pytest.mark.parametrize("shard_count", [None, 4]) -@pytest.mark.timeout(600) def test_prefetch(neon_env_builder: NeonEnvBuilder, shard_count: int | None): if shard_count is not None: neon_env_builder.num_pageservers = shard_count From fcff7528517b47f79a55334c22dc6dc89c113be1 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 13 Dec 2024 17:28:21 +0100 Subject: [PATCH 037/583] fix(test_timeline_archival_chaos): flakiness caused by orphan layers (#10083) The test was failing with the scary but generic message `Remote storage metadata corrupted`. The underlying scrubber error is `Orphan layer detected: ...`. The test kills pageserver at random points, hence it's expected that we leak layers if we're killed in the window after layer upload but before it's referenced from index part. Refer to generation numbers RFC for details. 
Refs: - fixes https://github.com/neondatabase/neon/issues/9988 - root-cause analysis https://github.com/neondatabase/neon/issues/9988#issuecomment-2520673167 --- test_runner/regress/test_timeline_archive.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index e808dd1396..addf702893 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -435,6 +435,14 @@ def test_timeline_archival_chaos(neon_env_builder: NeonEnvBuilder): ] ) + env.storage_scrubber.allowed_errors.extend( + [ + # Unclcean shutdowns of pageserver can legitimately result in orphan layers + # (https://github.com/neondatabase/neon/issues/9988#issuecomment-2520558211) + f".*Orphan layer detected: tenants/{tenant_id}/.*" + ] + ) + class TimelineState: def __init__(self): self.timeline_id = TimelineId.generate() From eeabecd89f89a24fa8ee642c916efd97b4e8fa06 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 13 Dec 2024 19:40:26 +0200 Subject: [PATCH 038/583] Correctly update LFC used_pages in case of LFC resize (#10128) ## Problem LFC used_pages statistic is not updated in case of LFC resize (shrinking `neon.file_cache_size_limit`) ## Summary of changes Update `lfc_ctl->used_pages` in `lfc_change_limit_hook` Co-authored-by: Konstantin Knizhnik --- pgxn/neon/file_cache.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 70b250d394..f49415be68 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -365,6 +365,10 @@ lfc_change_limit_hook(int newval, void *extra) neon_log(LOG, "Failed to punch hole in file: %m"); #endif /* We remove the old entry, and re-enter a hole to the hash table */ + for (int i = 0; i < BLOCKS_PER_CHUNK; i++) + { + lfc_ctl->used_pages -= (victim->bitmap[i >> 5] >> (i & 31)) & 1; + } hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL); memset(&holetag, 0, sizeof(holetag)); From 07d1db54b3f0b9c113aa28fcdb85bacf15979cd9 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Fri, 13 Dec 2024 12:10:42 -0600 Subject: [PATCH 039/583] Improve comments and log messages in the logical replication monitor (#9974) Improved comments will help others when they read the code, and the log messages will help others understand why the logical replication monitor works the way it does. Signed-off-by: Tristan Partin --- pgxn/neon/logical_replication_monitor.c | 53 ++++++++++++++----------- 1 file changed, 30 insertions(+), 23 deletions(-) diff --git a/pgxn/neon/logical_replication_monitor.c b/pgxn/neon/logical_replication_monitor.c index 5eee5a1679..b94faafdfa 100644 --- a/pgxn/neon/logical_replication_monitor.c +++ b/pgxn/neon/logical_replication_monitor.c @@ -131,8 +131,8 @@ get_snapshots_cutoff_lsn(void) { cutoff = snapshot_descriptors[logical_replication_max_snap_files - 1].lsn; elog(LOG, - "ls_monitor: dropping logical slots with restart_lsn lower %X/%X, found %zu snapshot files, limit is %d", - LSN_FORMAT_ARGS(cutoff), snapshot_index, logical_replication_max_snap_files); + "ls_monitor: number of snapshot files, %zu, is larger than limit of %d", + snapshot_index, logical_replication_max_snap_files); } /* Is the size of the logical snapshots directory larger than specified? 
@@ -162,8 +162,8 @@ get_snapshots_cutoff_lsn(void) } if (cutoff != original) - elog(LOG, "ls_monitor: dropping logical slots with restart_lsn lower than %X/%X, " SNAPDIR " is larger than %d KB", - LSN_FORMAT_ARGS(cutoff), logical_replication_max_logicalsnapdir_size); + elog(LOG, "ls_monitor: " SNAPDIR " is larger than %d KB", + logical_replication_max_logicalsnapdir_size); } pfree(snapshot_descriptors); @@ -214,9 +214,13 @@ InitLogicalReplicationMonitor(void) } /* - * Unused logical replication slots pins WAL and prevents deletion of snapshots. + * Unused logical replication slots pins WAL and prevent deletion of snapshots. * WAL bloat is guarded by max_slot_wal_keep_size; this bgw removes slots which - * need too many .snap files. + * need too many .snap files. These files are stored as AUX files, which are a + * pageserver mechanism for storing non-relation data. AUX files are shipped in + * in the basebackup which is requested by compute_ctl before Postgres starts. + * The larger the time to retrieve the basebackup, the more likely it is the + * compute will be killed by the control plane due to a timeout. */ void LogicalSlotsMonitorMain(Datum main_arg) @@ -239,10 +243,7 @@ LogicalSlotsMonitorMain(Datum main_arg) ProcessConfigFile(PGC_SIGHUP); } - /* - * If there are too many .snap files, just drop all logical slots to - * prevent aux files bloat. - */ + /* Get the cutoff LSN */ cutoff_lsn = get_snapshots_cutoff_lsn(); if (cutoff_lsn > 0) { @@ -252,31 +253,37 @@ LogicalSlotsMonitorMain(Datum main_arg) ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i]; XLogRecPtr restart_lsn; - /* find the name */ LWLockAcquire(ReplicationSlotControlLock, LW_SHARED); - /* Consider only logical repliction slots */ + + /* Consider only active logical repliction slots */ if (!s->in_use || !SlotIsLogical(s)) { LWLockRelease(ReplicationSlotControlLock); continue; } - /* do we need to drop it? */ + /* + * Retrieve the restart LSN to determine if we need to drop the + * slot + */ SpinLockAcquire(&s->mutex); restart_lsn = s->data.restart_lsn; SpinLockRelease(&s->mutex); + + strlcpy(slot_name, s->data.name.data, sizeof(slot_name)); + LWLockRelease(ReplicationSlotControlLock); + if (restart_lsn >= cutoff_lsn) { - LWLockRelease(ReplicationSlotControlLock); + elog(LOG, "ls_monitor: not dropping replication slot %s because restart LSN %X/%X is greater than cutoff LSN %X/%X", + slot_name, LSN_FORMAT_ARGS(restart_lsn), LSN_FORMAT_ARGS(cutoff_lsn)); continue; } - strlcpy(slot_name, s->data.name.data, NAMEDATALEN); - elog(LOG, "ls_monitor: dropping slot %s with restart_lsn %X/%X below horizon %X/%X", + elog(LOG, "ls_monitor: dropping replication slot %s because restart LSN %X/%X lower than cutoff LSN %X/%X", slot_name, LSN_FORMAT_ARGS(restart_lsn), LSN_FORMAT_ARGS(cutoff_lsn)); - LWLockRelease(ReplicationSlotControlLock); - /* now try to drop it, killing owner before if any */ + /* now try to drop it, killing owner before, if any */ for (;;) { pid_t active_pid; @@ -288,9 +295,9 @@ LogicalSlotsMonitorMain(Datum main_arg) if (active_pid == 0) { /* - * Slot is releasted, try to drop it. Though of course + * Slot is released, try to drop it. Though of course, * it could have been reacquired, so drop can ERROR - * out. Similarly it could have been dropped in the + * out. Similarly, it could have been dropped in the * meanwhile. 
* * In principle we could remove pg_try/pg_catch, that @@ -300,14 +307,14 @@ LogicalSlotsMonitorMain(Datum main_arg) PG_TRY(); { ReplicationSlotDrop(slot_name, true); - elog(LOG, "ls_monitor: slot %s dropped", slot_name); + elog(LOG, "ls_monitor: replication slot %s dropped", slot_name); } PG_CATCH(); { /* log ERROR and reset elog stack */ EmitErrorReport(); FlushErrorState(); - elog(LOG, "ls_monitor: failed to drop slot %s", slot_name); + elog(LOG, "ls_monitor: failed to drop replication slot %s", slot_name); } PG_END_TRY(); break; @@ -315,7 +322,7 @@ LogicalSlotsMonitorMain(Datum main_arg) else { /* kill the owner and wait for release */ - elog(LOG, "ls_monitor: killing slot %s owner %d", slot_name, active_pid); + elog(LOG, "ls_monitor: killing replication slot %s owner %d", slot_name, active_pid); (void) kill(active_pid, SIGTERM); /* We shouldn't get stuck, but to be safe add timeout. */ ConditionVariableTimedSleep(&s->active_cv, 1000, WAIT_EVENT_REPLICATION_SLOT_DROP); From 7ee5dca752c9e9b7e65752c2561798bdd91ea3f6 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Fri, 13 Dec 2024 13:22:25 -0500 Subject: [PATCH 040/583] fix(pageserver): race between gc-compaction and repartition (#10127) ## Problem close https://github.com/neondatabase/neon/issues/10124 gc-compaction split_gc_jobs is holding the repartition lock for too long time. ## Summary of changes * Ensure split_gc_compaction_jobs drops the repartition lock once it finishes cloning the structures. * Update comments. --------- Signed-off-by: Alex Chi Z --- pageserver/src/tenant/timeline.rs | 5 ++++- pageserver/src/tenant/timeline/compaction.rs | 8 +++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index b5c7079226..75e268a1b9 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4064,8 +4064,11 @@ impl Timeline { // NB: there are two callers, one is the compaction task, of which there is only one per struct Tenant and hence Timeline. // The other is the initdb optimization in flush_frozen_layer, used by `boostrap_timeline`, which runs before `.activate()` // and hence before the compaction task starts. + // Note that there are a third "caller" that will take the `partitioning` lock. It is `gc_compaction_split_jobs` for + // gc-compaction where it uses the repartition data to determine the split jobs. In the future, it might use its own + // heuristics, but for now, we should allow concurrent access to it and let the caller retry compaction. return Err(CompactionError::Other(anyhow!( - "repartition() called concurrently, this should not happen" + "repartition() called concurrently, this is rare and a retry should be fine" ))); }; let ((dense_partition, sparse_partition), partition_lsn) = &*partitioning_guard; diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 701247194b..5e6290729c 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -1821,10 +1821,12 @@ impl Timeline { let mut compact_jobs = Vec::new(); // For now, we simply use the key partitioning information; we should do a more fine-grained partitioning // by estimating the amount of files read for a compaction job. We should also partition on LSN. 
- let Ok(partition) = self.partitioning.try_lock() else { - bail!("failed to acquire partition lock"); + let ((dense_ks, sparse_ks), _) = { + let Ok(partition) = self.partitioning.try_lock() else { + bail!("failed to acquire partition lock"); + }; + partition.clone() }; - let ((dense_ks, sparse_ks), _) = &*partition; // Truncate the key range to be within user specified compaction range. fn truncate_to( source_start: &Key, From d56fea680ec2f2a741a3383c63a55eadf86ad602 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 13 Dec 2024 19:56:32 +0000 Subject: [PATCH 041/583] CI: always require aws-oicd-role-arn input to be set (#10145) ## Problem `benchmarking` job fails because `aws-oicd-role-arn` input is not set ## Summary of changes: - Set `aws-oicd-role-arn` for `benchmarking job - Always require `aws-oicd-role-arn` to be set - Rename `aws_oicd_role_arn` to `aws-oicd-role-arn` for consistency --- .../actions/allure-report-generate/action.yml | 14 +++--- .../actions/allure-report-store/action.yml | 14 +++--- .github/actions/download/action.yml | 12 ++--- .../actions/run-python-test-set/action.yml | 25 +++++----- .github/actions/save-coverage-data/action.yml | 4 +- .github/actions/upload/action.yml | 4 +- .../workflows/_benchmarking_preparation.yml | 2 +- .github/workflows/_build-and-test-locally.yml | 4 +- .github/workflows/benchmarking.yml | 48 +++++++++---------- .github/workflows/build_and_test.yml | 13 +++-- .github/workflows/cloud-regress.yml | 5 +- .github/workflows/ingest_benchmark.yml | 16 +++---- .github/workflows/periodic_pagebench.yml | 2 +- .github/workflows/pg-clients.yml | 10 ++-- 14 files changed, 88 insertions(+), 85 deletions(-) diff --git a/.github/actions/allure-report-generate/action.yml b/.github/actions/allure-report-generate/action.yml index d6219c31b4..d07e3e32e8 100644 --- a/.github/actions/allure-report-generate/action.yml +++ b/.github/actions/allure-report-generate/action.yml @@ -7,10 +7,9 @@ inputs: type: boolean required: false default: false - aws_oicd_role_arn: - description: 'the OIDC role arn to (re-)acquire for allure report upload - if not set call must acquire OIDC role' - required: false - default: '' + aws-oicd-role-arn: + description: 'OIDC role arn to interract with S3' + required: true outputs: base-url: @@ -84,12 +83,11 @@ runs: ALLURE_VERSION: 2.27.0 ALLURE_ZIP_SHA256: b071858fb2fa542c65d8f152c5c40d26267b2dfb74df1f1608a589ecca38e777 - - name: (Re-)configure AWS credentials # necessary to upload reports to S3 after a long-running test - if: ${{ !cancelled() && (inputs.aws_oicd_role_arn != '') }} - uses: aws-actions/configure-aws-credentials@v4 + - uses: aws-actions/configure-aws-credentials@v4 + if: ${{ !cancelled() }} with: aws-region: eu-central-1 - role-to-assume: ${{ inputs.aws_oicd_role_arn }} + role-to-assume: ${{ inputs.aws-oicd-role-arn }} role-duration-seconds: 3600 # 1 hour should be more than enough to upload report # Potentially we could have several running build for the same key (for example, for the main branch), so we use improvised lock for this diff --git a/.github/actions/allure-report-store/action.yml b/.github/actions/allure-report-store/action.yml index 3c83656c89..8548a886cf 100644 --- a/.github/actions/allure-report-store/action.yml +++ b/.github/actions/allure-report-store/action.yml @@ -8,10 +8,9 @@ inputs: unique-key: description: 'string to distinguish different results in the same run' required: true - aws_oicd_role_arn: - description: 'the OIDC role arn to (re-)acquire for allure report upload - if not set 
call must acquire OIDC role' - required: false - default: '' + aws-oicd-role-arn: + description: 'OIDC role arn to interract with S3' + required: true runs: using: "composite" @@ -36,12 +35,11 @@ runs: env: REPORT_DIR: ${{ inputs.report-dir }} - - name: (Re-)configure AWS credentials # necessary to upload reports to S3 after a long-running test - if: ${{ !cancelled() && (inputs.aws_oicd_role_arn != '') }} - uses: aws-actions/configure-aws-credentials@v4 + - uses: aws-actions/configure-aws-credentials@v4 + if: ${{ !cancelled() }} with: aws-region: eu-central-1 - role-to-assume: ${{ inputs.aws_oicd_role_arn }} + role-to-assume: ${{ inputs.aws-oicd-role-arn }} role-duration-seconds: 3600 # 1 hour should be more than enough to upload report - name: Upload test results diff --git a/.github/actions/download/action.yml b/.github/actions/download/action.yml index d6b1fac9f7..14b2ef8eac 100644 --- a/.github/actions/download/action.yml +++ b/.github/actions/download/action.yml @@ -15,19 +15,17 @@ inputs: prefix: description: "S3 prefix. Default is '${GITHUB_RUN_ID}/${GITHUB_RUN_ATTEMPT}'" required: false - aws_oicd_role_arn: - description: "the OIDC role arn for aws auth" - required: false - default: "" + aws-oicd-role-arn: + description: 'OIDC role arn to interract with S3' + required: true runs: using: "composite" steps: - - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@v4 + - uses: aws-actions/configure-aws-credentials@v4 with: aws-region: eu-central-1 - role-to-assume: ${{ inputs.aws_oicd_role_arn }} + role-to-assume: ${{ inputs.aws-oicd-role-arn }} role-duration-seconds: 3600 - name: Download artifact diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index dd5c890f5b..9a0261d430 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -48,10 +48,9 @@ inputs: description: 'benchmark durations JSON' required: false default: '{}' - aws_oicd_role_arn: - description: 'the OIDC role arn to (re-)acquire for allure report upload - if not set call must acquire OIDC role' - required: false - default: '' + aws-oicd-role-arn: + description: 'OIDC role arn to interract with S3' + required: true runs: using: "composite" @@ -62,7 +61,7 @@ runs: with: name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}-artifact path: /tmp/neon - aws_oicd_role_arn: ${{ inputs.aws_oicd_role_arn }} + aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }} - name: Download Neon binaries for the previous release if: inputs.build_type != 'remote' @@ -71,7 +70,7 @@ runs: name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}-artifact path: /tmp/neon-previous prefix: latest - aws_oicd_role_arn: ${{ inputs.aws_oicd_role_arn }} + aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }} - name: Download compatibility snapshot if: inputs.build_type != 'remote' @@ -83,7 +82,7 @@ runs: # The lack of compatibility snapshot (for example, for the new Postgres version) # shouldn't fail the whole job. Only relevant test should fail. 
skip-if-does-not-exist: true - aws_oicd_role_arn: ${{ inputs.aws_oicd_role_arn }} + aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }} - name: Checkout if: inputs.needs_postgres_source == 'true' @@ -221,19 +220,19 @@ runs: # The lack of compatibility snapshot shouldn't fail the job # (for example if we didn't run the test for non build-and-test workflow) skip-if-does-not-exist: true - aws_oicd_role_arn: ${{ inputs.aws_oicd_role_arn }} + aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }} - - name: (Re-)configure AWS credentials # necessary to upload reports to S3 after a long-running test - if: ${{ !cancelled() && (inputs.aws_oicd_role_arn != '') }} - uses: aws-actions/configure-aws-credentials@v4 + - uses: aws-actions/configure-aws-credentials@v4 + if: ${{ !cancelled() }} with: aws-region: eu-central-1 - role-to-assume: ${{ inputs.aws_oicd_role_arn }} + role-to-assume: ${{ inputs.aws-oicd-role-arn }} role-duration-seconds: 3600 # 1 hour should be more than enough to upload report + - name: Upload test results if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-store with: report-dir: /tmp/test_output/allure/results unique-key: ${{ inputs.build_type }}-${{ inputs.pg_version }} - aws_oicd_role_arn: ${{ inputs.aws_oicd_role_arn }} + aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }} diff --git a/.github/actions/save-coverage-data/action.yml b/.github/actions/save-coverage-data/action.yml index 9e3a7cba24..1bbea5400f 100644 --- a/.github/actions/save-coverage-data/action.yml +++ b/.github/actions/save-coverage-data/action.yml @@ -14,11 +14,11 @@ runs: name: coverage-data-artifact path: /tmp/coverage skip-if-does-not-exist: true # skip if there's no previous coverage to download - aws_oicd_role_arn: ${{ inputs.aws_oicd_role_arn }} + aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }} - name: Upload coverage data uses: ./.github/actions/upload with: name: coverage-data-artifact path: /tmp/coverage - aws_oicd_role_arn: ${{ inputs.aws_oicd_role_arn }} + aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }} diff --git a/.github/actions/upload/action.yml b/.github/actions/upload/action.yml index 6616d08899..ac5579ccea 100644 --- a/.github/actions/upload/action.yml +++ b/.github/actions/upload/action.yml @@ -14,7 +14,7 @@ inputs: prefix: description: "S3 prefix. 
Default is '${GITHUB_SHA}/${GITHUB_RUN_ID}/${GITHUB_RUN_ATTEMPT}'" required: false - aws_oicd_role_arn: + aws-oicd-role-arn: description: "the OIDC role arn for aws auth" required: false default: "" @@ -61,7 +61,7 @@ runs: uses: aws-actions/configure-aws-credentials@v4 with: aws-region: eu-central-1 - role-to-assume: ${{ inputs.aws_oicd_role_arn }} + role-to-assume: ${{ inputs.aws-oicd-role-arn }} role-duration-seconds: 3600 - name: Upload artifact diff --git a/.github/workflows/_benchmarking_preparation.yml b/.github/workflows/_benchmarking_preparation.yml index 371d815fc8..fd328586b3 100644 --- a/.github/workflows/_benchmarking_preparation.yml +++ b/.github/workflows/_benchmarking_preparation.yml @@ -70,7 +70,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} # we create a table that has one row for each database that we want to restore with the status whether the restore is done - name: Create benchmark_restore_status table if it does not exist diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 456399f3c3..4263bacce8 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -264,7 +264,7 @@ jobs: with: name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-artifact path: /tmp/neon - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} # XXX: keep this after the binaries.list is formed, so the coverage can properly work later - name: Merge and upload coverage data @@ -308,7 +308,7 @@ jobs: real_s3_region: eu-central-1 rerun_failed: true pg_version: ${{ matrix.pg_version }} - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} CHECK_ONDISK_DATA_COMPATIBILITY: nonempty diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 2d37be8837..bbdcf5ef49 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -105,7 +105,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create Neon Project id: create-neon-project @@ -123,7 +123,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} pg_version: ${{ env.DEFAULT_PG_VERSION }} - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} # Set --sparse-ordering option of pytest-order plugin # to ensure tests are running in order of appears in the file. 
# It's important for test_perf_pgbench.py::test_pgbench_remote_* tests @@ -153,7 +153,7 @@ jobs: if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate with: - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} @@ -205,7 +205,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Run Logical Replication benchmarks uses: ./.github/actions/run-python-test-set @@ -216,7 +216,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 5400 pg_version: ${{ env.DEFAULT_PG_VERSION }} - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -233,7 +233,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 5400 pg_version: ${{ env.DEFAULT_PG_VERSION }} - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -245,7 +245,7 @@ jobs: uses: ./.github/actions/allure-report-generate with: store-test-results-into-db: true - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} @@ -407,7 +407,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create Neon Project if: contains(fromJson('["neonvm-captest-new", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform) @@ -455,7 +455,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_init pg_version: ${{ env.DEFAULT_PG_VERSION }} - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -470,7 +470,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_simple_update pg_version: ${{ env.DEFAULT_PG_VERSION }} - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -485,7 +485,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_select_only pg_version: ${{ env.DEFAULT_PG_VERSION }} - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -503,7 +503,7 @@ jobs: if: ${{ 
!cancelled() }} uses: ./.github/actions/allure-report-generate with: - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} @@ -614,7 +614,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgvector_indexing pg_version: ${{ env.DEFAULT_PG_VERSION }} - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -629,7 +629,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 pg_version: ${{ env.DEFAULT_PG_VERSION }} - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -640,7 +640,7 @@ jobs: if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate with: - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} @@ -711,7 +711,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Set up Connection String id: set-up-connstr @@ -743,7 +743,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 43200 -k test_clickbench pg_version: ${{ env.DEFAULT_PG_VERSION }} - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -757,7 +757,7 @@ jobs: if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate with: - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} @@ -822,7 +822,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Get Connstring Secret Name run: | @@ -861,7 +861,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_tpch pg_version: ${{ env.DEFAULT_PG_VERSION }} - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -873,7 +873,7 @@ jobs: if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate with: - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} @@ -931,7 +931,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + 
aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Set up Connection String id: set-up-connstr @@ -963,7 +963,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_user_examples pg_version: ${{ env.DEFAULT_PG_VERSION }} - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -974,7 +974,7 @@ jobs: if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate with: - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index b3556debe3..55c4bf08b9 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -336,6 +336,7 @@ jobs: extra_params: --splits 5 --group ${{ matrix.pytest_split_group }} benchmark_durations: ${{ needs.get-benchmarks-durations.outputs.json }} pg_version: v16 + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -393,7 +394,7 @@ jobs: uses: ./.github/actions/allure-report-generate with: store-test-results-into-db: true - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} @@ -455,14 +456,14 @@ jobs: with: name: neon-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-artifact path: /tmp/neon - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Get coverage artifact uses: ./.github/actions/download with: name: coverage-data-artifact path: /tmp/coverage - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Merge coverage data run: scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage merge @@ -1279,6 +1280,12 @@ jobs: echo "run-id=${run_id}" | tee -a ${GITHUB_OUTPUT} echo "commit-sha=${last_commit_sha}" | tee -a ${GITHUB_OUTPUT} + - uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 3600 + - name: Promote compatibility snapshot and Neon artifact env: BUCKET: neon-github-public-dev diff --git a/.github/workflows/cloud-regress.yml b/.github/workflows/cloud-regress.yml index 7b9e434ec3..55f42ea533 100644 --- a/.github/workflows/cloud-regress.yml +++ b/.github/workflows/cloud-regress.yml @@ -79,7 +79,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create a new branch id: create-branch @@ -95,6 +95,7 @@ jobs: test_selection: cloud_regress pg_version: ${{matrix.pg-version}} extra_params: -m remote_cluster + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{steps.create-branch.outputs.dsn}} @@ -111,7 +112,7 @@ jobs: if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate with: - aws_oicd_role_arn: ${{ 
vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} diff --git a/.github/workflows/ingest_benchmark.yml b/.github/workflows/ingest_benchmark.yml index 6773032263..fc33c0a980 100644 --- a/.github/workflows/ingest_benchmark.yml +++ b/.github/workflows/ingest_benchmark.yml @@ -13,7 +13,7 @@ on: # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) - cron: '0 9 * * *' # run once a day, timezone is utc workflow_dispatch: # adds ability to run this manually - + defaults: run: shell: bash -euxo pipefail {0} @@ -28,7 +28,7 @@ jobs: strategy: fail-fast: false # allow other variants to continue even if one fails matrix: - target_project: [new_empty_project, large_existing_project] + target_project: [new_empty_project, large_existing_project] permissions: contents: write statuses: write @@ -56,7 +56,7 @@ jobs: with: aws-region: eu-central-1 role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - role-duration-seconds: 18000 # 5 hours is currently max associated with IAM role + role-duration-seconds: 18000 # 5 hours is currently max associated with IAM role - name: Download Neon artifact uses: ./.github/actions/download @@ -64,7 +64,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create Neon Project if: ${{ matrix.target_project == 'new_empty_project' }} @@ -95,7 +95,7 @@ jobs: project_id: ${{ vars.BENCHMARK_INGEST_TARGET_PROJECTID }} api_key: ${{ secrets.NEON_STAGING_API_KEY }} - - name: Initialize Neon project + - name: Initialize Neon project if: ${{ matrix.target_project == 'large_existing_project' }} env: BENCHMARK_INGEST_TARGET_CONNSTR: ${{ steps.create-neon-branch-ingest-target.outputs.dsn }} @@ -123,7 +123,7 @@ jobs: ${PSQL} "${BENCHMARK_INGEST_TARGET_CONNSTR}" -c "CREATE EXTENSION IF NOT EXISTS neon; CREATE EXTENSION IF NOT EXISTS neon_utils;" echo "BENCHMARK_INGEST_TARGET_CONNSTR=${BENCHMARK_INGEST_TARGET_CONNSTR}" >> $GITHUB_ENV - - name: Invoke pgcopydb + - name: Invoke pgcopydb uses: ./.github/actions/run-python-test-set with: build_type: remote @@ -132,7 +132,7 @@ jobs: extra_params: -s -m remote_cluster --timeout 86400 -k test_ingest_performance_using_pgcopydb pg_version: v16 save_perf_report: true - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_INGEST_SOURCE_CONNSTR: ${{ secrets.BENCHMARK_INGEST_SOURCE_CONNSTR }} TARGET_PROJECT_TYPE: ${{ matrix.target_project }} @@ -144,7 +144,7 @@ jobs: run: | export LD_LIBRARY_PATH=${PG_16_LIB_PATH} ${PSQL} "${BENCHMARK_INGEST_TARGET_CONNSTR}" -c "\dt+" - + - name: Delete Neon Project if: ${{ always() && matrix.target_project == 'new_empty_project' }} uses: ./.github/actions/neon-project-delete diff --git a/.github/workflows/periodic_pagebench.yml b/.github/workflows/periodic_pagebench.yml index 049990f17b..af877029e4 100644 --- a/.github/workflows/periodic_pagebench.yml +++ b/.github/workflows/periodic_pagebench.yml @@ -137,7 +137,7 @@ jobs: if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate with: - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} diff --git a/.github/workflows/pg-clients.yml b/.github/workflows/pg-clients.yml index 
5c999d3810..4947907eb0 100644 --- a/.github/workflows/pg-clients.yml +++ b/.github/workflows/pg-clients.yml @@ -96,7 +96,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create Neon Project id: create-neon-project @@ -113,6 +113,7 @@ jobs: run_in_parallel: false extra_params: -m remote_cluster pg_version: ${{ env.DEFAULT_PG_VERSION }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }} @@ -129,7 +130,7 @@ jobs: uses: ./.github/actions/allure-report-generate with: store-test-results-into-db: true - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} @@ -163,7 +164,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create Neon Project id: create-neon-project @@ -180,6 +181,7 @@ jobs: run_in_parallel: false extra_params: -m remote_cluster pg_version: ${{ env.DEFAULT_PG_VERSION }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }} @@ -196,7 +198,7 @@ jobs: uses: ./.github/actions/allure-report-generate with: store-test-results-into-db: true - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} From 2521eba6741f000d71c1f6e0d2d0b279c8465f47 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 13 Dec 2024 22:46:41 +0200 Subject: [PATCH 042/583] Check for invalid down link while prefetching B-Tree leave pages for index-only scan (#9867) ## Problem See #9866 Index-only scan prefetch implementation doesn't take in account that down link may be invalid ## Summary of changes Check that downlink is valid block number Correspondent Postgres PRs: https://github.com/neondatabase/postgres/pull/534 https://github.com/neondatabase/postgres/pull/535 https://github.com/neondatabase/postgres/pull/536 https://github.com/neondatabase/postgres/pull/537 --------- Co-authored-by: Konstantin Knizhnik --- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/postgres-v17 | 2 +- vendor/revisions.json | 8 ++++---- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 13ff324150..c2f65b3201 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 13ff324150fceaac72920e01742addc053db9462 +Subproject commit c2f65b3201591e02ce45b66731392f98d3388e73 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 8736b10c1d..f262d631ad 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 8736b10c1d93d11b9c0489872dd529c4c0f5338f +Subproject commit f262d631ad477a1819e84a183e5a7ef561830085 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 81428621f7..97f9fde349 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 81428621f7c04aed03671cf80a928e0a36d92505 +Subproject commit 97f9fde349c6de6d573f5ce96db07eca60ce6185 diff --git a/vendor/postgres-v17 
b/vendor/postgres-v17 index 01fa3c4866..010c0ea2eb 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit 01fa3c48664ca030cfb69bb4a350aa9df4691d88 +Subproject commit 010c0ea2eb06afe76485a33c43954cbcf3d99f86 diff --git a/vendor/revisions.json b/vendor/revisions.json index 7329aa437f..afcb922a5e 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,18 +1,18 @@ { "v17": [ "17.2", - "01fa3c48664ca030cfb69bb4a350aa9df4691d88" + "010c0ea2eb06afe76485a33c43954cbcf3d99f86" ], "v16": [ "16.6", - "81428621f7c04aed03671cf80a928e0a36d92505" + "97f9fde349c6de6d573f5ce96db07eca60ce6185" ], "v15": [ "15.10", - "8736b10c1d93d11b9c0489872dd529c4c0f5338f" + "f262d631ad477a1819e84a183e5a7ef561830085" ], "v14": [ "14.15", - "13ff324150fceaac72920e01742addc053db9462" + "c2f65b3201591e02ce45b66731392f98d3388e73" ] } From cf161e1556229dae17b5b73e03c3b5781f7bf952 Mon Sep 17 00:00:00 2001 From: Mikhail Kot Date: Sat, 14 Dec 2024 17:37:13 +0000 Subject: [PATCH 043/583] fix(adapter): password not set in role drop (#10130) ## Problem When entry was dropped and password wasn't set, new entry had uninitialized memory in controlplane adapter Resolves: https://github.com/neondatabase/cloud/issues/14914 ## Summary of changes Initialize password in all cases, add tests. Minor formatting for less indentation --- pgxn/neon/control_plane_connector.c | 39 ++++++++++------------ test_runner/regress/test_ddl_forwarding.py | 29 ++++++++++++---- 2 files changed, 39 insertions(+), 29 deletions(-) diff --git a/pgxn/neon/control_plane_connector.c b/pgxn/neon/control_plane_connector.c index b47b22cd20..59096a1bc8 100644 --- a/pgxn/neon/control_plane_connector.c +++ b/pgxn/neon/control_plane_connector.c @@ -428,6 +428,8 @@ MergeTable() hash_seq_init(&status, old_table->role_table); while ((entry = hash_seq_search(&status)) != NULL) { + RoleEntry * old; + bool found_old = false; RoleEntry *to_write = hash_search( CurrentDdlTable->role_table, entry->name, @@ -435,30 +437,23 @@ MergeTable() NULL); to_write->type = entry->type; - if (entry->password) - to_write->password = entry->password; + to_write->password = entry->password; strlcpy(to_write->old_name, entry->old_name, NAMEDATALEN); - if (entry->old_name[0] != '\0') - { - bool found_old = false; - RoleEntry *old = hash_search( - CurrentDdlTable->role_table, - entry->old_name, - HASH_FIND, - &found_old); + if (entry->old_name[0] == '\0') + continue; - if (found_old) - { - if (old->old_name[0] != '\0') - strlcpy(to_write->old_name, old->old_name, NAMEDATALEN); - else - strlcpy(to_write->old_name, entry->old_name, NAMEDATALEN); - hash_search(CurrentDdlTable->role_table, - entry->old_name, - HASH_REMOVE, - NULL); - } - } + old = hash_search( + CurrentDdlTable->role_table, + entry->old_name, + HASH_FIND, + &found_old); + if (!found_old) + continue; + strlcpy(to_write->old_name, old->old_name, NAMEDATALEN); + hash_search(CurrentDdlTable->role_table, + entry->old_name, + HASH_REMOVE, + NULL); } hash_destroy(old_table->role_table); } diff --git a/test_runner/regress/test_ddl_forwarding.py b/test_runner/regress/test_ddl_forwarding.py index de44bbcbc8..b10e38885e 100644 --- a/test_runner/regress/test_ddl_forwarding.py +++ b/test_runner/regress/test_ddl_forwarding.py @@ -60,14 +60,12 @@ def ddl_forward_handler( if request.json is None: log.info("Received invalid JSON") return Response(status=400) - json = request.json + json: dict[str, list[str]] = request.json # Handle roles first - if "roles" in json: - for operation in json["roles"]: - 
handle_role(dbs, roles, operation) - if "dbs" in json: - for operation in json["dbs"]: - handle_db(dbs, roles, operation) + for operation in json.get("roles", []): + handle_role(dbs, roles, operation) + for operation in json.get("dbs", []): + handle_db(dbs, roles, operation) return Response(status=200) @@ -207,6 +205,23 @@ def test_ddl_forwarding(ddl: DdlForwardingContext): ddl.wait() assert ddl.roles == {} + cur.execute("CREATE ROLE bork WITH PASSWORD 'newyork'") + cur.execute("BEGIN") + cur.execute("SAVEPOINT point") + cur.execute("DROP ROLE bork") + cur.execute("COMMIT") + ddl.wait() + assert ddl.roles == {} + + cur.execute("CREATE ROLE bork WITH PASSWORD 'oldyork'") + cur.execute("BEGIN") + cur.execute("SAVEPOINT point") + cur.execute("ALTER ROLE bork PASSWORD NULL") + cur.execute("COMMIT") + cur.execute("DROP ROLE bork") + ddl.wait() + assert ddl.roles == {} + cur.execute("CREATE ROLE bork WITH PASSWORD 'dork'") cur.execute("CREATE DATABASE stork WITH OWNER=bork") cur.execute("ALTER ROLE bork RENAME TO cork") From f3ecd5d76ad8b858b2bfaaabba5018046aca46ac Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Sun, 15 Dec 2024 10:45:12 +0100 Subject: [PATCH 044/583] pageserver: revert flush backpressure (#8550) (#10135) ## Problem In #8550, we made the flush loop wait for uploads after every layer. This was to avoid unbounded buildup of uploads, and to reduce compaction debt. However, the approach has several problems: * It prevents upload parallelism. * It prevents flush and upload pipelining. * It slows down ingestion even when there is no need to backpressure. * It does not directly backpressure WAL ingestion (only via `disk_consistent_lsn`), and will build up in-memory layers. * It does not directly backpressure based on compaction debt and read amplification. An alternative solution to these problems is proposed in #8390. In the meanwhile, we revert the change to reduce the impact on ingest throughput. This does reintroduce some risk of unbounded upload/compaction buildup. Until https://github.com/neondatabase/neon/issues/8390, this can be addressed in other ways: * Use `max_replication_apply_lag` (aka `remote_consistent_lsn`), which will more directly limit upload debt. * Shard the tenant, which will spread the flush/upload work across more Pageservers and move the bottleneck to Safekeeper. Touches #10095. ## Summary of changes Remove waiting on the upload queue in the flush loop. 
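The effect of the removed wait is easiest to see as a pipelining question. The toy program below is not Neon code — the millisecond costs, layer count, and single uploader thread are made up — it only contrasts a flush loop that blocks on each upload against one that merely schedules uploads and lets a background worker drain the queue, which is the behaviour this patch restores.

```rust
use std::sync::mpsc;
use std::thread;
use std::time::{Duration, Instant};

const FLUSH_MS: u64 = 10; // hypothetical cost of writing one layer locally
const UPLOAD_MS: u64 = 30; // hypothetical cost of uploading one layer
const LAYERS: usize = 20;

fn main() {
    // Old behaviour: the flush loop blocks on upload completion after every layer.
    let start = Instant::now();
    for _ in 0..LAYERS {
        thread::sleep(Duration::from_millis(FLUSH_MS)); // flush one layer to disk
        thread::sleep(Duration::from_millis(UPLOAD_MS)); // wait for its upload
    }
    println!("serialized flush+upload: {:?}", start.elapsed());

    // Reverted behaviour: uploads are only scheduled; a background worker drains them.
    let start = Instant::now();
    let (tx, rx) = mpsc::channel::<usize>();
    let uploader = thread::spawn(move || {
        for _layer in rx {
            thread::sleep(Duration::from_millis(UPLOAD_MS)); // upload task
        }
    });
    for layer in 0..LAYERS {
        thread::sleep(Duration::from_millis(FLUSH_MS)); // flush one layer to disk
        tx.send(layer).unwrap(); // schedule the upload, do not wait for it
    }
    drop(tx);
    uploader.join().unwrap();
    println!("pipelined flush/upload:  {:?}", start.elapsed());
}
```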
--- pageserver/src/metrics.rs | 25 +---------- pageserver/src/tenant/timeline.rs | 38 ++++------------- test_runner/fixtures/metrics.py | 1 - test_runner/regress/test_branching.py | 13 ++---- test_runner/regress/test_remote_storage.py | 48 ---------------------- 5 files changed, 13 insertions(+), 112 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index b4e20cb8b9..bdbabf3f75 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -3,7 +3,7 @@ use metrics::{ register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec, register_int_counter, register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge, register_uint_gauge_vec, - Counter, CounterVec, Gauge, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair, + Counter, CounterVec, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair, IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec, }; use once_cell::sync::Lazy; @@ -445,15 +445,6 @@ pub(crate) static WAIT_LSN_TIME: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -static FLUSH_WAIT_UPLOAD_TIME: Lazy = Lazy::new(|| { - register_gauge_vec!( - "pageserver_flush_wait_upload_seconds", - "Time spent waiting for preceding uploads during layer flush", - &["tenant_id", "shard_id", "timeline_id"] - ) - .expect("failed to define a metric") -}); - static LAST_RECORD_LSN: Lazy = Lazy::new(|| { register_int_gauge_vec!( "pageserver_last_record_lsn", @@ -2586,7 +2577,6 @@ pub(crate) struct TimelineMetrics { shard_id: String, timeline_id: String, pub flush_time_histo: StorageTimeMetrics, - pub flush_wait_upload_time_gauge: Gauge, pub compact_time_histo: StorageTimeMetrics, pub create_images_time_histo: StorageTimeMetrics, pub logical_size_histo: StorageTimeMetrics, @@ -2632,9 +2622,6 @@ impl TimelineMetrics { &shard_id, &timeline_id, ); - let flush_wait_upload_time_gauge = FLUSH_WAIT_UPLOAD_TIME - .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) - .unwrap(); let compact_time_histo = StorageTimeMetrics::new( StorageTimeOperation::Compact, &tenant_id, @@ -2780,7 +2767,6 @@ impl TimelineMetrics { shard_id, timeline_id, flush_time_histo, - flush_wait_upload_time_gauge, compact_time_histo, create_images_time_histo, logical_size_histo, @@ -2830,14 +2816,6 @@ impl TimelineMetrics { self.resident_physical_size_gauge.get() } - pub(crate) fn flush_wait_upload_time_gauge_add(&self, duration: f64) { - self.flush_wait_upload_time_gauge.add(duration); - crate::metrics::FLUSH_WAIT_UPLOAD_TIME - .get_metric_with_label_values(&[&self.tenant_id, &self.shard_id, &self.timeline_id]) - .unwrap() - .add(duration); - } - pub(crate) fn shutdown(&self) { let was_shutdown = self .shutdown @@ -2855,7 +2833,6 @@ impl TimelineMetrics { let shard_id = &self.shard_id; let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = DISK_CONSISTENT_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]); - let _ = FLUSH_WAIT_UPLOAD_TIME.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = STANDBY_HORIZON.remove_label_values(&[tenant_id, shard_id, timeline_id]); { RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get()); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 75e268a1b9..0416953c1f 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -144,19 +144,15 @@ use 
self::layer_manager::LayerManager; use self::logical_size::LogicalSize; use self::walreceiver::{WalReceiver, WalReceiverConf}; +use super::config::TenantConf; +use super::remote_timeline_client::index::IndexPart; +use super::remote_timeline_client::RemoteTimelineClient; +use super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline}; +use super::storage_layer::{LayerFringe, LayerVisibilityHint, ReadableLayer}; +use super::upload_queue::NotInitialized; +use super::GcError; use super::{ - config::TenantConf, storage_layer::LayerVisibilityHint, upload_queue::NotInitialized, - MaybeOffloaded, -}; -use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf}; -use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe}; -use super::{ - remote_timeline_client::RemoteTimelineClient, remote_timeline_client::WaitCompletionError, - storage_layer::ReadableLayer, -}; -use super::{ - secondary::heatmap::{HeatMapLayer, HeatMapTimeline}, - GcError, + debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf, MaybeOffloaded, }; #[cfg(test)] @@ -3897,24 +3893,6 @@ impl Timeline { // release lock on 'layers' }; - // Backpressure mechanism: wait with continuation of the flush loop until we have uploaded all layer files. - // This makes us refuse ingest until the new layers have been persisted to the remote - let start = Instant::now(); - self.remote_client - .wait_completion() - .await - .map_err(|e| match e { - WaitCompletionError::UploadQueueShutDownOrStopped - | WaitCompletionError::NotInitialized( - NotInitialized::ShuttingDown | NotInitialized::Stopped, - ) => FlushLayerError::Cancelled, - WaitCompletionError::NotInitialized(NotInitialized::Uninitialized) => { - FlushLayerError::Other(anyhow!(e).into()) - } - })?; - let duration = start.elapsed().as_secs_f64(); - self.metrics.flush_wait_upload_time_gauge_add(duration); - // FIXME: between create_delta_layer and the scheduling of the upload in `update_metadata_file`, // a compaction can delete the file and then it won't be available for uploads any more. // We still schedule the upload, resulting in an error, but ideally we'd somehow avoid this diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index c5295360c3..eb3d06b949 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -170,7 +170,6 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] 
= ( "pageserver_evictions_with_low_residence_duration_total", "pageserver_aux_file_estimated_size", "pageserver_valid_lsn_lease_count", - "pageserver_flush_wait_upload_seconds", counter("pageserver_tenant_throttling_count_accounted_start"), counter("pageserver_tenant_throttling_count_accounted_finish"), counter("pageserver_tenant_throttling_wait_usecs_sum"), diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py index 34e4e994cb..a4056404f0 100644 --- a/test_runner/regress/test_branching.py +++ b/test_runner/regress/test_branching.py @@ -19,6 +19,7 @@ from fixtures.pageserver.utils import wait_until_tenant_active from fixtures.utils import query_scalar from performance.test_perf_pgbench import get_scales_matrix from requests import RequestException +from requests.exceptions import RetryError # Test branch creation @@ -176,11 +177,8 @@ def test_cannot_create_endpoint_on_non_uploaded_timeline(neon_env_builder: NeonE env.neon_cli.mappings_map_branch(initial_branch, env.initial_tenant, env.initial_timeline) - with pytest.raises(RuntimeError, match="ERROR: Not found: Timeline"): - env.endpoints.create_start( - initial_branch, tenant_id=env.initial_tenant, basebackup_request_tries=2 - ) - ps_http.configure_failpoints(("before-upload-index-pausable", "off")) + with pytest.raises(RuntimeError, match="is not active, state: Loading"): + env.endpoints.create_start(initial_branch, tenant_id=env.initial_tenant) finally: env.pageserver.stop(immediate=True) @@ -221,10 +219,7 @@ def test_cannot_branch_from_non_uploaded_branch(neon_env_builder: NeonEnvBuilder branch_id = TimelineId.generate() - with pytest.raises( - PageserverApiException, - match="Cannot branch off the timeline that's not present in pageserver", - ): + with pytest.raises(RetryError, match="too many 503 error responses"): ps_http.timeline_create( env.pg_version, env.initial_tenant, diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 76a42ef4a2..52b6b254aa 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -784,54 +784,6 @@ def test_empty_branch_remote_storage_upload_on_restart(neon_env_builder: NeonEnv create_thread.join() -def test_paused_upload_stalls_checkpoint( - neon_env_builder: NeonEnvBuilder, -): - """ - This test checks that checkpoints block on uploads to remote storage. - """ - neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) - - env = neon_env_builder.init_start( - initial_tenant_conf={ - # Set a small compaction threshold - "compaction_threshold": "3", - # Disable GC - "gc_period": "0s", - # disable PITR - "pitr_interval": "0s", - } - ) - - env.pageserver.allowed_errors.append( - f".*PUT.* path=/v1/tenant/{env.initial_tenant}/timeline.* request was dropped before completing" - ) - - tenant_id = env.initial_tenant - timeline_id = env.initial_timeline - - client = env.pageserver.http_client() - layers_at_creation = client.layer_map_info(tenant_id, timeline_id) - deltas_at_creation = len(layers_at_creation.delta_layers()) - assert ( - deltas_at_creation == 1 - ), "are you fixing #5863? make sure we end up with 2 deltas at the end of endpoint lifecycle" - - # Make new layer uploads get stuck. - # Note that timeline creation waits for the initial layers to reach remote storage. - # So at this point, the `layers_at_creation` are in remote storage. 
- client.configure_failpoints(("before-upload-layer-pausable", "pause")) - - with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: - # Build two tables with some data inside - endpoint.safe_psql("CREATE TABLE foo AS SELECT x FROM generate_series(1, 10000) g(x)") - wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) - - with pytest.raises(ReadTimeout): - client.timeline_checkpoint(tenant_id, timeline_id, timeout=5) - client.configure_failpoints(("before-upload-layer-pausable", "off")) - - def wait_upload_queue_empty( client: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId ): From 117c1b5ddec110677f06dbc769712a722b55f2d3 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 16 Dec 2024 08:03:53 +0200 Subject: [PATCH 045/583] Do not perform prefetch for temp relations (#10146) ## Problem See https://neondb.slack.com/archives/C04DGM6SMTM/p1734002916827019 With recent prefetch fixes for pg17 and `effective_io_concurrency=100` pg_regress test stats.sql is failed when set temp_buffers to 100. Stream API will try to lock all this 100 buffers for prefetch. ## Summary of changes Disable such behaviour for temp relations. Postgres PR: https://github.com/neondatabase/postgres/pull/548 Co-authored-by: Konstantin Knizhnik --- vendor/postgres-v17 | 2 +- vendor/revisions.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index 010c0ea2eb..65c4e46baf 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit 010c0ea2eb06afe76485a33c43954cbcf3d99f86 +Subproject commit 65c4e46baf56ec05412c7dd63d62faff0b33dcfb diff --git a/vendor/revisions.json b/vendor/revisions.json index afcb922a5e..c8db81c73f 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,7 +1,7 @@ { "v17": [ "17.2", - "010c0ea2eb06afe76485a33c43954cbcf3d99f86" + "65c4e46baf56ec05412c7dd63d62faff0b33dcfb" ], "v16": [ "16.6", From ebcbc1a4822c566a865ca2dd9138e5af2838c794 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 16 Dec 2024 10:06:08 +0000 Subject: [PATCH 046/583] pageserver: tighten up code around SLRU dir key handling (#10082) ## Problem Changes in #9786 were functionally complete but missed some edges that made testing less robust than it should have been: - `is_key_disposable` didn't consider SLRU dir keys disposable - Timeline `init_empty` was always creating SLRU dir keys on all shards The result was that when we had a bug (https://github.com/neondatabase/neon/pull/10080), it wasn't apparent in tests, because one would only encounter the issue if running on a long-lived timeline with enough compaction to drop the initially created empty SLRU dir keys, _and_ some CLog truncation going on. Closes: https://github.com/neondatabase/cloud/issues/21516 ## Summary of changes - Update is_key_global and init_empty to handle SLRU dir keys properly -- the only functional impact is that we avoid writing some spurious keys in shards >0, but this makes testing much more robust. - Make `test_clog_truncate` explicitly use a sharded tenant The net result is that if one reverts #10080, then tests fail (i.e. 
this PR is a reproducer for the issue) --- libs/pageserver_api/src/key.rs | 4 ++++ libs/pageserver_api/src/shard.rs | 6 ++++- pageserver/src/pgdatadir_mapping.rs | 29 +++++++++++++---------- test_runner/regress/test_clog_truncate.py | 24 ++++++++++++------- 4 files changed, 42 insertions(+), 21 deletions(-) diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 373329c9b4..f0cd713c38 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -565,6 +565,10 @@ impl Key { && self.field5 == 0 && self.field6 == u32::MAX } + + pub fn is_slru_dir_key(&self) -> bool { + slru_dir_kind(self).is_some() + } } #[inline(always)] diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index cf0cd3a46b..4cc0a739e8 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -173,7 +173,11 @@ impl ShardIdentity { /// Return true if the key should be stored on all shards, not just one. pub fn is_key_global(&self, key: &Key) -> bool { - if key.is_slru_block_key() || key.is_slru_segment_size_key() || key.is_aux_file_key() { + if key.is_slru_block_key() + || key.is_slru_segment_size_key() + || key.is_aux_file_key() + || key.is_slru_dir_key() + { // Special keys that are only stored on shard 0 false } else if key.is_rel_block_key() { diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 255bd01e25..3eaecd3a08 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1319,18 +1319,23 @@ impl<'a> DatadirModification<'a> { let buf: Bytes = SlruSegmentDirectory::ser(&SlruSegmentDirectory::default())?.into(); let empty_dir = Value::Image(buf); - self.put(slru_dir_to_key(SlruKind::Clog), empty_dir.clone()); - self.pending_directory_entries - .push((DirectoryKind::SlruSegment(SlruKind::Clog), 0)); - self.put( - slru_dir_to_key(SlruKind::MultiXactMembers), - empty_dir.clone(), - ); - self.pending_directory_entries - .push((DirectoryKind::SlruSegment(SlruKind::Clog), 0)); - self.put(slru_dir_to_key(SlruKind::MultiXactOffsets), empty_dir); - self.pending_directory_entries - .push((DirectoryKind::SlruSegment(SlruKind::MultiXactOffsets), 0)); + + // Initialize SLRUs on shard 0 only: creating these on other shards would be + // harmless but they'd just be dropped on later compaction. 
+ if self.tline.tenant_shard_id.is_shard_zero() { + self.put(slru_dir_to_key(SlruKind::Clog), empty_dir.clone()); + self.pending_directory_entries + .push((DirectoryKind::SlruSegment(SlruKind::Clog), 0)); + self.put( + slru_dir_to_key(SlruKind::MultiXactMembers), + empty_dir.clone(), + ); + self.pending_directory_entries + .push((DirectoryKind::SlruSegment(SlruKind::Clog), 0)); + self.put(slru_dir_to_key(SlruKind::MultiXactOffsets), empty_dir); + self.pending_directory_entries + .push((DirectoryKind::SlruSegment(SlruKind::MultiXactOffsets), 0)); + } Ok(()) } diff --git a/test_runner/regress/test_clog_truncate.py b/test_runner/regress/test_clog_truncate.py index 10027ce689..2ae38e6d88 100644 --- a/test_runner/regress/test_clog_truncate.py +++ b/test_runner/regress/test_clog_truncate.py @@ -1,18 +1,19 @@ from __future__ import annotations import os -import time from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv -from fixtures.utils import query_scalar +from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.utils import query_scalar, wait_until # # Test compute node start after clog truncation # -def test_clog_truncate(neon_simple_env: NeonEnv): - env = neon_simple_env +def test_clog_truncate(neon_env_builder: NeonEnvBuilder): + # Use a multi-sharded tenant because WAL ingest logic is shard-dependent, and + # this test is one of the very few that exercises a CLogTruncate WAL record. + env = neon_env_builder.init_start(initial_tenant_shard_count=2) # set aggressive autovacuum to make sure that truncation will happen config = [ @@ -31,6 +32,7 @@ def test_clog_truncate(neon_simple_env: NeonEnv): endpoint.safe_psql("CREATE EXTENSION neon_test_utils") # Consume many xids to advance clog + log.info("Consuming xids...") with endpoint.cursor() as cur: cur.execute("select test_consume_xids(1000*1000*10);") log.info("xids consumed") @@ -47,11 +49,17 @@ def test_clog_truncate(neon_simple_env: NeonEnv): pg_xact_0000_path = os.path.join(endpoint.pg_xact_dir_path(), "0000") log.info(f"pg_xact_0000_path = {pg_xact_0000_path}") - while os.path.isfile(pg_xact_0000_path): - log.info(f"file exists. wait for truncation: {pg_xact_0000_path=}") - time.sleep(5) + def assert_file_removed(): + exists = os.path.isfile(pg_xact_0000_path) + if exists: + log.info(f"file exists. wait for truncation: {pg_xact_0000_path=}") + assert not exists + + log.info("Waiting for truncation...") + wait_until(assert_file_removed) # checkpoint to advance latest lsn + log.info("Checkpointing...") with endpoint.cursor() as cur: cur.execute("CHECKPOINT;") lsn_after_truncation = query_scalar(cur, "select pg_current_wal_insert_lsn()") From 24d658791410d81d9b854ba1c8036068d6467bc2 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 16 Dec 2024 11:15:25 +0000 Subject: [PATCH 047/583] chore(proxy): refactor self-signed config (#10154) ## Problem While reviewing #10152 I found it tricky to actually determine whether the connection used `allow_self_signed_compute` or not. I've tried to remove this setting in the past: * https://github.com/neondatabase/neon/pull/7884 * https://github.com/neondatabase/neon/pull/7437 * https://github.com/neondatabase/cloud/pull/13702 But each time it seems it is used by e2e tests ## Summary of changes The `node_info.allow_self_signed_computes` is always initialised to false, and then sometimes inherits the proxy config value. There's no need this needs to be in the node_info, so removing it and propagating it via `TcpMechansim` is simpler. 
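To make the intent easier to follow without reading the full diff, here is a deliberately tiny, hypothetical model of the resulting shape — the real `NodeInfo` and `TcpMechanism` carry many more fields and an async `connect` — showing the flag living on the connect mechanism (built from proxy config) instead of on the cached node info:

```rust
// Hypothetical miniature of the new shape; none of these are the proxy's real types.
struct NodeInfo {
    host: String, // connection details only; no allow_self_signed_compute field any more
}

struct TcpMechanism {
    allow_self_signed_compute: bool, // copied straight from the proxy config
}

impl TcpMechanism {
    fn connect_once(&self, node: &NodeInfo) -> String {
        format!(
            "connect to {} (accept self-signed: {})",
            node.host, self.allow_self_signed_compute
        )
    }
}

fn main() {
    let node = NodeInfo { host: "compute-1.local".to_string() };
    let mechanism = TcpMechanism { allow_self_signed_compute: false };
    println!("{}", mechanism.connect_once(&node));
}
```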
--- proxy/src/auth/backend/console_redirect.rs | 1 - proxy/src/auth/backend/local.rs | 1 - proxy/src/console_redirect_proxy.rs | 2 +- proxy/src/control_plane/client/cplane_proxy_v1.rs | 1 - proxy/src/control_plane/client/mock.rs | 1 - proxy/src/control_plane/mod.rs | 13 +++---------- proxy/src/proxy/connect_compute.rs | 11 ++++++++--- proxy/src/proxy/mod.rs | 2 +- proxy/src/proxy/tests/mod.rs | 14 ++++++-------- proxy/src/redis/notifications.rs | 2 +- proxy/src/serverless/backend.rs | 2 -- 11 files changed, 20 insertions(+), 30 deletions(-) diff --git a/proxy/src/auth/backend/console_redirect.rs b/proxy/src/auth/backend/console_redirect.rs index 575d60be85..c3de77b352 100644 --- a/proxy/src/auth/backend/console_redirect.rs +++ b/proxy/src/auth/backend/console_redirect.rs @@ -187,7 +187,6 @@ async fn authenticate( NodeInfo { config, aux: db_info.aux, - allow_self_signed_compute: false, // caller may override }, db_info.allowed_ips, )) diff --git a/proxy/src/auth/backend/local.rs b/proxy/src/auth/backend/local.rs index d4273fb521..d10f0e82b2 100644 --- a/proxy/src/auth/backend/local.rs +++ b/proxy/src/auth/backend/local.rs @@ -37,7 +37,6 @@ impl LocalBackend { branch_id: BranchIdTag::get_interner().get_or_intern("local"), cold_start_info: ColdStartInfo::WarmCached, }, - allow_self_signed_compute: false, }, } } diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index 65702e0e4c..02398fb777 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -213,9 +213,9 @@ pub(crate) async fn handle_client( params_compat: true, params: ¶ms, locks: &config.connect_compute_locks, + allow_self_signed_compute: config.allow_self_signed_compute, }, &user_info, - config.allow_self_signed_compute, config.wake_compute_retry_config, config.connect_to_compute_retry_config, ) diff --git a/proxy/src/control_plane/client/cplane_proxy_v1.rs b/proxy/src/control_plane/client/cplane_proxy_v1.rs index e33a37f643..00038a6ac6 100644 --- a/proxy/src/control_plane/client/cplane_proxy_v1.rs +++ b/proxy/src/control_plane/client/cplane_proxy_v1.rs @@ -250,7 +250,6 @@ impl NeonControlPlaneClient { let node = NodeInfo { config, aux: body.aux, - allow_self_signed_compute: false, }; Ok(node) diff --git a/proxy/src/control_plane/client/mock.rs b/proxy/src/control_plane/client/mock.rs index eaf692ab27..93edd65476 100644 --- a/proxy/src/control_plane/client/mock.rs +++ b/proxy/src/control_plane/client/mock.rs @@ -174,7 +174,6 @@ impl MockControlPlane { branch_id: (&BranchId::from("branch")).into(), cold_start_info: crate::control_plane::messages::ColdStartInfo::Warm, }, - allow_self_signed_compute: false, }; Ok(node) diff --git a/proxy/src/control_plane/mod.rs b/proxy/src/control_plane/mod.rs index 41972e4e44..c0718920b4 100644 --- a/proxy/src/control_plane/mod.rs +++ b/proxy/src/control_plane/mod.rs @@ -67,28 +67,21 @@ pub(crate) struct NodeInfo { /// Labels for proxy's metrics. 
pub(crate) aux: MetricsAuxInfo, - - /// Whether we should accept self-signed certificates (for testing) - pub(crate) allow_self_signed_compute: bool, } impl NodeInfo { pub(crate) async fn connect( &self, ctx: &RequestContext, + allow_self_signed_compute: bool, timeout: Duration, ) -> Result { self.config - .connect( - ctx, - self.allow_self_signed_compute, - self.aux.clone(), - timeout, - ) + .connect(ctx, allow_self_signed_compute, self.aux.clone(), timeout) .await } + pub(crate) fn reuse_settings(&mut self, other: Self) { - self.allow_self_signed_compute = other.allow_self_signed_compute; self.config.reuse_password(other.config); } diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index a3027abd7c..6da4c90a53 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -73,6 +73,9 @@ pub(crate) struct TcpMechanism<'a> { /// connect_to_compute concurrency lock pub(crate) locks: &'static ApiLocks, + + /// Whether we should accept self-signed certificates (for testing) + pub(crate) allow_self_signed_compute: bool, } #[async_trait] @@ -90,7 +93,11 @@ impl ConnectMechanism for TcpMechanism<'_> { ) -> Result { let host = node_info.config.get_host(); let permit = self.locks.get_permit(&host).await?; - permit.release_result(node_info.connect(ctx, timeout).await) + permit.release_result( + node_info + .connect(ctx, self.allow_self_signed_compute, timeout) + .await, + ) } fn update_connect_config(&self, config: &mut compute::ConnCfg) { @@ -104,7 +111,6 @@ pub(crate) async fn connect_to_compute Result @@ -117,7 +123,6 @@ where wake_compute(&mut num_retries, ctx, user_info, wake_compute_retry_config).await?; node_info.set_keys(user_info.get_keys()); - node_info.allow_self_signed_compute = allow_self_signed_compute; mechanism.update_connect_config(&mut node_info.config); // try once diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index cc04bc5e5c..de0ec0f799 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -355,9 +355,9 @@ pub(crate) async fn handle_client( params_compat, params: ¶ms, locks: &config.connect_compute_locks, + allow_self_signed_compute: mode.allow_self_signed_compute(config), }, &user_info, - mode.allow_self_signed_compute(config), config.wake_compute_retry_config, config.connect_to_compute_retry_config, ) diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index 911b349416..3899ba4267 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -553,7 +553,6 @@ fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeIn branch_id: (&BranchId::from("branch")).into(), cold_start_info: crate::control_plane::messages::ColdStartInfo::Warm, }, - allow_self_signed_compute: false, }; let (_, node2) = cache.insert_unit("key".into(), Ok(node.clone())); node2.map(|()| node) @@ -588,7 +587,7 @@ async fn connect_to_compute_success() { max_retries: 5, backoff_factor: 2.0, }; - connect_to_compute(&ctx, &mechanism, &user_info, false, config, config) + connect_to_compute(&ctx, &mechanism, &user_info, config, config) .await .unwrap(); mechanism.verify(); @@ -606,7 +605,7 @@ async fn connect_to_compute_retry() { max_retries: 5, backoff_factor: 2.0, }; - connect_to_compute(&ctx, &mechanism, &user_info, false, config, config) + connect_to_compute(&ctx, &mechanism, &user_info, config, config) .await .unwrap(); mechanism.verify(); @@ -625,7 +624,7 @@ async fn connect_to_compute_non_retry_1() { max_retries: 5, backoff_factor: 2.0, }; - 
connect_to_compute(&ctx, &mechanism, &user_info, false, config, config) + connect_to_compute(&ctx, &mechanism, &user_info, config, config) .await .unwrap_err(); mechanism.verify(); @@ -644,7 +643,7 @@ async fn connect_to_compute_non_retry_2() { max_retries: 5, backoff_factor: 2.0, }; - connect_to_compute(&ctx, &mechanism, &user_info, false, config, config) + connect_to_compute(&ctx, &mechanism, &user_info, config, config) .await .unwrap(); mechanism.verify(); @@ -674,7 +673,6 @@ async fn connect_to_compute_non_retry_3() { &ctx, &mechanism, &user_info, - false, wake_compute_retry_config, connect_to_compute_retry_config, ) @@ -696,7 +694,7 @@ async fn wake_retry() { max_retries: 5, backoff_factor: 2.0, }; - connect_to_compute(&ctx, &mechanism, &user_info, false, config, config) + connect_to_compute(&ctx, &mechanism, &user_info, config, config) .await .unwrap(); mechanism.verify(); @@ -715,7 +713,7 @@ async fn wake_non_retry() { max_retries: 5, backoff_factor: 2.0, }; - connect_to_compute(&ctx, &mechanism, &user_info, false, config, config) + connect_to_compute(&ctx, &mechanism, &user_info, config, config) .await .unwrap_err(); mechanism.verify(); diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index f3aa97c032..d18dfd2465 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -6,6 +6,7 @@ use pq_proto::CancelKeyData; use redis::aio::PubSub; use serde::{Deserialize, Serialize}; use tokio_util::sync::CancellationToken; +use tracing::Instrument; use uuid::Uuid; use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider; @@ -13,7 +14,6 @@ use crate::cache::project_info::ProjectInfoCache; use crate::cancellation::{CancelMap, CancellationHandler}; use crate::intern::{ProjectIdInt, RoleNameInt}; use crate::metrics::{Metrics, RedisErrors, RedisEventsCount}; -use tracing::Instrument; const CPLANE_CHANNEL_NAME: &str = "neondb-proxy-ws-updates"; pub(crate) const PROXY_CHANNEL_NAME: &str = "neondb-proxy-to-proxy-updates"; diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 251aa47084..15d883bdb0 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -195,7 +195,6 @@ impl PoolingBackend { locks: &self.config.connect_compute_locks, }, &backend, - false, // do not allow self signed compute for http flow self.config.wake_compute_retry_config, self.config.connect_to_compute_retry_config, ) @@ -237,7 +236,6 @@ impl PoolingBackend { locks: &self.config.connect_compute_locks, }, &backend, - false, // do not allow self signed compute for http flow self.config.wake_compute_retry_config, self.config.connect_to_compute_retry_config, ) From 1ed0e52bc837fd8eb356847e452444b5df1f4b0b Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 16 Dec 2024 15:07:24 +0300 Subject: [PATCH 048/583] Extract safekeeper http client to separate crate. (#10140) ## Problem We want to use safekeeper http client in storage controller and neon_local. ## Summary of changes Extract it to separate crate. No functional changes. 
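For a future consumer (storage controller, `neon_local`), wiring in the new crate is expected to look roughly like the sketch below. Only the crate and module names and the two-argument `Client::new(url, token)` call shape are taken from this patch; the wrapper function and its parameter types are placeholders.

```rust
// Hypothetical consumer code; parameter types are assumptions, only the import
// path and the Client::new(url, token) call shape come from this patch.
use safekeeper_client::mgmt_api::Client;

fn safekeeper_http_client(base_url: String, auth_token: Option<String>) -> Client {
    // Same construction pattern as pull_timeline.rs uses after this change.
    Client::new(base_url, auth_token)
}
```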
--- Cargo.lock | 13 +++++++++++++ Cargo.toml | 2 ++ safekeeper/Cargo.toml | 1 + safekeeper/client/Cargo.toml | 13 +++++++++++++ safekeeper/client/src/lib.rs | 1 + .../{src/http/client.rs => client/src/mgmt_api.rs} | 4 ---- safekeeper/src/http/mod.rs | 1 - safekeeper/src/pull_timeline.rs | 5 +++-- 8 files changed, 33 insertions(+), 7 deletions(-) create mode 100644 safekeeper/client/Cargo.toml create mode 100644 safekeeper/client/src/lib.rs rename safekeeper/{src/http/client.rs => client/src/mgmt_api.rs} (96%) diff --git a/Cargo.lock b/Cargo.lock index c4f80f63c9..d1f7746969 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5535,6 +5535,7 @@ dependencies = [ "remote_storage", "reqwest", "safekeeper_api", + "safekeeper_client", "scopeguard", "sd-notify", "serde", @@ -5572,6 +5573,18 @@ dependencies = [ "utils", ] +[[package]] +name = "safekeeper_client" +version = "0.1.0" +dependencies = [ + "reqwest", + "safekeeper_api", + "serde", + "thiserror", + "utils", + "workspace_hack", +] + [[package]] name = "same-file" version = "1.0.6" diff --git a/Cargo.toml b/Cargo.toml index 0654c25a3d..056cd5798f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,6 +11,7 @@ members = [ "pageserver/pagebench", "proxy", "safekeeper", + "safekeeper/client", "storage_broker", "storage_controller", "storage_controller/client", @@ -233,6 +234,7 @@ postgres_initdb = { path = "./libs/postgres_initdb" } pq_proto = { version = "0.1", path = "./libs/pq_proto/" } remote_storage = { version = "0.1", path = "./libs/remote_storage/" } safekeeper_api = { version = "0.1", path = "./libs/safekeeper_api" } +safekeeper_client = { path = "./safekeeper/client" } desim = { version = "0.1", path = "./libs/desim" } storage_broker = { version = "0.1", path = "./storage_broker/" } # Note: main broker code is inside the binary crate, so linking with the library shouldn't be heavy. storage_controller_client = { path = "./storage_controller/client" } diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 0422c46ab1..086407603f 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -55,6 +55,7 @@ postgres_ffi.workspace = true pq_proto.workspace = true remote_storage.workspace = true safekeeper_api.workspace = true +safekeeper_client.workspace = true sha2.workspace = true sd-notify.workspace = true storage_broker.workspace = true diff --git a/safekeeper/client/Cargo.toml b/safekeeper/client/Cargo.toml new file mode 100644 index 0000000000..6c5a52de3a --- /dev/null +++ b/safekeeper/client/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "safekeeper_client" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +[dependencies] +safekeeper_api.workspace = true +thiserror.workspace = true +reqwest = { workspace = true, features = [ "stream" ] } +serde.workspace = true +utils.workspace = true +workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/safekeeper/client/src/lib.rs b/safekeeper/client/src/lib.rs new file mode 100644 index 0000000000..3963fd466c --- /dev/null +++ b/safekeeper/client/src/lib.rs @@ -0,0 +1 @@ +pub mod mgmt_api; diff --git a/safekeeper/src/http/client.rs b/safekeeper/client/src/mgmt_api.rs similarity index 96% rename from safekeeper/src/http/client.rs rename to safekeeper/client/src/mgmt_api.rs index 669a9c0ce9..f78745043a 100644 --- a/safekeeper/src/http/client.rs +++ b/safekeeper/client/src/mgmt_api.rs @@ -2,10 +2,6 @@ //! //! Partially copied from pageserver client; some parts might be better to be //! united. -//! -//! 
It would be also good to move it out to separate crate, but this needs -//! duplication of internal-but-reported structs like WalSenderState, ServerInfo -//! etc. use reqwest::{IntoUrl, Method, StatusCode}; use safekeeper_api::models::TimelineStatus; diff --git a/safekeeper/src/http/mod.rs b/safekeeper/src/http/mod.rs index 7229ccb739..d82a713f8a 100644 --- a/safekeeper/src/http/mod.rs +++ b/safekeeper/src/http/mod.rs @@ -1,4 +1,3 @@ -pub mod client; pub mod routes; pub use routes::make_router; diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index 00777273cb..f2d8e4c85f 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -5,6 +5,8 @@ use chrono::{DateTime, Utc}; use futures::{SinkExt, StreamExt, TryStreamExt}; use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI}; use safekeeper_api::{models::TimelineStatus, Term}; +use safekeeper_client::mgmt_api; +use safekeeper_client::mgmt_api::Client; use serde::{Deserialize, Serialize}; use std::{ cmp::min, @@ -22,7 +24,6 @@ use tracing::{error, info, instrument}; use crate::{ control_file::CONTROL_FILE_NAME, debug_dump, - http::client::{self, Client}, state::{EvictionState, TimelinePersistentState}, timeline::{Timeline, WalResidentTimeline}, timelines_global_map::{create_temp_timeline_dir, validate_temp_timeline}, @@ -419,7 +420,7 @@ pub async fn handle_request( let http_hosts = request.http_hosts.clone(); // Figure out statuses of potential donors. - let responses: Vec> = + let responses: Vec> = futures::future::join_all(http_hosts.iter().map(|url| async { let cclient = Client::new(url.clone(), sk_auth_token.clone()); let info = cclient From c5e3314c6e3a3d693310626a4c9c8deec34931c3 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 16 Dec 2024 17:53:04 +0300 Subject: [PATCH 049/583] Add test restarting compute at WAL page boundary (#10111) ## Problem We've had similar test in test_logical_replication, but then removed it because it wasn't needed to trigger LR related bug. Restarting at WAL page boundary is still a useful test, so add it separately back. ## Summary of changes Add the test. --- test_runner/regress/test_wal_acceptor.py | 56 ++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 23d4f23cdb..0a8900b351 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -1090,6 +1090,62 @@ def test_restart_endpoint_after_switch_wal(neon_env_builder: NeonEnvBuilder): endpoint.safe_psql("SELECT 'works'") +# Test restarting compute at WAL page boundary. +def test_restart_endpoint_wal_page_boundary(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + + ep = env.endpoints.create_start("main") + ep.safe_psql("create table t (i int)") + + with ep.cursor() as cur: + # measure how much space logical message takes. Sometimes first attempt + # creates huge message and then it stabilizes, have no idea why. + for _ in range(3): + lsn_before = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) + log.info(f"current_lsn={lsn_before}") + # Non-transactional logical message doesn't write WAL, only XLogInsert's + # it, so use transactional. Which is a bit problematic as transactional + # necessitates commit record. 
Alternatively we can do smth like + # select neon_xlogflush(pg_current_wal_insert_lsn()); + # but isn't much better + that particular call complains on 'xlog flush + # request 0/282C018 is not satisfied' as pg_current_wal_insert_lsn skips + # page headers. + payload = "blahblah" + cur.execute(f"select pg_logical_emit_message(true, 'pref', '{payload}')") + lsn_after_by_curr_wal_lsn = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) + lsn_diff = lsn_after_by_curr_wal_lsn - lsn_before + logical_message_base = lsn_after_by_curr_wal_lsn - lsn_before - len(payload) + log.info( + f"before {lsn_before}, after {lsn_after_by_curr_wal_lsn}, lsn diff is {lsn_diff}, base {logical_message_base}" + ) + + # and write logical message spanning exactly as we want + lsn_before = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) + log.info(f"current_lsn={lsn_before}") + curr_lsn = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) + offs = int(curr_lsn) % 8192 + till_page = 8192 - offs + target_lsn = curr_lsn + till_page + payload_len = ( + till_page - logical_message_base - 8 + ) # not sure why 8 is here, it is deduced from experiments + log.info( + f"current_lsn={curr_lsn}, offs {offs}, till_page {till_page}, target_lsn {target_lsn}" + ) + + cur.execute(f"select pg_logical_emit_message(true, 'pref', 'f{'a' * payload_len}')") + supposedly_contrecord_end = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) + log.info(f"supposedly_page_boundary={supposedly_contrecord_end}") + # The calculations to hit the page boundary are very fuzzy, so just + # ignore test if we fail to reach it. + if not (int(supposedly_contrecord_end) % 8192 == 0): + pytest.skip(f"missed page boundary, bad luck: lsn is {supposedly_contrecord_end}") + + ep.stop(mode="immediate") + ep = env.endpoints.create_start("main") + ep.safe_psql("insert into t values (42)") # should be ok + + # Context manager which logs passed time on exit. 
class DurationLogger: def __init__(self, desc): From 6565fd4056e0c040f26cc593a03561fe775595ff Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 16 Dec 2024 15:33:21 +0000 Subject: [PATCH 050/583] chore: fix clippy lints 2024-12-06 (#10138) --- libs/desim/src/time.rs | 2 +- .../wal_craft/src/xlog_utils_test.rs | 2 +- .../proxy/tokio-postgres2/src/to_statement.rs | 2 +- libs/remote_storage/src/azure_blob.rs | 4 +-- libs/remote_storage/src/lib.rs | 4 +-- libs/remote_storage/src/local_fs.rs | 4 +-- libs/remote_storage/src/s3_bucket.rs | 4 +-- libs/remote_storage/src/simulate_failures.rs | 4 +-- pageserver/compaction/src/compact_tiered.rs | 2 +- pageserver/compaction/src/identify_levels.rs | 5 ++-- pageserver/compaction/src/interface.rs | 2 +- pageserver/compaction/src/simulator.rs | 2 +- pageserver/src/basebackup.rs | 2 +- pageserver/src/pgdatadir_mapping.rs | 8 +++--- pageserver/src/tenant/blob_io.rs | 2 +- pageserver/src/tenant/block_io.rs | 2 +- pageserver/src/tenant/disk_btree.rs | 2 +- pageserver/src/tenant/ephemeral_file.rs | 6 ++--- pageserver/src/tenant/layer_map.rs | 4 +-- .../tenant/remote_timeline_client/download.rs | 4 +-- .../tenant/remote_timeline_client/upload.rs | 4 +-- pageserver/src/tenant/storage_layer.rs | 5 +--- .../src/tenant/storage_layer/delta_layer.rs | 4 +-- .../src/tenant/storage_layer/image_layer.rs | 2 +- .../inmemory_layer/vectored_dio_read.rs | 26 +++++++++---------- pageserver/src/tenant/timeline.rs | 2 +- pageserver/src/tenant/timeline/compaction.rs | 4 +-- safekeeper/src/receive_wal.rs | 2 +- safekeeper/src/safekeeper.rs | 5 +--- storage_controller/src/service.rs | 5 +--- 30 files changed, 58 insertions(+), 68 deletions(-) diff --git a/libs/desim/src/time.rs b/libs/desim/src/time.rs index 7bb71db95c..7ce605bda8 100644 --- a/libs/desim/src/time.rs +++ b/libs/desim/src/time.rs @@ -91,7 +91,7 @@ impl Timing { /// Return true if there is a ready event. fn is_event_ready(&self, queue: &mut BinaryHeap) -> bool { - queue.peek().map_or(false, |x| x.time <= self.now()) + queue.peek().is_some_and(|x| x.time <= self.now()) } /// Clear all pending events. 
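Two rewrites recur throughout this patch: `Option::map_or(false, ...)` / `map_or(true, ...)` replaced by `is_some_and` / `is_none_or`, and named lifetimes that are no longer referenced dropped from `impl` blocks and function signatures. A standalone sketch of both on toy types (not code from this diff):

```rust
// Toy illustration of the two clippy cleanups applied across this patch.

struct Cursor<'a> {
    buf: &'a [u8],
}

// A named lifetime that is never referenced inside the block can be elided:
// `impl<'a> Cursor<'a>` becomes `impl Cursor<'_>`.
impl Cursor<'_> {
    fn remaining(&self) -> usize {
        self.buf.len()
    }
}

fn main() {
    // `map_or(false, f)` becomes `is_some_and(f)` (stable since Rust 1.70),
    // as in the `is_event_ready` change above.
    let next_event_time: Option<u64> = Some(7);
    let now = 10;
    assert_eq!(
        next_event_time.map_or(false, |t| t <= now),
        next_event_time.is_some_and(|t| t <= now),
    );

    // The mirror image: `map_or(true, f)` becomes `is_none_or(f)` (stable since
    // Rust 1.82), as in the `layer_map.rs` assertions later in this patch.
    let image_layer: Option<&str> = None;
    assert!(image_layer.is_none_or(|name| name.starts_with("img")));

    assert_eq!(Cursor { buf: b"abc" }.remaining(), 3);
}
```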
diff --git a/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs index 9eb3f0e95a..4a33dbe25b 100644 --- a/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs +++ b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs @@ -81,7 +81,7 @@ fn test_end_of_wal(test_name: &str) { continue; } let mut f = File::options().write(true).open(file.path()).unwrap(); - const ZEROS: [u8; WAL_SEGMENT_SIZE] = [0u8; WAL_SEGMENT_SIZE]; + static ZEROS: [u8; WAL_SEGMENT_SIZE] = [0u8; WAL_SEGMENT_SIZE]; f.write_all( &ZEROS[0..min( WAL_SEGMENT_SIZE, diff --git a/libs/proxy/tokio-postgres2/src/to_statement.rs b/libs/proxy/tokio-postgres2/src/to_statement.rs index 427f77dd79..7e12992728 100644 --- a/libs/proxy/tokio-postgres2/src/to_statement.rs +++ b/libs/proxy/tokio-postgres2/src/to_statement.rs @@ -11,7 +11,7 @@ mod private { Query(&'a str), } - impl<'a> ToStatementType<'a> { + impl ToStatementType<'_> { pub async fn into_statement(self, client: &Client) -> Result { match self { ToStatementType::Statement(s) => Ok(s.clone()), diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index 32c51bc2ad..19c8251ccd 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -544,9 +544,9 @@ impl RemoteStorage for AzureBlobStorage { .await } - async fn delete_objects<'a>( + async fn delete_objects( &self, - paths: &'a [RemotePath], + paths: &[RemotePath], cancel: &CancellationToken, ) -> anyhow::Result<()> { let kind = RequestKind::Delete; diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 2a3468f986..7a864151ec 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -341,9 +341,9 @@ pub trait RemoteStorage: Send + Sync + 'static { /// If the operation fails because of timeout or cancellation, the root cause of the error will be /// set to `TimeoutOrCancel`. In such situation it is unknown which deletions, if any, went /// through. 
- async fn delete_objects<'a>( + async fn delete_objects( &self, - paths: &'a [RemotePath], + paths: &[RemotePath], cancel: &CancellationToken, ) -> anyhow::Result<()>; diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 1a2d421c66..a8b00173ba 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -562,9 +562,9 @@ impl RemoteStorage for LocalFs { } } - async fn delete_objects<'a>( + async fn delete_objects( &self, - paths: &'a [RemotePath], + paths: &[RemotePath], cancel: &CancellationToken, ) -> anyhow::Result<()> { for path in paths { diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 2891f92d07..d3f19f0b11 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -813,9 +813,9 @@ impl RemoteStorage for S3Bucket { .await } - async fn delete_objects<'a>( + async fn delete_objects( &self, - paths: &'a [RemotePath], + paths: &[RemotePath], cancel: &CancellationToken, ) -> anyhow::Result<()> { let kind = RequestKind::Delete; diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs index 51833c1fe6..63c24beb51 100644 --- a/libs/remote_storage/src/simulate_failures.rs +++ b/libs/remote_storage/src/simulate_failures.rs @@ -181,9 +181,9 @@ impl RemoteStorage for UnreliableWrapper { self.delete_inner(path, true, cancel).await } - async fn delete_objects<'a>( + async fn delete_objects( &self, - paths: &'a [RemotePath], + paths: &[RemotePath], cancel: &CancellationToken, ) -> anyhow::Result<()> { self.attempt(RemoteOp::DeleteObjects(paths.to_vec()))?; diff --git a/pageserver/compaction/src/compact_tiered.rs b/pageserver/compaction/src/compact_tiered.rs index 20f88868f9..7779ffaf8b 100644 --- a/pageserver/compaction/src/compact_tiered.rs +++ b/pageserver/compaction/src/compact_tiered.rs @@ -272,7 +272,7 @@ struct CompactionJob { completed: bool, } -impl<'a, E> LevelCompactionState<'a, E> +impl LevelCompactionState<'_, E> where E: CompactionJobExecutor, { diff --git a/pageserver/compaction/src/identify_levels.rs b/pageserver/compaction/src/identify_levels.rs index 1853afffdd..e04bd15396 100644 --- a/pageserver/compaction/src/identify_levels.rs +++ b/pageserver/compaction/src/identify_levels.rs @@ -224,9 +224,8 @@ impl Level { } // recalculate depth if this was the last event at this point - let more_events_at_this_key = events_iter - .peek() - .map_or(false, |next_e| next_e.key == e.key); + let more_events_at_this_key = + events_iter.peek().is_some_and(|next_e| next_e.key == e.key); if !more_events_at_this_key { let mut active_depth = 0; for (_end_lsn, is_image, _idx) in active_set.iter().rev() { diff --git a/pageserver/compaction/src/interface.rs b/pageserver/compaction/src/interface.rs index 5bc9b5ca1d..8ed393a645 100644 --- a/pageserver/compaction/src/interface.rs +++ b/pageserver/compaction/src/interface.rs @@ -148,7 +148,7 @@ pub trait CompactionDeltaLayer: CompactionLay Self: 'a; /// Return all keys in this delta layer. 
- fn load_keys<'a>( + fn load_keys( &self, ctx: &E::RequestContext, ) -> impl Future>>> + Send; diff --git a/pageserver/compaction/src/simulator.rs b/pageserver/compaction/src/simulator.rs index 776c537d03..673b80c313 100644 --- a/pageserver/compaction/src/simulator.rs +++ b/pageserver/compaction/src/simulator.rs @@ -143,7 +143,7 @@ impl interface::CompactionLayer for Arc { impl interface::CompactionDeltaLayer for Arc { type DeltaEntry<'a> = MockRecord; - async fn load_keys<'a>(&self, _ctx: &MockRequestContext) -> anyhow::Result> { + async fn load_keys(&self, _ctx: &MockRequestContext) -> anyhow::Result> { Ok(self.records.clone()) } } diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index cae0ffb980..e1b5676f46 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -248,7 +248,7 @@ where } } -impl<'a, W> Basebackup<'a, W> +impl Basebackup<'_, W> where W: AsyncWrite + Send + Sync + Unpin, { diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 3eaecd3a08..14c7e0d2f8 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1242,7 +1242,7 @@ pub struct DatadirModification<'a> { pending_metadata_bytes: usize, } -impl<'a> DatadirModification<'a> { +impl DatadirModification<'_> { // When a DatadirModification is committed, we do a monolithic serialization of all its contents. WAL records can // contain multiple pages, so the pageserver's record-based batch size isn't sufficient to bound this allocation: we // additionally specify a limit on how much payload a DatadirModification may contain before it should be committed. @@ -1263,7 +1263,7 @@ impl<'a> DatadirModification<'a> { pub(crate) fn has_dirty_data(&self) -> bool { self.pending_data_batch .as_ref() - .map_or(false, |b| b.has_data()) + .is_some_and(|b| b.has_data()) } /// Set the current lsn @@ -2230,7 +2230,7 @@ impl<'a> DatadirModification<'a> { assert!(!self .pending_data_batch .as_ref() - .map_or(false, |b| b.updates_key(&key))); + .is_some_and(|b| b.updates_key(&key))); } } @@ -2299,7 +2299,7 @@ pub enum Version<'a> { Modified(&'a DatadirModification<'a>), } -impl<'a> Version<'a> { +impl Version<'_> { async fn get( &self, timeline: &Timeline, diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index dd70f6bbff..7b55df52a5 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -35,7 +35,7 @@ pub struct CompressionInfo { pub compressed_size: Option, } -impl<'a> BlockCursor<'a> { +impl BlockCursor<'_> { /// Read a blob into a new buffer. 
pub async fn read_blob( &self, diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs index 2bd7f2d619..990211f80a 100644 --- a/pageserver/src/tenant/block_io.rs +++ b/pageserver/src/tenant/block_io.rs @@ -89,7 +89,7 @@ pub(crate) enum BlockReaderRef<'a> { VirtualFile(&'a VirtualFile), } -impl<'a> BlockReaderRef<'a> { +impl BlockReaderRef<'_> { #[inline(always)] async fn read_blk( &self, diff --git a/pageserver/src/tenant/disk_btree.rs b/pageserver/src/tenant/disk_btree.rs index b302cbc975..c77342b144 100644 --- a/pageserver/src/tenant/disk_btree.rs +++ b/pageserver/src/tenant/disk_btree.rs @@ -532,7 +532,7 @@ pub struct DiskBtreeIterator<'a> { >, } -impl<'a> DiskBtreeIterator<'a> { +impl DiskBtreeIterator<'_> { pub async fn next(&mut self) -> Option, u64), DiskBtreeError>> { self.stream.next().await } diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index aaec8a4c31..ba79672bc7 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -174,11 +174,11 @@ impl EphemeralFile { } impl super::storage_layer::inmemory_layer::vectored_dio_read::File for EphemeralFile { - async fn read_exact_at_eof_ok<'a, 'b, B: IoBufAlignedMut + Send>( - &'b self, + async fn read_exact_at_eof_ok( + &self, start: u64, dst: tokio_epoll_uring::Slice, - ctx: &'a RequestContext, + ctx: &RequestContext, ) -> std::io::Result<(tokio_epoll_uring::Slice, usize)> { let submitted_offset = self.buffered_writer.bytes_submitted(); diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 7f15baed10..1b6924425c 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -392,8 +392,8 @@ impl LayerMap { image_layer: Option>, end_lsn: Lsn, ) -> Option { - assert!(delta_layer.as_ref().map_or(true, |l| l.is_delta())); - assert!(image_layer.as_ref().map_or(true, |l| !l.is_delta())); + assert!(delta_layer.as_ref().is_none_or(|l| l.is_delta())); + assert!(image_layer.as_ref().is_none_or(|l| !l.is_delta())); match (delta_layer, image_layer) { (None, None) => None, diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index d15f161fb6..b4d45dca75 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -145,8 +145,8 @@ pub async fn download_layer_file<'a>( /// /// If Err() is returned, there was some error. The file at `dst_path` has been unlinked. /// The unlinking has _not_ been made durable. -async fn download_object<'a>( - storage: &'a GenericRemoteStorage, +async fn download_object( + storage: &GenericRemoteStorage, src_path: &RemotePath, dst_path: &Utf8PathBuf, #[cfg_attr(target_os = "macos", allow(unused_variables))] gate: &utils::sync::gate::Gate, diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs index 0cd5d05aa2..e434d24e5f 100644 --- a/pageserver/src/tenant/remote_timeline_client/upload.rs +++ b/pageserver/src/tenant/remote_timeline_client/upload.rs @@ -25,8 +25,8 @@ use utils::id::{TenantId, TimelineId}; use tracing::info; /// Serializes and uploads the given index part data to the remote storage. 
-pub(crate) async fn upload_index_part<'a>( - storage: &'a GenericRemoteStorage, +pub(crate) async fn upload_index_part( + storage: &GenericRemoteStorage, tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, generation: Generation, diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 9e3a25cbbc..b8206fca5a 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -345,10 +345,7 @@ impl LayerFringe { } pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayer, KeySpace, Range)> { - let read_desc = match self.planned_visits_by_lsn.pop() { - Some(desc) => desc, - None => return None, - }; + let read_desc = self.planned_visits_by_lsn.pop()?; let removed = self.visit_reads.remove_entry(&read_desc.layer_to_visit_id); diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index fec8a0a16c..ade1b794c6 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -1486,7 +1486,7 @@ pub struct ValueRef<'a> { layer: &'a DeltaLayerInner, } -impl<'a> ValueRef<'a> { +impl ValueRef<'_> { /// Loads the value from disk pub async fn load(&self, ctx: &RequestContext) -> Result { let buf = self.load_raw(ctx).await?; @@ -1543,7 +1543,7 @@ pub struct DeltaLayerIterator<'a> { is_end: bool, } -impl<'a> DeltaLayerIterator<'a> { +impl DeltaLayerIterator<'_> { pub(crate) fn layer_dbg_info(&self) -> String { self.delta_layer.layer_dbg_info() } diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 834d1931d0..0d3c9d5a44 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -1052,7 +1052,7 @@ pub struct ImageLayerIterator<'a> { is_end: bool, } -impl<'a> ImageLayerIterator<'a> { +impl ImageLayerIterator<'_> { pub(crate) fn layer_dbg_info(&self) -> String { self.image_layer.layer_dbg_info() } diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs b/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs index a4bb3a6bfc..1d86015fab 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs @@ -25,11 +25,11 @@ pub trait File: Send { /// [`std::io::ErrorKind::UnexpectedEof`] error if the file is shorter than `start+dst.len()`. /// /// No guarantees are made about the remaining bytes in `dst` in case of a short read. 
- async fn read_exact_at_eof_ok<'a, 'b, B: IoBufAlignedMut + Send>( - &'b self, + async fn read_exact_at_eof_ok( + &self, start: u64, dst: Slice, - ctx: &'a RequestContext, + ctx: &RequestContext, ) -> std::io::Result<(Slice, usize)>; } @@ -479,11 +479,11 @@ mod tests { } impl File for InMemoryFile { - async fn read_exact_at_eof_ok<'a, 'b, B: IoBufMut + Send>( - &'b self, + async fn read_exact_at_eof_ok( + &self, start: u64, mut dst: Slice, - _ctx: &'a RequestContext, + _ctx: &RequestContext, ) -> std::io::Result<(Slice, usize)> { let dst_slice: &mut [u8] = dst.as_mut_rust_slice_full_zeroed(); let nread = { @@ -609,12 +609,12 @@ mod tests { } } - impl<'x> File for RecorderFile<'x> { - async fn read_exact_at_eof_ok<'a, 'b, B: IoBufAlignedMut + Send>( - &'b self, + impl File for RecorderFile<'_> { + async fn read_exact_at_eof_ok( + &self, start: u64, dst: Slice, - ctx: &'a RequestContext, + ctx: &RequestContext, ) -> std::io::Result<(Slice, usize)> { let (dst, nread) = self.file.read_exact_at_eof_ok(start, dst, ctx).await?; self.recorded.borrow_mut().push(RecordedRead { @@ -740,11 +740,11 @@ mod tests { } impl File for MockFile { - async fn read_exact_at_eof_ok<'a, 'b, B: IoBufMut + Send>( - &'b self, + async fn read_exact_at_eof_ok( + &self, start: u64, mut dst: Slice, - _ctx: &'a RequestContext, + _ctx: &RequestContext, ) -> std::io::Result<(Slice, usize)> { let ExpectedRead { expect_pos, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 0416953c1f..87f5a03382 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -5842,7 +5842,7 @@ enum OpenLayerAction { None, } -impl<'a> TimelineWriter<'a> { +impl TimelineWriter<'_> { async fn handle_open_layer_action( &mut self, at: Lsn, diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 5e6290729c..8b6cc8ed84 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -1110,7 +1110,7 @@ impl Timeline { return Err(CompactionError::ShuttingDown); } - let same_key = prev_key.map_or(false, |prev_key| prev_key == key); + let same_key = prev_key == Some(key); // We need to check key boundaries once we reach next key or end of layer with the same key if !same_key || lsn == dup_end_lsn { let mut next_key_size = 0u64; @@ -2904,7 +2904,7 @@ impl CompactionLayer for ResidentDeltaLayer { impl CompactionDeltaLayer for ResidentDeltaLayer { type DeltaEntry<'a> = DeltaEntry<'a>; - async fn load_keys<'a>(&self, ctx: &RequestContext) -> anyhow::Result>> { + async fn load_keys(&self, ctx: &RequestContext) -> anyhow::Result>> { self.0.get_as_delta(ctx).await?.index_entries(ctx).await } } diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index 08371177cd..3e9ce1da8e 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -318,7 +318,7 @@ struct NetworkReader<'a, IO> { global_timelines: Arc, } -impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> { +impl NetworkReader<'_, IO> { async fn read_first_message( &mut self, ) -> Result<(WalResidentTimeline, ProposerAcceptorMessage), CopyStreamHandlerEnd> { diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index ccd7940c72..6ceaf325b0 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -125,10 +125,7 @@ impl TermHistory { ); last_common_idx = Some(i); } - let last_common_idx = match last_common_idx { - None => return None, // no common 
point - Some(lci) => lci, - }; + let last_common_idx = last_common_idx?; // Now find where it ends at both prop and sk and take min. End of // (common) term is the start of the next except it is the last one; // there it is flush_lsn in case of safekeeper or, in case of proposer diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 746177c089..a89e4741f6 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -6873,10 +6873,7 @@ impl Service { let mut plan = Vec::new(); for (node_id, attached) in nodes_by_load { - let available = locked - .nodes - .get(&node_id) - .map_or(false, |n| n.is_available()); + let available = locked.nodes.get(&node_id).is_some_and(|n| n.is_available()); if !available { continue; } From 3d30a7a9348efbb0d1faca6d44dccc1885c5b8c9 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 16 Dec 2024 16:54:47 +0100 Subject: [PATCH 051/583] pageserver: make `RemoteTimelineClient::schedule_index_upload` infallible (#10155) Remove an unnecessary `Result` and address a `FIXME`. --- .../src/tenant/remote_timeline_client.rs | 39 ++++++++----------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 20e0536a00..fee11bc742 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -749,7 +749,7 @@ impl RemoteTimelineClient { // ahead of what's _actually_ on the remote during index upload. upload_queue.dirty.metadata = metadata.clone(); - self.schedule_index_upload(upload_queue)?; + self.schedule_index_upload(upload_queue); Ok(()) } @@ -770,7 +770,7 @@ impl RemoteTimelineClient { upload_queue.dirty.metadata.apply(update); - self.schedule_index_upload(upload_queue)?; + self.schedule_index_upload(upload_queue); Ok(()) } @@ -809,7 +809,7 @@ impl RemoteTimelineClient { if let Some(archived_at_set) = need_upload_scheduled { let intended_archived_at = archived_at_set.then(|| Utc::now().naive_utc()); upload_queue.dirty.archived_at = intended_archived_at; - self.schedule_index_upload(upload_queue)?; + self.schedule_index_upload(upload_queue); } let need_wait = need_change(&upload_queue.clean.0.archived_at, state).is_some(); @@ -824,7 +824,7 @@ impl RemoteTimelineClient { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; upload_queue.dirty.import_pgdata = state; - self.schedule_index_upload(upload_queue)?; + self.schedule_index_upload(upload_queue); Ok(()) } @@ -843,17 +843,14 @@ impl RemoteTimelineClient { let upload_queue = guard.initialized_mut()?; if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 { - self.schedule_index_upload(upload_queue)?; + self.schedule_index_upload(upload_queue); } Ok(()) } /// Launch an index-file upload operation in the background (internal function) - fn schedule_index_upload( - self: &Arc, - upload_queue: &mut UploadQueueInitialized, - ) -> Result<(), NotInitialized> { + fn schedule_index_upload(self: &Arc, upload_queue: &mut UploadQueueInitialized) { let disk_consistent_lsn = upload_queue.dirty.metadata.disk_consistent_lsn(); // fix up the duplicated field upload_queue.dirty.disk_consistent_lsn = disk_consistent_lsn; @@ -880,7 +877,6 @@ impl RemoteTimelineClient { // Launch the task immediately, if possible self.launch_queued_tasks(upload_queue); - Ok(()) } /// Reparent this timeline to a new parent. 
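The hunk above is the core of the change: `schedule_index_upload` never constructs an error, so returning `Result<(), NotInitialized>` only forced callers to thread a `?` through for a failure that could not happen. Reduced to a standalone sketch (illustrative types, not the pageserver code):

```rust
// Sketch of the "drop an always-Ok Result" cleanup; names are illustrative.
#[derive(Debug)]
struct NotInitialized;

struct UploadQueue {
    pending: Vec<&'static str>,
}

impl UploadQueue {
    // Before: fn schedule_index_upload(&mut self) -> Result<(), NotInitialized>
    // After: the signature reflects that this step cannot fail.
    fn schedule_index_upload(&mut self) {
        self.pending.push("index_part.json");
    }

    // A caller that keeps its own genuine failure mode:
    fn schedule_gc_update(&mut self, initialized: bool) -> Result<(), NotInitialized> {
        if !initialized {
            return Err(NotInitialized);
        }
        // Previously `self.schedule_index_upload()?;`
        self.schedule_index_upload();
        Ok(())
    }
}

fn main() {
    let mut queue = UploadQueue { pending: Vec::new() };
    queue.schedule_gc_update(true).unwrap();
    assert_eq!(queue.pending, ["index_part.json"]);
}
```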
@@ -909,7 +905,7 @@ impl RemoteTimelineClient { upload_queue.dirty.metadata.reparent(new_parent); upload_queue.dirty.lineage.record_previous_ancestor(&prev); - self.schedule_index_upload(upload_queue)?; + self.schedule_index_upload(upload_queue); Some(self.schedule_barrier0(upload_queue)) } @@ -948,7 +944,7 @@ impl RemoteTimelineClient { assert!(prev.is_none(), "copied layer existed already {layer}"); } - self.schedule_index_upload(upload_queue)?; + self.schedule_index_upload(upload_queue); Some(self.schedule_barrier0(upload_queue)) } @@ -1004,7 +1000,7 @@ impl RemoteTimelineClient { upload_queue.dirty.gc_blocking = current .map(|x| x.with_reason(reason)) .or_else(|| Some(index::GcBlocking::started_now_for(reason))); - self.schedule_index_upload(upload_queue)?; + self.schedule_index_upload(upload_queue); Some(self.schedule_barrier0(upload_queue)) } } @@ -1057,8 +1053,7 @@ impl RemoteTimelineClient { upload_queue.dirty.gc_blocking = current.as_ref().and_then(|x| x.without_reason(reason)); assert!(wanted(upload_queue.dirty.gc_blocking.as_ref())); - // FIXME: bogus ? - self.schedule_index_upload(upload_queue)?; + self.schedule_index_upload(upload_queue); Some(self.schedule_barrier0(upload_queue)) } } @@ -1125,8 +1120,8 @@ impl RemoteTimelineClient { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; - let with_metadata = self - .schedule_unlinking_of_layers_from_index_part0(upload_queue, names.iter().cloned())?; + let with_metadata = + self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names.iter().cloned()); self.schedule_deletion_of_unlinked0(upload_queue, with_metadata); @@ -1153,7 +1148,7 @@ impl RemoteTimelineClient { let names = gc_layers.iter().map(|x| x.layer_desc().layer_name()); - self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names)?; + self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names); self.launch_queued_tasks(upload_queue); @@ -1166,7 +1161,7 @@ impl RemoteTimelineClient { self: &Arc, upload_queue: &mut UploadQueueInitialized, names: I, - ) -> Result, NotInitialized> + ) -> Vec<(LayerName, LayerFileMetadata)> where I: IntoIterator, { @@ -1208,10 +1203,10 @@ impl RemoteTimelineClient { // index_part update, because that needs to be uploaded before we can actually delete the // files. if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 { - self.schedule_index_upload(upload_queue)?; + self.schedule_index_upload(upload_queue); } - Ok(with_metadata) + with_metadata } /// Schedules deletion for layer files which have previously been unlinked from the @@ -1302,7 +1297,7 @@ impl RemoteTimelineClient { let names = compacted_from.iter().map(|x| x.layer_desc().layer_name()); - self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names)?; + self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names); self.launch_queued_tasks(upload_queue); Ok(()) From 2e4c9c570491b6747e65bdeba64dc55f0dec2ea3 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 16 Dec 2024 16:11:39 +0000 Subject: [PATCH 052/583] chore(proxy): remove allow_self_signed from regular proxy (#10157) I noticed that the only place we use this flag is for testing console redirect proxy. Makes sense to me to make this assumption more explicit. 
--- proxy/src/proxy/mod.rs | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index de0ec0f799..5db92d748a 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -191,13 +191,6 @@ impl ClientMode { } } - pub(crate) fn allow_self_signed_compute(&self, config: &ProxyConfig) -> bool { - match self { - ClientMode::Tcp => config.allow_self_signed_compute, - ClientMode::Websockets { .. } => false, - } - } - fn hostname<'a, S>(&'a self, s: &'a Stream) -> Option<&'a str> { match self { ClientMode::Tcp => s.sni_hostname(), @@ -355,7 +348,8 @@ pub(crate) async fn handle_client( params_compat, params: ¶ms, locks: &config.connect_compute_locks, - allow_self_signed_compute: mode.allow_self_signed_compute(config), + // only used for console redirect testing. + allow_self_signed_compute: false, }, &user_info, config.wake_compute_retry_config, From 59b7ff89881ac218f35f545ce38f556d08af36dd Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 16 Dec 2024 16:37:15 +0000 Subject: [PATCH 053/583] chore(proxy): disallow unwrap and unimplemented (#10142) As the title says, I updated the lint rules to no longer allow unwrap or unimplemented. Three special cases: * Tests are allowed to use them * std::sync::Mutex lock().unwrap() is common because it's usually correct to continue panicking on poison * `tokio::spawn_blocking(...).await.unwrap()` is common because it will only error if the blocking fn panics, so continuing the panic is also correct I've introduced two extension traits to help with these last two, that are a bit more explicit so they don't need an expect message every time. --- proxy/src/auth/backend/jwt.rs | 1 + proxy/src/auth/backend/mod.rs | 2 + proxy/src/auth/credentials.rs | 1 + proxy/src/cache/endpoints.rs | 4 +- proxy/src/cache/project_info.rs | 1 + proxy/src/cancellation.rs | 4 +- proxy/src/config.rs | 13 ++---- proxy/src/context/parquet.rs | 14 ++++-- proxy/src/control_plane/client/mock.rs | 4 +- proxy/src/ext.rs | 41 +++++++++++++++++ proxy/src/http/health_server.rs | 7 +-- proxy/src/intern.rs | 3 +- proxy/src/lib.rs | 5 ++- proxy/src/logging.rs | 12 ++++- proxy/src/parse.rs | 15 ------- proxy/src/protocol2.rs | 1 + proxy/src/proxy/copy_bidirectional.rs | 1 + proxy/src/proxy/mod.rs | 2 +- proxy/src/proxy/tests/mod.rs | 1 + proxy/src/rate_limiter/leaky_bucket.rs | 2 +- .../src/rate_limiter/limit_algorithm/aimd.rs | 1 + proxy/src/rate_limiter/limiter.rs | 4 +- .../connection_with_credentials_provider.rs | 6 ++- proxy/src/sasl/messages.rs | 5 ++- proxy/src/scram/messages.rs | 1 + proxy/src/scram/mod.rs | 1 + proxy/src/scram/secret.rs | 1 + proxy/src/scram/threadpool.rs | 16 +++---- proxy/src/serverless/backend.rs | 6 ++- proxy/src/serverless/conn_pool.rs | 34 ++++++-------- proxy/src/serverless/conn_pool_lib.rs | 45 +++++++++---------- proxy/src/serverless/http_util.rs | 11 +++-- proxy/src/serverless/json.rs | 6 ++- proxy/src/serverless/local_conn_pool.rs | 11 ++--- proxy/src/serverless/mod.rs | 5 ++- proxy/src/serverless/sql_over_http.rs | 1 + proxy/src/serverless/websocket.rs | 1 + proxy/src/url.rs | 1 + proxy/src/usage_metrics.rs | 1 + 39 files changed, 178 insertions(+), 113 deletions(-) create mode 100644 proxy/src/ext.rs diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index a258090b15..df716f8455 100644 --- a/proxy/src/auth/backend/jwt.rs +++ b/proxy/src/auth/backend/jwt.rs @@ -776,6 +776,7 @@ impl From<&jose_jwk::Key> for KeyType { } #[cfg(test)] 
+#[expect(clippy::unwrap_used)] mod tests { use std::future::IntoFuture; use std::net::SocketAddr; diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs index f38ecf715f..50cb94bfa0 100644 --- a/proxy/src/auth/backend/mod.rs +++ b/proxy/src/auth/backend/mod.rs @@ -463,6 +463,8 @@ impl ComputeConnectBackend for Backend<'_, ComputeCredentials> { #[cfg(test)] mod tests { + #![allow(clippy::unimplemented, clippy::unwrap_used)] + use std::net::IpAddr; use std::sync::Arc; use std::time::Duration; diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index f6bce9f2d8..eff49a402a 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -250,6 +250,7 @@ fn project_name_valid(name: &str) -> bool { } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use serde_json::json; use ComputeUserInfoParseError::*; diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs index 20db1fbb14..0136446d6d 100644 --- a/proxy/src/cache/endpoints.rs +++ b/proxy/src/cache/endpoints.rs @@ -12,6 +12,7 @@ use tracing::info; use crate::config::EndpointCacheConfig; use crate::context::RequestContext; +use crate::ext::LockExt; use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt}; use crate::metrics::{Metrics, RedisErrors, RedisEventsCount}; use crate::rate_limiter::GlobalRateLimiter; @@ -96,7 +97,7 @@ impl EndpointsCache { // If the limiter allows, we can pretend like it's valid // (incase it is, due to redis channel lag). - if self.limiter.lock().unwrap().check() { + if self.limiter.lock_propagate_poison().check() { return true; } @@ -258,6 +259,7 @@ impl EndpointsCache { } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use super::*; diff --git a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs index 84430dc812..cab0b8b905 100644 --- a/proxy/src/cache/project_info.rs +++ b/proxy/src/cache/project_info.rs @@ -365,6 +365,7 @@ impl Cache for ProjectInfoCacheImpl { } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use super::*; use crate::scram::ServerSecret; diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index ed717507ee..dd3edd6abc 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -13,6 +13,7 @@ use uuid::Uuid; use crate::auth::{check_peer_addr_is_in_list, IpPattern}; use crate::error::ReportableError; +use crate::ext::LockExt; use crate::metrics::{CancellationRequest, CancellationSource, Metrics}; use crate::rate_limiter::LeakyBucketRateLimiter; use crate::redis::cancellation_publisher::{ @@ -114,7 +115,7 @@ impl CancellationHandler

{ IpAddr::V4(ip) => IpNet::V4(Ipv4Net::new_assert(ip, 24).trunc()), // use defaut mask here IpAddr::V6(ip) => IpNet::V6(Ipv6Net::new_assert(ip, 64).trunc()), }; - if !self.limiter.lock().unwrap().check(subnet_key, 1) { + if !self.limiter.lock_propagate_poison().check(subnet_key, 1) { // log only the subnet part of the IP address to know which subnet is rate limited tracing::warn!("Rate limit exceeded. Skipping cancellation message, {subnet_key}"); Metrics::get() @@ -283,6 +284,7 @@ impl

Drop for Session

{ } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use super::*; diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 8bc8e3f96f..1f991d595e 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -221,15 +221,10 @@ impl CertResolver { ) -> anyhow::Result<()> { let priv_key = { let key_bytes = std::fs::read(key_path) - .context(format!("Failed to read TLS keys at '{key_path}'"))?; - let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]).collect_vec(); - - ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len()); - PrivateKeyDer::Pkcs8( - keys.pop() - .unwrap() - .context(format!("Failed to parse TLS keys at '{key_path}'"))?, - ) + .with_context(|| format!("Failed to read TLS keys at '{key_path}'"))?; + rustls_pemfile::private_key(&mut &key_bytes[..]) + .with_context(|| format!("Failed to parse TLS keys at '{key_path}'"))? + .with_context(|| format!("Failed to parse TLS keys at '{key_path}'"))? }; let cert_chain_bytes = std::fs::read(cert_path) diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index 3105d08526..5f65b17374 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -23,6 +23,7 @@ use utils::backoff; use super::{RequestContextInner, LOG_CHAN}; use crate::config::remote_storage_from_toml; use crate::context::LOG_CHAN_DISCONNECT; +use crate::ext::TaskExt; #[derive(clap::Args, Clone, Debug)] pub struct ParquetUploadArgs { @@ -171,7 +172,9 @@ pub async fn worker( }; let (tx, mut rx) = mpsc::unbounded_channel(); - LOG_CHAN.set(tx.downgrade()).unwrap(); + LOG_CHAN + .set(tx.downgrade()) + .expect("only one worker should set the channel"); // setup row stream that will close on cancellation let cancellation_token2 = cancellation_token.clone(); @@ -207,7 +210,9 @@ pub async fn worker( config.parquet_upload_disconnect_events_remote_storage { let (tx_disconnect, mut rx_disconnect) = mpsc::unbounded_channel(); - LOG_CHAN_DISCONNECT.set(tx_disconnect.downgrade()).unwrap(); + LOG_CHAN_DISCONNECT + .set(tx_disconnect.downgrade()) + .expect("only one worker should set the channel"); // setup row stream that will close on cancellation tokio::spawn(async move { @@ -326,7 +331,7 @@ where Ok::<_, parquet::errors::ParquetError>((rows, w, rg_meta)) }) .await - .unwrap()?; + .propagate_task_panic()?; rows.clear(); Ok((rows, w, rg_meta)) @@ -352,7 +357,7 @@ async fn upload_parquet( Ok((buffer, metadata)) }) .await - .unwrap()?; + .propagate_task_panic()?; let data = buffer.split().freeze(); @@ -409,6 +414,7 @@ async fn upload_parquet( } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use std::net::Ipv4Addr; use std::num::NonZeroUsize; diff --git a/proxy/src/control_plane/client/mock.rs b/proxy/src/control_plane/client/mock.rs index 93edd65476..5f8bda0f35 100644 --- a/proxy/src/control_plane/client/mock.rs +++ b/proxy/src/control_plane/client/mock.rs @@ -102,7 +102,9 @@ impl MockControlPlane { Some(s) => { info!("got allowed_ips: {s}"); s.split(',') - .map(|s| IpPattern::from_str(s).unwrap()) + .map(|s| { + IpPattern::from_str(s).expect("mocked ip pattern should be correct") + }) .collect() } None => vec![], diff --git a/proxy/src/ext.rs b/proxy/src/ext.rs new file mode 100644 index 0000000000..8d00afbf51 --- /dev/null +++ b/proxy/src/ext.rs @@ -0,0 +1,41 @@ +use std::panic::resume_unwind; +use std::sync::{Mutex, MutexGuard}; + +use tokio::task::JoinError; + +pub(crate) trait LockExt { + fn lock_propagate_poison(&self) -> MutexGuard<'_, T>; +} + +impl LockExt for Mutex { + /// Lock the mutex and 
panic if the mutex was poisoned. + #[track_caller] + fn lock_propagate_poison(&self) -> MutexGuard<'_, T> { + match self.lock() { + Ok(guard) => guard, + // poison occurs when another thread panicked while holding the lock guard. + // since panicking is often unrecoverable, propagating the poison panic is reasonable. + Err(poison) => panic!("{poison}"), + } + } +} + +pub(crate) trait TaskExt { + fn propagate_task_panic(self) -> T; +} + +impl TaskExt for Result { + /// Unwrap the result and panic if the inner task panicked. + /// Also panics if the task was cancelled + #[track_caller] + fn propagate_task_panic(self) -> T { + match self { + Ok(t) => t, + // Using resume_unwind prevents the panic hook being called twice. + // Since we use this for structured concurrency, there is only + // 1 logical panic, so this is more correct. + Err(e) if e.is_panic() => resume_unwind(e.into_panic()), + Err(e) => panic!("unexpected task error: {e}"), + } + } +} diff --git a/proxy/src/http/health_server.rs b/proxy/src/http/health_server.rs index 978ad9f761..6ca091feb7 100644 --- a/proxy/src/http/health_server.rs +++ b/proxy/src/http/health_server.rs @@ -14,6 +14,7 @@ use utils::http::error::ApiError; use utils::http::json::json_response; use utils::http::{RouterBuilder, RouterService}; +use crate::ext::{LockExt, TaskExt}; use crate::jemalloc; async fn status_handler(_: Request) -> Result, ApiError> { @@ -76,7 +77,7 @@ async fn prometheus_metrics_handler( let body = tokio::task::spawn_blocking(move || { let _span = span.entered(); - let mut state = state.lock().unwrap(); + let mut state = state.lock_propagate_poison(); let PrometheusHandler { encoder, metrics } = &mut *state; metrics @@ -94,13 +95,13 @@ async fn prometheus_metrics_handler( body }) .await - .unwrap(); + .propagate_task_panic(); let response = Response::builder() .status(200) .header(CONTENT_TYPE, "text/plain; version=0.0.4") .body(Body::from(body)) - .unwrap(); + .expect("response headers should be valid"); Ok(response) } diff --git a/proxy/src/intern.rs b/proxy/src/intern.rs index f56d92a6b3..79c6020302 100644 --- a/proxy/src/intern.rs +++ b/proxy/src/intern.rs @@ -83,7 +83,7 @@ impl StringInterner { pub(crate) fn new() -> Self { StringInterner { inner: ThreadedRodeo::with_capacity_memory_limits_and_hasher( - Capacity::new(2500, NonZeroUsize::new(1 << 16).unwrap()), + Capacity::new(2500, NonZeroUsize::new(1 << 16).expect("value is nonzero")), // unbounded MemoryLimits::for_memory_usage(usize::MAX), BuildHasherDefault::::default(), @@ -207,6 +207,7 @@ impl From for ProjectIdInt { } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use std::sync::OnceLock; diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index ba69f9cf2d..a5a72f26d9 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -22,8 +22,8 @@ clippy::string_add, clippy::string_to_string, clippy::todo, - // TODO: consider clippy::unimplemented - // TODO: consider clippy::unwrap_used + clippy::unimplemented, + clippy::unwrap_used, )] // List of permanently allowed lints. 
#![allow( @@ -82,6 +82,7 @@ pub mod console_redirect_proxy; pub mod context; pub mod control_plane; pub mod error; +mod ext; pub mod http; pub mod intern; pub mod jemalloc; diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs index 74d2b9a1d0..41f10f052f 100644 --- a/proxy/src/logging.rs +++ b/proxy/src/logging.rs @@ -18,8 +18,16 @@ pub async fn init() -> anyhow::Result { let env_filter = EnvFilter::builder() .with_default_directive(LevelFilter::INFO.into()) .from_env_lossy() - .add_directive("aws_config=info".parse().unwrap()) - .add_directive("azure_core::policies::transport=off".parse().unwrap()); + .add_directive( + "aws_config=info" + .parse() + .expect("this should be a valid filter directive"), + ) + .add_directive( + "azure_core::policies::transport=off" + .parse() + .expect("this should be a valid filter directive"), + ); let fmt_layer = tracing_subscriber::fmt::layer() .with_ansi(false) diff --git a/proxy/src/parse.rs b/proxy/src/parse.rs index 8c0f251066..095d6278cc 100644 --- a/proxy/src/parse.rs +++ b/proxy/src/parse.rs @@ -8,14 +8,6 @@ pub(crate) fn split_cstr(bytes: &[u8]) -> Option<(&CStr, &[u8])> { Some((cstr, other)) } -/// See . -pub(crate) fn split_at_const(bytes: &[u8]) -> Option<(&[u8; N], &[u8])> { - (bytes.len() >= N).then(|| { - let (head, tail) = bytes.split_at(N); - (head.try_into().unwrap(), tail) - }) -} - #[cfg(test)] mod tests { use super::*; @@ -33,11 +25,4 @@ mod tests { assert_eq!(cstr.to_bytes(), b"foo"); assert_eq!(rest, b"bar"); } - - #[test] - fn test_split_at_const() { - assert!(split_at_const::<0>(b"").is_some()); - assert!(split_at_const::<1>(b"").is_none()); - assert!(matches!(split_at_const::<1>(b"ok"), Some((b"o", b"k")))); - } } diff --git a/proxy/src/protocol2.rs b/proxy/src/protocol2.rs index 33a5eb5e1e..0dc97b7097 100644 --- a/proxy/src/protocol2.rs +++ b/proxy/src/protocol2.rs @@ -396,6 +396,7 @@ impl NetworkEndianIpv6 { } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use tokio::io::AsyncReadExt; diff --git a/proxy/src/proxy/copy_bidirectional.rs b/proxy/src/proxy/copy_bidirectional.rs index 4e4af88634..3336a9556a 100644 --- a/proxy/src/proxy/copy_bidirectional.rs +++ b/proxy/src/proxy/copy_bidirectional.rs @@ -257,6 +257,7 @@ impl CopyBuffer { } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use tokio::io::AsyncWriteExt; diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index 5db92d748a..4e5ecda237 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -488,7 +488,7 @@ impl NeonOptions { pub(crate) fn neon_option(bytes: &str) -> Option<(&str, &str)> { static RE: OnceCell = OnceCell::new(); - let re = RE.get_or_init(|| Regex::new(r"^neon_(\w+):(.+)").unwrap()); + let re = RE.get_or_init(|| Regex::new(r"^neon_(\w+):(.+)").expect("regex should be correct")); let cap = re.captures(bytes)?; let (_, [k, v]) = cap.extract(); diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index 3899ba4267..95c518fed9 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -1,4 +1,5 @@ //! A group of high-level tests for connection establishing logic and auth. 
+#![allow(clippy::unimplemented, clippy::unwrap_used)] mod mitm; diff --git a/proxy/src/rate_limiter/leaky_bucket.rs b/proxy/src/rate_limiter/leaky_bucket.rs index 45f9630dde..bff800f0a2 100644 --- a/proxy/src/rate_limiter/leaky_bucket.rs +++ b/proxy/src/rate_limiter/leaky_bucket.rs @@ -83,7 +83,7 @@ impl From for utils::leaky_bucket::LeakyBucketConfig { } #[cfg(test)] -#[allow(clippy::float_cmp)] +#[allow(clippy::float_cmp, clippy::unwrap_used)] mod tests { use std::time::Duration; diff --git a/proxy/src/rate_limiter/limit_algorithm/aimd.rs b/proxy/src/rate_limiter/limit_algorithm/aimd.rs index 3000cc4c2a..04e136b6d5 100644 --- a/proxy/src/rate_limiter/limit_algorithm/aimd.rs +++ b/proxy/src/rate_limiter/limit_algorithm/aimd.rs @@ -63,6 +63,7 @@ impl LimitAlgorithm for Aimd { } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use std::time::Duration; diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index a048721e77..6f6a8c9d47 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -12,6 +12,7 @@ use rand::{Rng, SeedableRng}; use tokio::time::{Duration, Instant}; use tracing::info; +use crate::ext::LockExt; use crate::intern::EndpointIdInt; pub struct GlobalRateLimiter { @@ -246,12 +247,13 @@ impl BucketRateLimiter { let n = self.map.shards().len(); // this lock is ok as the periodic cycle of do_gc makes this very unlikely to collide // (impossible, infact, unless we have 2048 threads) - let shard = self.rand.lock().unwrap().gen_range(0..n); + let shard = self.rand.lock_propagate_poison().gen_range(0..n); self.map.shards()[shard].write().clear(); } } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use std::hash::BuildHasherDefault; use std::time::Duration; diff --git a/proxy/src/redis/connection_with_credentials_provider.rs b/proxy/src/redis/connection_with_credentials_provider.rs index 82139ea1d5..0f6e765b02 100644 --- a/proxy/src/redis/connection_with_credentials_provider.rs +++ b/proxy/src/redis/connection_with_credentials_provider.rs @@ -69,7 +69,11 @@ impl ConnectionWithCredentialsProvider { pub fn new_with_static_credentials(params: T) -> Self { Self { - credentials: Credentials::Static(params.into_connection_info().unwrap()), + credentials: Credentials::Static( + params + .into_connection_info() + .expect("static configured redis credentials should be a valid format"), + ), con: None, refresh_token_task: None, mutex: tokio::sync::Mutex::new(()), diff --git a/proxy/src/sasl/messages.rs b/proxy/src/sasl/messages.rs index 1373dfba3d..4922ece615 100644 --- a/proxy/src/sasl/messages.rs +++ b/proxy/src/sasl/messages.rs @@ -2,7 +2,7 @@ use pq_proto::{BeAuthenticationSaslMessage, BeMessage}; -use crate::parse::{split_at_const, split_cstr}; +use crate::parse::split_cstr; /// SASL-specific payload of [`PasswordMessage`](pq_proto::FeMessage::PasswordMessage). 
#[derive(Debug)] @@ -19,7 +19,7 @@ impl<'a> FirstMessage<'a> { let (method_cstr, tail) = split_cstr(bytes)?; let method = method_cstr.to_str().ok()?; - let (len_bytes, bytes) = split_at_const(tail)?; + let (len_bytes, bytes) = tail.split_first_chunk()?; let len = u32::from_be_bytes(*len_bytes) as usize; if len != bytes.len() { return None; @@ -51,6 +51,7 @@ impl<'a> ServerMessage<&'a str> { } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use super::*; diff --git a/proxy/src/scram/messages.rs b/proxy/src/scram/messages.rs index 5ee3a51352..0e54e7ded9 100644 --- a/proxy/src/scram/messages.rs +++ b/proxy/src/scram/messages.rs @@ -185,6 +185,7 @@ impl fmt::Debug for OwnedServerFirstMessage { } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use super::*; diff --git a/proxy/src/scram/mod.rs b/proxy/src/scram/mod.rs index 718445f61d..b49a9f32ee 100644 --- a/proxy/src/scram/mod.rs +++ b/proxy/src/scram/mod.rs @@ -57,6 +57,7 @@ fn sha256<'a>(parts: impl IntoIterator) -> [u8; 32] { } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use super::threadpool::ThreadPool; use super::{Exchange, ServerSecret}; diff --git a/proxy/src/scram/secret.rs b/proxy/src/scram/secret.rs index 8c6a08d432..eb21b26ab4 100644 --- a/proxy/src/scram/secret.rs +++ b/proxy/src/scram/secret.rs @@ -72,6 +72,7 @@ impl ServerSecret { } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use super::*; diff --git a/proxy/src/scram/threadpool.rs b/proxy/src/scram/threadpool.rs index ebc6dd2a3c..8f1684c75b 100644 --- a/proxy/src/scram/threadpool.rs +++ b/proxy/src/scram/threadpool.rs @@ -33,14 +33,11 @@ thread_local! { } impl ThreadPool { - pub fn new(n_workers: u8) -> Arc { + pub fn new(mut n_workers: u8) -> Arc { // rayon would be nice here, but yielding in rayon does not work well afaict. if n_workers == 0 { - return Arc::new(Self { - runtime: None, - metrics: Arc::new(ThreadPoolMetrics::new(n_workers as usize)), - }); + n_workers = 1; } Arc::new_cyclic(|pool| { @@ -66,7 +63,7 @@ impl ThreadPool { }); }) .build() - .unwrap(); + .expect("password threadpool runtime should be configured correctly"); Self { runtime: Some(runtime), @@ -79,7 +76,7 @@ impl ThreadPool { JobHandle( self.runtime .as_ref() - .unwrap() + .expect("runtime is always set") .spawn(JobSpec { pbkdf2, endpoint }), ) } @@ -87,7 +84,10 @@ impl ThreadPool { impl Drop for ThreadPool { fn drop(&mut self) { - self.runtime.take().unwrap().shutdown_background(); + self.runtime + .take() + .expect("runtime is always set") + .shutdown_background(); } } diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 15d883bdb0..449d50b6e7 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -268,7 +268,11 @@ impl PoolingBackend { if !self.local_pool.initialized(&conn_info) { // only install and grant usage one at a time. 
- let _permit = local_backend.initialize.acquire().await.unwrap(); + let _permit = local_backend + .initialize + .acquire() + .await + .expect("semaphore should never be closed"); // check again for race if !self.local_pool.initialized(&conn_info) { diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index cac5a173cb..447103edce 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -186,8 +186,8 @@ impl ClientDataRemote { } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { - use std::mem; use std::sync::atomic::AtomicBool; use super::*; @@ -269,39 +269,33 @@ mod tests { assert_eq!(0, pool.get_global_connections_count()); } { - let mut client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone()); - client.do_drop().unwrap()(); - mem::forget(client); // drop the client + let client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone()); + drop(client); assert_eq!(1, pool.get_global_connections_count()); } { - let mut closed_client = Client::new( + let closed_client = Client::new( create_inner_with(MockClient::new(true)), conn_info.clone(), ep_pool.clone(), ); - closed_client.do_drop().unwrap()(); - mem::forget(closed_client); // drop the client - // The closed client shouldn't be added to the pool. + drop(closed_client); assert_eq!(1, pool.get_global_connections_count()); } let is_closed: Arc = Arc::new(false.into()); { - let mut client = Client::new( + let client = Client::new( create_inner_with(MockClient(is_closed.clone())), conn_info.clone(), ep_pool.clone(), ); - client.do_drop().unwrap()(); - mem::forget(client); // drop the client - + drop(client); // The client should be added to the pool. assert_eq!(2, pool.get_global_connections_count()); } { - let mut client = Client::new(create_inner(), conn_info, ep_pool); - client.do_drop().unwrap()(); - mem::forget(client); // drop the client + let client = Client::new(create_inner(), conn_info, ep_pool); + drop(client); // The client shouldn't be added to the pool. Because the ep-pool is full. assert_eq!(2, pool.get_global_connections_count()); @@ -319,15 +313,13 @@ mod tests { &pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key().unwrap()), ); { - let mut client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone()); - client.do_drop().unwrap()(); - mem::forget(client); // drop the client + let client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone()); + drop(client); assert_eq!(3, pool.get_global_connections_count()); } { - let mut client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone()); - client.do_drop().unwrap()(); - mem::forget(client); // drop the client + let client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone()); + drop(client); // The client shouldn't be added to the pool. Because the global pool is full. 
assert_eq!(3, pool.get_global_connections_count()); diff --git a/proxy/src/serverless/conn_pool_lib.rs b/proxy/src/serverless/conn_pool_lib.rs index 2a46c8f9c5..44eac77e8f 100644 --- a/proxy/src/serverless/conn_pool_lib.rs +++ b/proxy/src/serverless/conn_pool_lib.rs @@ -187,19 +187,22 @@ impl EndpointConnPool { pub(crate) fn put(pool: &RwLock, conn_info: &ConnInfo, client: ClientInnerCommon) { let conn_id = client.get_conn_id(); - let pool_name = pool.read().get_name().to_string(); + let (max_conn, conn_count, pool_name) = { + let pool = pool.read(); + ( + pool.global_pool_size_max_conns, + pool.global_connections_count + .load(atomic::Ordering::Relaxed), + pool.get_name().to_string(), + ) + }; + if client.inner.is_closed() { info!(%conn_id, "{}: throwing away connection '{conn_info}' because connection is closed", pool_name); return; } - let global_max_conn = pool.read().global_pool_size_max_conns; - if pool - .read() - .global_connections_count - .load(atomic::Ordering::Relaxed) - >= global_max_conn - { + if conn_count >= max_conn { info!(%conn_id, "{}: throwing away connection '{conn_info}' because pool is full", pool_name); return; } @@ -633,35 +636,29 @@ impl Client { } pub(crate) fn metrics(&self) -> Arc { - let aux = &self.inner.as_ref().unwrap().aux; + let aux = &self + .inner + .as_ref() + .expect("client inner should not be removed") + .aux; USAGE_METRICS.register(Ids { endpoint_id: aux.endpoint_id, branch_id: aux.branch_id, }) } +} - pub(crate) fn do_drop(&mut self) -> Option> { +impl Drop for Client { + fn drop(&mut self) { let conn_info = self.conn_info.clone(); let client = self .inner .take() .expect("client inner should not be removed"); if let Some(conn_pool) = std::mem::take(&mut self.pool).upgrade() { - let current_span = self.span.clone(); + let _current_span = self.span.enter(); // return connection to the pool - return Some(move || { - let _span = current_span.enter(); - EndpointConnPool::put(&conn_pool, &conn_info, client); - }); - } - None - } -} - -impl Drop for Client { - fn drop(&mut self) { - if let Some(drop) = self.do_drop() { - tokio::task::spawn_blocking(drop); + EndpointConnPool::put(&conn_pool, &conn_info, client); } } } diff --git a/proxy/src/serverless/http_util.rs b/proxy/src/serverless/http_util.rs index c0208d4f68..d5c948777c 100644 --- a/proxy/src/serverless/http_util.rs +++ b/proxy/src/serverless/http_util.rs @@ -81,11 +81,14 @@ impl HttpErrorBody { .header(http::header::CONTENT_TYPE, "application/json") // we do not have nested maps with non string keys so serialization shouldn't fail .body( - Full::new(Bytes::from(serde_json::to_string(self).unwrap())) - .map_err(|x| match x {}) - .boxed(), + Full::new(Bytes::from( + serde_json::to_string(self) + .expect("serialising HttpErrorBody should never fail"), + )) + .map_err(|x| match x {}) + .boxed(), ) - .unwrap() + .expect("content-type header should be valid") } } diff --git a/proxy/src/serverless/json.rs b/proxy/src/serverless/json.rs index 25b25c66d3..ab012bd020 100644 --- a/proxy/src/serverless/json.rs +++ b/proxy/src/serverless/json.rs @@ -204,7 +204,10 @@ fn pg_array_parse_inner( if c == '\\' { escaped = true; - (i, c) = pg_array_chr.next().unwrap(); + let Some(x) = pg_array_chr.next() else { + return Err(JsonConversionError::UnbalancedArray); + }; + (i, c) = x; } match c { @@ -253,6 +256,7 @@ fn pg_array_parse_inner( } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use serde_json::json; diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs index 
b84cde9e25..c51a2bc9ba 100644 --- a/proxy/src/serverless/local_conn_pool.rs +++ b/proxy/src/serverless/local_conn_pool.rs @@ -179,7 +179,6 @@ pub(crate) fn poll_client( info!(cold_start_info = cold_start_info.as_str(), %conn_info, %session_id, "new connection"); }); let pool = Arc::downgrade(&global_pool); - let pool_clone = pool.clone(); let db_user = conn_info.db_and_user(); let idle = global_pool.get_idle_timeout(); @@ -273,11 +272,7 @@ pub(crate) fn poll_client( }), }; - Client::new( - inner, - conn_info, - Arc::downgrade(&pool_clone.upgrade().unwrap().global_pool), - ) + Client::new(inner, conn_info, Arc::downgrade(&global_pool.global_pool)) } impl ClientInnerCommon { @@ -321,7 +316,8 @@ fn resign_jwt(sk: &SigningKey, payload: &[u8], jti: u64) -> Result(buffer.format(jti)).unwrap(); + let jti = serde_json::from_str::<&RawValue>(buffer.format(jti)) + .expect("itoa formatted integer should be guaranteed valid json"); // update the jti in-place let payload = @@ -368,6 +364,7 @@ fn sign_jwt(sk: &SigningKey, payload: &[u8]) -> String { } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use p256::ecdsa::SigningKey; use typed_json::json; diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs index 80b42f9e55..c2623e0eca 100644 --- a/proxy/src/serverless/mod.rs +++ b/proxy/src/serverless/mod.rs @@ -46,6 +46,7 @@ use utils::http::error::ApiError; use crate::cancellation::CancellationHandlerMain; use crate::config::{ProxyConfig, ProxyProtocolV2}; use crate::context::RequestContext; +use crate::ext::TaskExt; use crate::metrics::Metrics; use crate::protocol2::{read_proxy_protocol, ChainRW, ConnectHeader, ConnectionInfo}; use crate::proxy::run_until_cancelled; @@ -84,7 +85,7 @@ pub async fn task_main( cancellation_token.cancelled().await; tokio::task::spawn_blocking(move || conn_pool.shutdown()) .await - .unwrap(); + .propagate_task_panic(); } }); @@ -104,7 +105,7 @@ pub async fn task_main( cancellation_token.cancelled().await; tokio::task::spawn_blocking(move || http_conn_pool.shutdown()) .await - .unwrap(); + .propagate_task_panic(); } }); diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 5e85f5ec40..3e42787a09 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -1110,6 +1110,7 @@ impl Discard<'_> { } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use super::*; diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index bdb83fe6be..812fedaf04 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -178,6 +178,7 @@ pub(crate) async fn serve_websocket( } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use std::pin::pin; diff --git a/proxy/src/url.rs b/proxy/src/url.rs index 270cd7c24d..d73a84057a 100644 --- a/proxy/src/url.rs +++ b/proxy/src/url.rs @@ -50,6 +50,7 @@ impl std::fmt::Display for ApiUrl { } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use super::*; diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs index 65e74466f2..487504d709 100644 --- a/proxy/src/usage_metrics.rs +++ b/proxy/src/usage_metrics.rs @@ -407,6 +407,7 @@ async fn upload_backup_events( } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use std::fs; use std::io::BufReader; From 28ccda0a63f9661d961dbb1e5372f4a78d80346d Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 16 Dec 2024 18:10:55 +0100 Subject: [PATCH 054/583] test_runner: ignore error in `test_timeline_archival_chaos` 
(#10161) Resolves #10159. --- test_runner/regress/test_timeline_archive.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index addf702893..87579f9e92 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -426,6 +426,7 @@ def test_timeline_archival_chaos(neon_env_builder: NeonEnvBuilder): [ ".*removing local file.*because it has unexpected length.*", ".*__temp.*", + ".*method=POST path=\\S+/timeline .*: Not activating a Stopping timeline.*", # FIXME: there are still anyhow::Error paths in timeline creation/deletion which # generate 500 results when called during shutdown (https://github.com/neondatabase/neon/issues/9768) ".*InternalServerError.*", From aa7ab9b3aca4c399e6caa5fd8fb31e6f0a7723b0 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Mon, 16 Dec 2024 19:56:24 +0100 Subject: [PATCH 055/583] proxy: Allow dumping TLS session keys for debugging (#10163) ## Problem To debug issues with TLS connections there's no easy way to decrypt packets unless a client has special support for logging the keys. ## Summary of changes Add TLS session keys logging to proxy via `SSLKEYLOGFILE` env var gated by flag. --- proxy/src/bin/proxy.rs | 4 ++++ proxy/src/config.rs | 6 ++++++ 2 files changed, 10 insertions(+) diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 97c4037009..e90555e250 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -105,6 +105,9 @@ struct ProxyCliArgs { /// tls-key and tls-cert are for backwards compatibility, we can put all certs in one dir #[clap(short = 'c', long, alias = "ssl-cert")] tls_cert: Option, + /// Allow writing TLS session keys to the given file pointed to by the environment variable `SSLKEYLOGFILE`. + #[clap(long, alias = "allow-ssl-keylogfile")] + allow_tls_keylogfile: bool, /// path to directory with TLS certificates for client postgres connections #[clap(long)] certs_dir: Option, @@ -555,6 +558,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { key_path, cert_path, args.certs_dir.as_ref(), + args.allow_tls_keylogfile, )?), (None, None) => None, _ => bail!("either both or neither tls-key and tls-cert must be specified"), diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 1f991d595e..debd77ac32 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -95,6 +95,7 @@ pub fn configure_tls( key_path: &str, cert_path: &str, certs_dir: Option<&String>, + allow_tls_keylogfile: bool, ) -> anyhow::Result { let mut cert_resolver = CertResolver::new(); @@ -135,6 +136,11 @@ pub fn configure_tls( config.alpn_protocols = vec![PG_ALPN_PROTOCOL.to_vec()]; + if allow_tls_keylogfile { + // KeyLogFile will check for the SSLKEYLOGFILE environment variable. + config.key_log = Arc::new(rustls::KeyLogFile::new()); + } + Ok(TlsConfig { config: Arc::new(config), common_names, From e226d7a3d1901c1a89f640c8d95b2efd057628b1 Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Tue, 17 Dec 2024 09:16:54 +0100 Subject: [PATCH 056/583] Fix docker compose with PG17 (#10165) ## Problem It's impossible to run docker compose with compute v17 due to `pg_anon` extension which is not supported under PG17. 
## Summary of changes The auto-loading of `pg_anon` is disabled by default --- .../compute_wrapper/var/db/postgres/specs/spec.json | 5 ----- docker-compose/docker_compose_test.sh | 12 ++++++------ 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json b/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json index 8e582e74e1..0308cab451 100644 --- a/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json +++ b/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json @@ -132,11 +132,6 @@ "name": "cron.database", "value": "postgres", "vartype": "string" - }, - { - "name": "session_preload_libraries", - "value": "anon", - "vartype": "string" } ] }, diff --git a/docker-compose/docker_compose_test.sh b/docker-compose/docker_compose_test.sh index c97dfaa901..063664d0c6 100755 --- a/docker-compose/docker_compose_test.sh +++ b/docker-compose/docker_compose_test.sh @@ -35,11 +35,11 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do echo "clean up containers if exists" cleanup PG_TEST_VERSION=$((pg_version < 16 ? 16 : pg_version)) - # The support of pg_anon not yet added to PG17, so we have to remove the corresponding option - if [ $pg_version -eq 17 ]; then + # The support of pg_anon not yet added to PG17, so we have to add the corresponding option for other PG versions + if [ "${pg_version}" -ne 17 ]; then SPEC_PATH="compute_wrapper/var/db/postgres/specs" mv $SPEC_PATH/spec.json $SPEC_PATH/spec.bak - jq 'del(.cluster.settings[] | select (.name == "session_preload_libraries"))' $SPEC_PATH/spec.bak > $SPEC_PATH/spec.json + jq '.cluster.settings += [{"name": "session_preload_libraries","value": "anon","vartype": "string"}]' "${SPEC_PATH}/spec.bak" > "${SPEC_PATH}/spec.json" fi PG_VERSION=$pg_version PG_TEST_VERSION=$PG_TEST_VERSION docker compose --profile test-extensions -f $COMPOSE_FILE up --build -d @@ -106,8 +106,8 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do fi fi cleanup - # The support of pg_anon not yet added to PG17, so we have to remove the corresponding option - if [ $pg_version -eq 17 ]; then - mv $SPEC_PATH/spec.bak $SPEC_PATH/spec.json + # Restore the original spec.json + if [ "$pg_version" -ne 17 ]; then + mv "$SPEC_PATH/spec.bak" "$SPEC_PATH/spec.json" fi done From b0e43c2f88bf81473c931e3dcf50a22d62f3ebd4 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 17 Dec 2024 11:35:00 +0100 Subject: [PATCH 057/583] postgres_ffi: add `WalStreamDecoder::complete_record()` benchmark (#10158) Touches #10097. 
--- Cargo.lock | 2 + libs/postgres_ffi/Cargo.toml | 6 +++ libs/postgres_ffi/benches/README.md | 26 +++++++++++++ libs/postgres_ffi/benches/waldecoder.rs | 49 +++++++++++++++++++++++++ libs/postgres_ffi/src/wal_generator.rs | 16 ++++++++ 5 files changed, 99 insertions(+) create mode 100644 libs/postgres_ffi/benches/README.md create mode 100644 libs/postgres_ffi/benches/waldecoder.rs diff --git a/Cargo.lock b/Cargo.lock index d1f7746969..5ec5253719 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4401,11 +4401,13 @@ dependencies = [ "bindgen", "bytes", "crc32c", + "criterion", "env_logger", "log", "memoffset 0.9.0", "once_cell", "postgres", + "pprof", "regex", "serde", "thiserror", diff --git a/libs/postgres_ffi/Cargo.toml b/libs/postgres_ffi/Cargo.toml index e1f5443cbe..b7a376841d 100644 --- a/libs/postgres_ffi/Cargo.toml +++ b/libs/postgres_ffi/Cargo.toml @@ -9,9 +9,11 @@ regex.workspace = true bytes.workspace = true anyhow.workspace = true crc32c.workspace = true +criterion.workspace = true once_cell.workspace = true log.workspace = true memoffset.workspace = true +pprof.workspace = true thiserror.workspace = true serde.workspace = true utils.workspace = true @@ -24,3 +26,7 @@ postgres.workspace = true [build-dependencies] anyhow.workspace = true bindgen.workspace = true + +[[bench]] +name = "waldecoder" +harness = false diff --git a/libs/postgres_ffi/benches/README.md b/libs/postgres_ffi/benches/README.md new file mode 100644 index 0000000000..00a8980174 --- /dev/null +++ b/libs/postgres_ffi/benches/README.md @@ -0,0 +1,26 @@ +## Benchmarks + +To run benchmarks: + +```sh +# All benchmarks. +cargo bench --package postgres_ffi + +# Specific file. +cargo bench --package postgres_ffi --bench waldecoder + +# Specific benchmark. +cargo bench --package postgres_ffi --bench waldecoder complete_record/size=1024 + +# List available benchmarks. +cargo bench --package postgres_ffi --benches -- --list + +# Generate flamegraph profiles using pprof-rs, profiling for 10 seconds. +# Output in target/criterion/*/profile/flamegraph.svg. +cargo bench --package postgres_ffi --bench waldecoder complete_record/size=1024 -- --profile-time 10 +``` + +Additional charts and statistics are available in `target/criterion/report/index.html`. + +Benchmarks are automatically compared against the previous run. To compare against other runs, see +`--baseline` and `--save-baseline`. \ No newline at end of file diff --git a/libs/postgres_ffi/benches/waldecoder.rs b/libs/postgres_ffi/benches/waldecoder.rs new file mode 100644 index 0000000000..c8cf0d322a --- /dev/null +++ b/libs/postgres_ffi/benches/waldecoder.rs @@ -0,0 +1,49 @@ +use std::ffi::CStr; + +use criterion::{criterion_group, criterion_main, Bencher, Criterion}; +use postgres_ffi::v17::wal_generator::LogicalMessageGenerator; +use postgres_ffi::v17::waldecoder_handler::WalStreamDecoderHandler; +use postgres_ffi::waldecoder::WalStreamDecoder; +use pprof::criterion::{Output, PProfProfiler}; +use utils::lsn::Lsn; + +const KB: usize = 1024; + +// Register benchmarks with Criterion. +criterion_group!( + name = benches; + config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None))); + targets = bench_complete_record, +); +criterion_main!(benches); + +/// Benchmarks WalStreamDecoder::complete_record() for a logical message of varying size. 
+fn bench_complete_record(c: &mut Criterion) { + let mut g = c.benchmark_group("complete_record"); + for size in [64, KB, 8 * KB, 128 * KB] { + // Kind of weird to change the group throughput per benchmark, but it's the only way + // to vary it per benchmark. It works. + g.throughput(criterion::Throughput::Bytes(size as u64)); + g.bench_function(format!("size={size}"), |b| run_bench(b, size).unwrap()); + } + + fn run_bench(b: &mut Bencher, size: usize) -> anyhow::Result<()> { + const PREFIX: &CStr = c""; + let value_size = LogicalMessageGenerator::make_value_size(size, PREFIX); + let value = vec![1; value_size]; + + let mut decoder = WalStreamDecoder::new(Lsn(0), 170000); + let msg = LogicalMessageGenerator::new(PREFIX, &value) + .next() + .unwrap() + .encode(Lsn(0)); + assert_eq!(msg.len(), size); + + b.iter(|| { + let msg = msg.clone(); // Bytes::clone() is cheap + decoder.complete_record(msg).unwrap(); + }); + + Ok(()) + } +} diff --git a/libs/postgres_ffi/src/wal_generator.rs b/libs/postgres_ffi/src/wal_generator.rs index dc679eea33..69cc4b771f 100644 --- a/libs/postgres_ffi/src/wal_generator.rs +++ b/libs/postgres_ffi/src/wal_generator.rs @@ -231,6 +231,22 @@ impl LogicalMessageGenerator { }; [&header.encode(), prefix, message].concat().into() } + + /// Computes how large a value must be to get a record of the given size. Convenience method to + /// construct records of pre-determined size. Panics if the record size is too small. + pub fn make_value_size(record_size: usize, prefix: &CStr) -> usize { + let xlog_header_size = XLOG_SIZE_OF_XLOG_RECORD; + let lm_header_size = size_of::(); + let prefix_size = prefix.to_bytes_with_nul().len(); + let data_header_size = match record_size - xlog_header_size - 2 { + 0..=255 => 2, + 256..=258 => panic!("impossible record_size {record_size}"), + 259.. => 5, + }; + record_size + .checked_sub(xlog_header_size + lm_header_size + prefix_size + data_header_size) + .expect("record_size too small") + } } impl Iterator for LogicalMessageGenerator { From b5833ef2594b39b6616526b60cbc8c506e5870fc Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 17 Dec 2024 12:24:51 +0000 Subject: [PATCH 058/583] remote_storage: configurable connection pooling for ABS (#10169) ## Problem The ABS SDK's default behavior is to do no connection pooling, i.e. open and close a fresh connection for each request. Under high request rates, this can result in an accumulation of TCP connections in TIME_WAIT or CLOSE_WAIT state, and in extreme cases exhaustion of client ports. Related: https://github.com/neondatabase/cloud/issues/20971 ## Summary of changes - Add a configurable `conn_pool_size` parameter for Azure storage, defaulting to zero (current behavior) - Construct a custom reqwest client using this connection pool size. 
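For illustration, the change boils down to the following pattern: build a `reqwest` client with a bounded idle-connection pool and hand it to the Azure SDK as a custom transport. This is a minimal sketch distilled from the diff below, not the exact code; the helper name `pooled_transport` is illustrative (the patch wires an equivalent client into the `ClientBuilder::transport(...)` call shown in the diff), and it assumes `azure_core`'s default `reqwest`-backed `HttpClient` implementation.

```rust
use std::sync::Arc;

use azure_core::{HttpClient, TransportOptions};

/// Build an Azure SDK transport backed by a `reqwest` client that keeps up to
/// `conn_pool_size` idle connections per host. A value of 0 preserves the old
/// behavior of opening and closing a fresh connection for every request.
fn pooled_transport(conn_pool_size: usize) -> TransportOptions {
    let client: Arc<dyn HttpClient> = Arc::new(
        reqwest::ClientBuilder::new()
            .pool_max_idle_per_host(conn_pool_size)
            .build()
            .expect("failed to build `reqwest` client"),
    );
    TransportOptions::new(client)
}
```

A small positive `conn_pool_size` lets connections be reused under high request rates and avoids piling up sockets in TIME_WAIT/CLOSE_WAIT.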
--- Cargo.lock | 1 + libs/remote_storage/Cargo.toml | 1 + libs/remote_storage/src/azure_blob.rs | 20 ++++++++++++++++++-- libs/remote_storage/src/config.rs | 14 ++++++++++++++ libs/remote_storage/tests/test_real_azure.rs | 1 + 5 files changed, 35 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5ec5253719..b9b89efa02 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5064,6 +5064,7 @@ dependencies = [ "once_cell", "pin-project-lite", "rand 0.8.5", + "reqwest", "scopeguard", "serde", "serde_json", diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index 1816825bda..33fa6e89f5 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -18,6 +18,7 @@ camino = { workspace = true, features = ["serde1"] } humantime-serde.workspace = true hyper = { workspace = true, features = ["client"] } futures.workspace = true +reqwest.workspace = true serde.workspace = true serde_json.workspace = true tokio = { workspace = true, features = ["sync", "fs", "io-util"] } diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index 19c8251ccd..c89f50ef2b 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -8,6 +8,7 @@ use std::io; use std::num::NonZeroU32; use std::pin::Pin; use std::str::FromStr; +use std::sync::Arc; use std::time::Duration; use std::time::SystemTime; @@ -15,6 +16,8 @@ use super::REMOTE_STORAGE_PREFIX_SEPARATOR; use anyhow::Context; use anyhow::Result; use azure_core::request_options::{IfMatchCondition, MaxResults, Metadata, Range}; +use azure_core::HttpClient; +use azure_core::TransportOptions; use azure_core::{Continuable, RetryOptions}; use azure_storage::StorageCredentials; use azure_storage_blobs::blob::CopyStatus; @@ -80,8 +83,13 @@ impl AzureBlobStorage { StorageCredentials::token_credential(token_credential) }; - // we have an outer retry - let builder = ClientBuilder::new(account, credentials).retry(RetryOptions::none()); + let builder = ClientBuilder::new(account, credentials) + // we have an outer retry + .retry(RetryOptions::none()) + // Customize transport to configure conneciton pooling + .transport(TransportOptions::new(Self::reqwest_client( + azure_config.conn_pool_size, + ))); let client = builder.container_client(azure_config.container_name.to_owned()); @@ -106,6 +114,14 @@ impl AzureBlobStorage { }) } + fn reqwest_client(conn_pool_size: usize) -> Arc { + let client = reqwest::ClientBuilder::new() + .pool_max_idle_per_host(conn_pool_size) + .build() + .expect("failed to build `reqwest` client"); + Arc::new(client) + } + pub fn relative_path_to_name(&self, path: &RemotePath) -> String { assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR); let path_string = path.get_path().as_str(); diff --git a/libs/remote_storage/src/config.rs b/libs/remote_storage/src/config.rs index f6ef31077c..dd49d4d5e7 100644 --- a/libs/remote_storage/src/config.rs +++ b/libs/remote_storage/src/config.rs @@ -114,6 +114,16 @@ fn default_max_keys_per_list_response() -> Option { DEFAULT_MAX_KEYS_PER_LIST_RESPONSE } +fn default_azure_conn_pool_size() -> usize { + // Conservative default: no connection pooling. 
At time of writing this is the Azure + // SDK's default as well, due to historic reports of hard-to-reproduce issues + // (https://github.com/hyperium/hyper/issues/2312) + // + // However, using connection pooling is important to avoid exhausting client ports when + // doing huge numbers of requests (https://github.com/neondatabase/cloud/issues/20971) + 0 +} + impl Debug for S3Config { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("S3Config") @@ -146,6 +156,8 @@ pub struct AzureConfig { pub concurrency_limit: NonZeroUsize, #[serde(default = "default_max_keys_per_list_response")] pub max_keys_per_list_response: Option, + #[serde(default = "default_azure_conn_pool_size")] + pub conn_pool_size: usize, } fn default_remote_storage_azure_concurrency_limit() -> NonZeroUsize { @@ -302,6 +314,7 @@ timeout = '5s'"; container_region = 'westeurope' upload_storage_class = 'INTELLIGENT_TIERING' timeout = '7s' + conn_pool_size = 8 "; let config = parse(toml).unwrap(); @@ -316,6 +329,7 @@ timeout = '5s'"; prefix_in_container: None, concurrency_limit: default_remote_storage_azure_concurrency_limit(), max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, + conn_pool_size: 8, }), timeout: Duration::from_secs(7), small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT diff --git a/libs/remote_storage/tests/test_real_azure.rs b/libs/remote_storage/tests/test_real_azure.rs index 92d579fec8..15004dbf83 100644 --- a/libs/remote_storage/tests/test_real_azure.rs +++ b/libs/remote_storage/tests/test_real_azure.rs @@ -218,6 +218,7 @@ async fn create_azure_client( prefix_in_container: Some(format!("test_{millis}_{random:08x}/")), concurrency_limit: NonZeroUsize::new(100).unwrap(), max_keys_per_list_response, + conn_pool_size: 8, }), timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT, From 2dfd3cab8cd7840afee57acffaa45aa803d04f20 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Tue, 17 Dec 2024 17:14:07 +0100 Subject: [PATCH 059/583] fix(compute): Report compute_backpressure_throttling_seconds as counter (#10125) ## Problem It was reported as `gauge`, but it's actually a `counter`. Also add `_total` suffix as that's the convention for counters. 
The corresponding flux-fleet PR: https://github.com/neondatabase/flux-fleet/pull/386 --- compute/etc/neon_collector.jsonnet | 2 +- ...compute_backpressure_throttling_seconds_total.libsonnet} | 6 +++--- ...ql => compute_backpressure_throttling_seconds_total.sql} | 0 3 files changed, 4 insertions(+), 4 deletions(-) rename compute/etc/sql_exporter/{compute_backpressure_throttling_seconds.libsonnet => compute_backpressure_throttling_seconds_total.libsonnet} (61%) rename compute/etc/sql_exporter/{compute_backpressure_throttling_seconds.sql => compute_backpressure_throttling_seconds_total.sql} (100%) diff --git a/compute/etc/neon_collector.jsonnet b/compute/etc/neon_collector.jsonnet index aa6cc1cfc8..f8f4cab63b 100644 --- a/compute/etc/neon_collector.jsonnet +++ b/compute/etc/neon_collector.jsonnet @@ -3,7 +3,7 @@ metrics: [ import 'sql_exporter/checkpoints_req.libsonnet', import 'sql_exporter/checkpoints_timed.libsonnet', - import 'sql_exporter/compute_backpressure_throttling_seconds.libsonnet', + import 'sql_exporter/compute_backpressure_throttling_seconds_total.libsonnet', import 'sql_exporter/compute_current_lsn.libsonnet', import 'sql_exporter/compute_logical_snapshot_files.libsonnet', import 'sql_exporter/compute_logical_snapshots_bytes.libsonnet', diff --git a/compute/etc/sql_exporter/compute_backpressure_throttling_seconds.libsonnet b/compute/etc/sql_exporter/compute_backpressure_throttling_seconds_total.libsonnet similarity index 61% rename from compute/etc/sql_exporter/compute_backpressure_throttling_seconds.libsonnet rename to compute/etc/sql_exporter/compute_backpressure_throttling_seconds_total.libsonnet index 02c803cfa6..31725bd179 100644 --- a/compute/etc/sql_exporter/compute_backpressure_throttling_seconds.libsonnet +++ b/compute/etc/sql_exporter/compute_backpressure_throttling_seconds_total.libsonnet @@ -1,10 +1,10 @@ { - metric_name: 'compute_backpressure_throttling_seconds', - type: 'gauge', + metric_name: 'compute_backpressure_throttling_seconds_total', + type: 'counter', help: 'Time compute has spent throttled', key_labels: null, values: [ 'throttled', ], - query: importstr 'sql_exporter/compute_backpressure_throttling_seconds.sql', + query: importstr 'sql_exporter/compute_backpressure_throttling_seconds_total.sql', } diff --git a/compute/etc/sql_exporter/compute_backpressure_throttling_seconds.sql b/compute/etc/sql_exporter/compute_backpressure_throttling_seconds_total.sql similarity index 100% rename from compute/etc/sql_exporter/compute_backpressure_throttling_seconds.sql rename to compute/etc/sql_exporter/compute_backpressure_throttling_seconds_total.sql From 007b13b79af0fa75151a8fba58a65d3c4933942f Mon Sep 17 00:00:00 2001 From: Mikhail Kot Date: Tue, 17 Dec 2024 16:43:54 +0000 Subject: [PATCH 060/583] Don't build tests in compute image, use ninja (#10149) Don't build tests in h3 and rdkit: ~15 min speedup. Use Ninja as cmake generator where possible: ~10 min speedup. 
Clean apt cache for smaller images: around 250mb size loss for intermediate layers --- compute/compute-node.Dockerfile | 78 ++++++++++++++++++++------------- 1 file changed, 48 insertions(+), 30 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 33d2a10285..1e11efeaf8 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -35,10 +35,12 @@ RUN case $DEBIAN_VERSION in \ ;; \ esac && \ apt update && \ - apt install --no-install-recommends -y git autoconf automake libtool build-essential bison flex libreadline-dev \ + apt install --no-install-recommends --no-install-suggests -y \ + ninja-build git autoconf automake libtool build-essential bison flex libreadline-dev \ zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget ca-certificates pkg-config libssl-dev \ libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd \ - $VERSION_INSTALLS + $VERSION_INSTALLS \ + && apt clean && rm -rf /var/lib/apt/lists/* ######################################################################################### # @@ -113,10 +115,12 @@ ARG DEBIAN_VERSION ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ - apt install --no-install-recommends -y gdal-bin libboost-dev libboost-thread-dev libboost-filesystem-dev \ + apt install --no-install-recommends --no-install-suggests -y \ + gdal-bin libboost-dev libboost-thread-dev libboost-filesystem-dev \ libboost-system-dev libboost-iostreams-dev libboost-program-options-dev libboost-timer-dev \ libcgal-dev libgdal-dev libgmp-dev libmpfr-dev libopenscenegraph-dev libprotobuf-c-dev \ - protobuf-c-compiler xsltproc + protobuf-c-compiler xsltproc \ + && apt clean && rm -rf /var/lib/apt/lists/* # Postgis 3.5.0 requires SFCGAL 1.4+ @@ -143,9 +147,9 @@ RUN case "${DEBIAN_VERSION}" in \ wget https://gitlab.com/sfcgal/SFCGAL/-/archive/v${SFCGAL_VERSION}/SFCGAL-v${SFCGAL_VERSION}.tar.gz -O SFCGAL.tar.gz && \ echo "${SFCGAL_CHECKSUM} SFCGAL.tar.gz" | sha256sum --check && \ mkdir sfcgal-src && cd sfcgal-src && tar xzf ../SFCGAL.tar.gz --strip-components=1 -C . && \ - cmake -DCMAKE_BUILD_TYPE=Release . && make -j $(getconf _NPROCESSORS_ONLN) && \ - DESTDIR=/sfcgal make install -j $(getconf _NPROCESSORS_ONLN) && \ - make clean && cp -R /sfcgal/* / + cmake -DCMAKE_BUILD_TYPE=Release -GNinja . && ninja -j $(getconf _NPROCESSORS_ONLN) && \ + DESTDIR=/sfcgal ninja install -j $(getconf _NPROCESSORS_ONLN) && \ + ninja clean && cp -R /sfcgal/* / ENV PATH="/usr/local/pgsql/bin:$PATH" @@ -213,9 +217,9 @@ RUN case "${PG_VERSION}" in \ echo "${PGROUTING_CHECKSUM} pgrouting.tar.gz" | sha256sum --check && \ mkdir pgrouting-src && cd pgrouting-src && tar xzf ../pgrouting.tar.gz --strip-components=1 -C . && \ mkdir build && cd build && \ - cmake -DCMAKE_BUILD_TYPE=Release .. && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ - make -j $(getconf _NPROCESSORS_ONLN) install && \ + cmake -GNinja -DCMAKE_BUILD_TYPE=Release .. 
&& \ + ninja -j $(getconf _NPROCESSORS_ONLN) && \ + ninja -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control && \ find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\ cp /usr/local/pgsql/share/extension/pgrouting.control /extensions/postgis && \ @@ -235,7 +239,9 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY compute/patches/plv8-3.1.10.patch /plv8-3.1.10.patch RUN apt update && \ - apt install --no-install-recommends -y ninja-build python3-dev libncurses5 binutils clang + apt install --no-install-recommends --no-install-suggests -y \ + ninja-build python3-dev libncurses5 binutils clang \ + && apt clean && rm -rf /var/lib/apt/lists/* # plv8 3.2.3 supports v17 # last release v3.2.3 - Sep 7, 2024 @@ -301,9 +307,10 @@ RUN mkdir -p /h3/usr/ && \ echo "ec99f1f5974846bde64f4513cf8d2ea1b8d172d2218ab41803bf6a63532272bc h3.tar.gz" | sha256sum --check && \ mkdir h3-src && cd h3-src && tar xzf ../h3.tar.gz --strip-components=1 -C . && \ mkdir build && cd build && \ - cmake .. -DCMAKE_BUILD_TYPE=Release && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ - DESTDIR=/h3 make install && \ + cmake .. -GNinja -DBUILD_BENCHMARKS=0 -DCMAKE_BUILD_TYPE=Release \ + -DBUILD_FUZZERS=0 -DBUILD_FILTERS=0 -DBUILD_GENERATORS=0 -DBUILD_TESTING=0 \ + && ninja -j $(getconf _NPROCESSORS_ONLN) && \ + DESTDIR=/h3 ninja install && \ cp -R /h3/usr / && \ rm -rf build @@ -650,14 +657,15 @@ FROM build-deps AS rdkit-pg-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN apt-get update && \ - apt-get install --no-install-recommends -y \ +RUN apt update && \ + apt install --no-install-recommends --no-install-suggests -y \ libboost-iostreams1.74-dev \ libboost-regex1.74-dev \ libboost-serialization1.74-dev \ libboost-system1.74-dev \ libeigen3-dev \ - libboost-all-dev + libboost-all-dev \ + && apt clean && rm -rf /var/lib/apt/lists/* # rdkit Release_2024_09_1 supports v17 # last release Release_2024_09_1 - Sep 27, 2024 @@ -693,6 +701,8 @@ RUN case "${PG_VERSION}" in \ -D RDK_BUILD_MOLINTERCHANGE_SUPPORT=OFF \ -D RDK_BUILD_YAEHMOP_SUPPORT=OFF \ -D RDK_BUILD_STRUCTCHECKER_SUPPORT=OFF \ + -D RDK_TEST_MULTITHREADED=OFF \ + -D RDK_BUILD_CPP_TESTS=OFF \ -D RDK_USE_URF=OFF \ -D RDK_BUILD_PGSQL=ON \ -D RDK_PGSQL_STATIC=ON \ @@ -704,9 +714,10 @@ RUN case "${PG_VERSION}" in \ -D RDK_INSTALL_COMIC_FONTS=OFF \ -D RDK_BUILD_FREETYPE_SUPPORT=OFF \ -D CMAKE_BUILD_TYPE=Release \ + -GNinja \ . 
&& \ - make -j $(getconf _NPROCESSORS_ONLN) && \ - make -j $(getconf _NPROCESSORS_ONLN) install && \ + ninja -j $(getconf _NPROCESSORS_ONLN) && \ + ninja -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/rdkit.control ######################################################################################### @@ -849,8 +860,9 @@ FROM build-deps AS rust-extensions-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN apt-get update && \ - apt-get install --no-install-recommends -y curl libclang-dev && \ +RUN apt update && \ + apt install --no-install-recommends --no-install-suggests -y curl libclang-dev && \ + apt clean && rm -rf /var/lib/apt/lists/* && \ useradd -ms /bin/bash nonroot -b /home ENV HOME=/home/nonroot @@ -885,8 +897,9 @@ FROM build-deps AS rust-extensions-build-pgrx12 ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN apt-get update && \ - apt-get install --no-install-recommends -y curl libclang-dev && \ +RUN apt update && \ + apt install --no-install-recommends --no-install-suggests -y curl libclang-dev && \ + apt clean && rm -rf /var/lib/apt/lists/* && \ useradd -ms /bin/bash nonroot -b /home ENV HOME=/home/nonroot @@ -914,18 +927,22 @@ FROM rust-extensions-build-pgrx12 AS pg-onnx-build # cmake 3.26 or higher is required, so installing it using pip (bullseye-backports has cmake 3.25). # Install it using virtual environment, because Python 3.11 (the default version on Debian 12 (Bookworm)) complains otherwise -RUN apt-get update && apt-get install -y python3 python3-pip python3-venv && \ +RUN apt update && apt install --no-install-recommends --no-install-suggests -y \ + python3 python3-pip python3-venv && \ + apt clean && rm -rf /var/lib/apt/lists/* && \ python3 -m venv venv && \ . venv/bin/activate && \ python3 -m pip install cmake==3.30.5 && \ wget https://github.com/microsoft/onnxruntime/archive/refs/tags/v1.18.1.tar.gz -O onnxruntime.tar.gz && \ mkdir onnxruntime-src && cd onnxruntime-src && tar xzf ../onnxruntime.tar.gz --strip-components=1 -C . && \ - ./build.sh --config Release --parallel --skip_submodule_sync --skip_tests --allow_running_as_root + ./build.sh --config Release --parallel --cmake_generator Ninja \ + --skip_submodule_sync --skip_tests --allow_running_as_root FROM pg-onnx-build AS pgrag-pg-build -RUN apt-get install -y protobuf-compiler && \ +RUN apt update && apt install --no-install-recommends --no-install-suggests -y protobuf-compiler \ + && apt clean && rm -rf /var/lib/apt/lists/* && \ wget https://github.com/neondatabase-labs/pgrag/archive/refs/tags/v0.0.0.tar.gz -O pgrag.tar.gz && \ echo "2cbe394c1e74fc8bcad9b52d5fbbfb783aef834ca3ce44626cfd770573700bb4 pgrag.tar.gz" | sha256sum --check && \ mkdir pgrag-src && cd pgrag-src && tar xzf ../pgrag.tar.gz --strip-components=1 -C . 
&& \ @@ -1279,8 +1296,8 @@ COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/fast_ FROM debian:$DEBIAN_FLAVOR AS pgbouncer RUN set -e \ - && apt-get update \ - && apt-get install --no-install-recommends -y \ + && apt update \ + && apt install --no-install-suggests --no-install-recommends -y \ build-essential \ git \ ca-certificates \ @@ -1288,7 +1305,8 @@ RUN set -e \ automake \ libevent-dev \ libtool \ - pkg-config + pkg-config \ + && apt clean && rm -rf /var/lib/apt/lists/* # Use `dist_man_MANS=` to skip manpage generation (which requires python3/pandoc) ENV PGBOUNCER_TAG=pgbouncer_1_22_1 @@ -1519,7 +1537,7 @@ RUN apt update && \ procps \ ca-certificates \ $VERSION_INSTALLS && \ - rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ + apt clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8 # s5cmd 2.2.2 from https://github.com/peak/s5cmd/releases/tag/v2.2.2 From a55853f67fad71c06fd0b856ada7cf8fea4306bd Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 17 Dec 2024 17:51:58 +0100 Subject: [PATCH 061/583] utils: symbolize heap profiles (#10153) ## Problem Jemalloc heap profiles aren't symbolized. This is inconvenient, and doesn't work with Grafana Cloud Profiles. Resolves #9964. ## Summary of changes Symbolize the heap profiles in-process, and strip unnecessary cruft. This uses about 100 MB additional memory to cache the DWARF information, but I believe this is already the case with CPU profiles, which use the same library for symbolization. With cached DWARF information, the symbolization CPU overhead is negligible. Example profiles: * [pageserver.pb.gz](https://github.com/user-attachments/files/18141395/pageserver.pb.gz) * [safekeeper.pb.gz](https://github.com/user-attachments/files/18141396/safekeeper.pb.gz) --- Cargo.lock | 105 +++++++++++------- Cargo.toml | 1 + libs/utils/Cargo.toml | 5 +- libs/utils/src/http/endpoint.rs | 46 ++++++-- libs/utils/src/lib.rs | 2 + libs/utils/src/pprof.rs | 190 ++++++++++++++++++++++++++++++++ 6 files changed, 298 insertions(+), 51 deletions(-) create mode 100644 libs/utils/src/pprof.rs diff --git a/Cargo.lock b/Cargo.lock index b9b89efa02..d9ac167042 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10,9 +10,9 @@ checksum = "8b5ace29ee3216de37c0546865ad08edef58b0f9e76838ed8959a84a990e58c5" [[package]] name = "addr2line" -version = "0.21.0" +version = "0.24.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a30b2e23b9e17a9f90641c7ab1549cd9b44f296d3ccbf309d2863cfe398a0cb" +checksum = "dfbe277e56a376000877090da837660b4427aad530e3028d44e0bffe4f89a1c1" dependencies = [ "gimli", ] @@ -23,6 +23,12 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" +[[package]] +name = "adler2" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" + [[package]] name = "ahash" version = "0.8.11" @@ -871,17 +877,17 @@ dependencies = [ [[package]] name = "backtrace" -version = "0.3.69" +version = "0.3.74" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2089b7e3f35b9dd2d0ed921ead4f6d318c27680d4a5bd167b3ee120edb105837" +checksum = "8d82cb332cdfaed17ae235a638438ac4d4839913cc2af585c3c6746e8f8bee1a" dependencies = [ "addr2line", - "cc", "cfg-if", "libc", - "miniz_oxide", + "miniz_oxide 0.8.0", 
"object", "rustc-demangle", + "windows-targets 0.52.6", ] [[package]] @@ -1127,7 +1133,7 @@ dependencies = [ "num-traits", "serde", "wasm-bindgen", - "windows-targets 0.52.4", + "windows-targets 0.52.6", ] [[package]] @@ -2107,7 +2113,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3b9429470923de8e8cbd4d2dc513535400b4b3fef0319fb5c4e1f520a7bef743" dependencies = [ "crc32fast", - "miniz_oxide", + "miniz_oxide 0.7.1", ] [[package]] @@ -2308,9 +2314,9 @@ dependencies = [ [[package]] name = "gimli" -version = "0.28.1" +version = "0.31.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253" +checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f" [[package]] name = "git-version" @@ -3404,6 +3410,15 @@ dependencies = [ "adler", ] +[[package]] +name = "miniz_oxide" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2d80299ef12ff69b16a84bb182e3b9df68b5a91574d3d4fa6e41b65deec4df1" +dependencies = [ + "adler2", +] + [[package]] name = "mio" version = "0.8.11" @@ -3638,9 +3653,9 @@ dependencies = [ [[package]] name = "object" -version = "0.32.2" +version = "0.36.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6a622008b6e321afc04970976f62ee297fdbaa6f95318ca343e3eebb9648441" +checksum = "aedf0a2d09c573ed1d8d85b30c119153926a2b36dce0ab28322c09a117a4683e" dependencies = [ "memchr", ] @@ -5323,9 +5338,9 @@ dependencies = [ [[package]] name = "rustc-demangle" -version = "0.1.23" +version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" +checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" [[package]] name = "rustc-hash" @@ -7219,6 +7234,7 @@ dependencies = [ "anyhow", "arc-swap", "async-compression", + "backtrace", "bincode", "byteorder", "bytes", @@ -7229,12 +7245,14 @@ dependencies = [ "criterion", "diatomic-waker", "fail", + "flate2", "futures", "git-version", "hex", "hex-literal", "humantime", "hyper 0.14.30", + "itertools 0.10.5", "jemalloc_pprof", "jsonwebtoken", "metrics", @@ -7591,7 +7609,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e48a53791691ab099e5e2ad123536d0fff50652600abaf43bbf952894110d0be" dependencies = [ "windows-core", - "windows-targets 0.52.4", + "windows-targets 0.52.6", ] [[package]] @@ -7600,7 +7618,7 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" dependencies = [ - "windows-targets 0.52.4", + "windows-targets 0.52.6", ] [[package]] @@ -7618,7 +7636,7 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ - "windows-targets 0.52.4", + "windows-targets 0.52.6", ] [[package]] @@ -7638,17 +7656,18 @@ dependencies = [ [[package]] name = "windows-targets" -version = "0.52.4" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7dd37b7e5ab9018759f893a1952c9420d060016fc19a472b4bb20d1bdd694d1b" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" dependencies = [ - "windows_aarch64_gnullvm 0.52.4", - "windows_aarch64_msvc 0.52.4", - "windows_i686_gnu 0.52.4", - "windows_i686_msvc 0.52.4", - 
"windows_x86_64_gnu 0.52.4", - "windows_x86_64_gnullvm 0.52.4", - "windows_x86_64_msvc 0.52.4", + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", + "windows_i686_gnullvm", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", ] [[package]] @@ -7659,9 +7678,9 @@ checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" [[package]] name = "windows_aarch64_gnullvm" -version = "0.52.4" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" [[package]] name = "windows_aarch64_msvc" @@ -7671,9 +7690,9 @@ checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" [[package]] name = "windows_aarch64_msvc" -version = "0.52.4" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" [[package]] name = "windows_i686_gnu" @@ -7683,9 +7702,15 @@ checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" [[package]] name = "windows_i686_gnu" -version = "0.52.4" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" [[package]] name = "windows_i686_msvc" @@ -7695,9 +7720,9 @@ checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" [[package]] name = "windows_i686_msvc" -version = "0.52.4" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" [[package]] name = "windows_x86_64_gnu" @@ -7707,9 +7732,9 @@ checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" [[package]] name = "windows_x86_64_gnu" -version = "0.52.4" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" [[package]] name = "windows_x86_64_gnullvm" @@ -7719,9 +7744,9 @@ checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" [[package]] name = "windows_x86_64_gnullvm" -version = "0.52.4" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" [[package]] name = "windows_x86_64_msvc" @@ -7731,9 +7756,9 @@ checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" [[package]] name = "windows_x86_64_msvc" -version = "0.52.4" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" [[package]] name = "winnow" diff --git a/Cargo.toml b/Cargo.toml index 056cd5798f..885f02ba81 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -52,6 +52,7 @@ anyhow = { version = "1.0", features = ["backtrace"] } arc-swap = "1.6" async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] } atomic-take = "1.1.0" +backtrace = "0.3.74" flate2 = "1.0.26" async-stream = "0.3" async-trait = "0.1" diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 66500fb141..02bf77760a 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -15,17 +15,20 @@ arc-swap.workspace = true sentry.workspace = true async-compression.workspace = true anyhow.workspace = true +backtrace.workspace = true bincode.workspace = true bytes.workspace = true camino.workspace = true chrono.workspace = true diatomic-waker.workspace = true +flate2.workspace = true git-version.workspace = true hex = { workspace = true, features = ["serde"] } humantime.workspace = true hyper0 = { workspace = true, features = ["full"] } +itertools.workspace = true fail.workspace = true -futures = { workspace = true} +futures = { workspace = true } jemalloc_pprof.workspace = true jsonwebtoken.workspace = true nix.workspace = true diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index d975b63677..9b37b69939 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -1,15 +1,22 @@ use crate::auth::{AuthError, Claims, SwappableJwtAuth}; use crate::http::error::{api_error_handler, route_error_handler, ApiError}; use crate::http::request::{get_query_param, parse_query_param}; +use crate::pprof; +use ::pprof::protos::Message as _; +use ::pprof::ProfilerGuardBuilder; use anyhow::{anyhow, Context}; +use bytes::{Bytes, BytesMut}; use hyper::header::{HeaderName, AUTHORIZATION, CONTENT_DISPOSITION}; use hyper::http::HeaderValue; use hyper::Method; use hyper::{header::CONTENT_TYPE, Body, Request, Response}; use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder}; use once_cell::sync::Lazy; +use regex::Regex; use routerify::ext::RequestExt; use routerify::{Middleware, RequestInfo, Router, RouterBuilder}; +use tokio::sync::{mpsc, Mutex}; +use tokio_stream::wrappers::ReceiverStream; use tokio_util::io::ReaderStream; use tracing::{debug, info, info_span, warn, Instrument}; @@ -18,11 +25,6 @@ use std::io::Write as _; use std::str::FromStr; use std::time::Duration; -use bytes::{Bytes, BytesMut}; -use pprof::protos::Message as _; -use tokio::sync::{mpsc, Mutex}; -use tokio_stream::wrappers::ReceiverStream; - static SERVE_METRICS_COUNT: Lazy = Lazy::new(|| { register_int_counter!( "libmetrics_metric_handler_requests_total", @@ -365,7 +367,7 @@ pub async fn profile_cpu_handler(req: Request) -> Result, A // Take the profile. let report = tokio::task::spawn_blocking(move || { - let guard = pprof::ProfilerGuardBuilder::default() + let guard = ProfilerGuardBuilder::default() .frequency(frequency_hz) .blocklist(&["libc", "libgcc", "pthread", "vdso"]) .build()?; @@ -457,10 +459,34 @@ pub async fn profile_heap_handler(req: Request) -> Result, } Format::Pprof => { - let data = tokio::task::spawn_blocking(move || prof_ctl.dump_pprof()) - .await - .map_err(|join_err| ApiError::InternalServerError(join_err.into()))? 
- .map_err(ApiError::InternalServerError)?; + let data = tokio::task::spawn_blocking(move || { + let bytes = prof_ctl.dump_pprof()?; + // Symbolize the profile. + // TODO: consider moving this upstream to jemalloc_pprof and avoiding the + // serialization roundtrip. + static STRIP_FUNCTIONS: Lazy> = Lazy::new(|| { + // Functions to strip from profiles. If true, also remove child frames. + vec![ + (Regex::new("^__rust").unwrap(), false), + (Regex::new("^_start$").unwrap(), false), + (Regex::new("^irallocx_prof").unwrap(), true), + (Regex::new("^prof_alloc_prep").unwrap(), true), + (Regex::new("^std::rt::lang_start").unwrap(), false), + (Regex::new("^std::sys::backtrace::__rust").unwrap(), false), + ] + }); + let profile = pprof::decode(&bytes)?; + let profile = pprof::symbolize(profile)?; + let profile = pprof::strip_locations( + profile, + &["libc", "libgcc", "pthread", "vdso"], + &STRIP_FUNCTIONS, + ); + pprof::encode(&profile) + }) + .await + .map_err(|join_err| ApiError::InternalServerError(join_err.into()))? + .map_err(ApiError::InternalServerError)?; Response::builder() .status(200) .header(CONTENT_TYPE, "application/octet-stream") diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index bccd0e0488..2c56dd750f 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -96,6 +96,8 @@ pub mod circuit_breaker; pub mod try_rcu; +pub mod pprof; + // Re-export used in macro. Avoids adding git-version as dep in target crates. #[doc(hidden)] pub use git_version; diff --git a/libs/utils/src/pprof.rs b/libs/utils/src/pprof.rs new file mode 100644 index 0000000000..90910897bf --- /dev/null +++ b/libs/utils/src/pprof.rs @@ -0,0 +1,190 @@ +use flate2::write::{GzDecoder, GzEncoder}; +use flate2::Compression; +use itertools::Itertools as _; +use once_cell::sync::Lazy; +use pprof::protos::{Function, Line, Message as _, Profile}; +use regex::Regex; + +use std::borrow::Cow; +use std::collections::{HashMap, HashSet}; +use std::ffi::c_void; +use std::io::Write as _; + +/// Decodes a gzip-compressed Protobuf-encoded pprof profile. +pub fn decode(bytes: &[u8]) -> anyhow::Result { + let mut gz = GzDecoder::new(Vec::new()); + gz.write_all(bytes)?; + Ok(Profile::parse_from_bytes(&gz.finish()?)?) +} + +/// Encodes a pprof profile as gzip-compressed Protobuf. +pub fn encode(profile: &Profile) -> anyhow::Result> { + let mut gz = GzEncoder::new(Vec::new(), Compression::default()); + profile.write_to_writer(&mut gz)?; + Ok(gz.finish()?) +} + +/// Symbolizes a pprof profile using the current binary. +pub fn symbolize(mut profile: Profile) -> anyhow::Result { + if !profile.function.is_empty() { + return Ok(profile); // already symbolized + } + + // Collect function names. + let mut functions: HashMap = HashMap::new(); + let mut strings: HashMap = profile + .string_table + .into_iter() + .enumerate() + .map(|(i, s)| (s, i as i64)) + .collect(); + + // Helper to look up or register a string. + let mut string_id = |s: &str| -> i64 { + // Don't use .entry() to avoid unnecessary allocations. + if let Some(id) = strings.get(s) { + return *id; + } + let id = strings.len() as i64; + strings.insert(s.to_string(), id); + id + }; + + for loc in &mut profile.location { + if !loc.line.is_empty() { + continue; + } + + // Resolve the line and function for each location. + backtrace::resolve(loc.address as *mut c_void, |symbol| { + let Some(symname) = symbol.name() else { + return; + }; + let mut name = symname.to_string(); + + // Strip the Rust monomorphization suffix from the symbol name. 
+ static SUFFIX_REGEX: Lazy = + Lazy::new(|| Regex::new("::h[0-9a-f]{16}$").expect("invalid regex")); + if let Some(m) = SUFFIX_REGEX.find(&name) { + name.truncate(m.start()); + } + + let function_id = match functions.get(&name) { + Some(function) => function.id, + None => { + let id = functions.len() as u64 + 1; + let system_name = String::from_utf8_lossy(symname.as_bytes()); + let filename = symbol + .filename() + .map(|path| path.to_string_lossy()) + .unwrap_or(Cow::Borrowed("")); + let function = Function { + id, + name: string_id(&name), + system_name: string_id(&system_name), + filename: string_id(&filename), + ..Default::default() + }; + functions.insert(name, function); + id + } + }; + loc.line.push(Line { + function_id, + line: symbol.lineno().unwrap_or(0) as i64, + ..Default::default() + }); + }); + } + + // Store the resolved functions, and mark the mapping as resolved. + profile.function = functions.into_values().sorted_by_key(|f| f.id).collect(); + profile.string_table = strings + .into_iter() + .sorted_by_key(|(_, i)| *i) + .map(|(s, _)| s) + .collect(); + + for mapping in &mut profile.mapping { + mapping.has_functions = true; + mapping.has_filenames = true; + } + + Ok(profile) +} + +/// Strips locations (stack frames) matching the given mappings (substring) or function names +/// (regex). The function bool specifies whether child frames should be stripped as well. +/// +/// The string definitions are left behind in the profile for simplicity, to avoid rewriting all +/// string references. +pub fn strip_locations( + mut profile: Profile, + mappings: &[&str], + functions: &[(Regex, bool)], +) -> Profile { + // Strip mappings. + let mut strip_mappings: HashSet = HashSet::new(); + + profile.mapping.retain(|mapping| { + let Some(name) = profile.string_table.get(mapping.filename as usize) else { + return true; + }; + if mappings.iter().any(|substr| name.contains(substr)) { + strip_mappings.insert(mapping.id); + return false; + } + true + }); + + // Strip functions. + let mut strip_functions: HashMap = HashMap::new(); + + profile.function.retain(|function| { + let Some(name) = profile.string_table.get(function.name as usize) else { + return true; + }; + for (regex, strip_children) in functions { + if regex.is_match(name) { + strip_functions.insert(function.id, *strip_children); + return false; + } + } + true + }); + + // Strip locations. The bool specifies whether child frames should be stripped too. + let mut strip_locations: HashMap = HashMap::new(); + + profile.location.retain(|location| { + for line in &location.line { + if let Some(strip_children) = strip_functions.get(&line.function_id) { + strip_locations.insert(location.id, *strip_children); + return false; + } + } + if strip_mappings.contains(&location.mapping_id) { + strip_locations.insert(location.id, false); + return false; + } + true + }); + + // Strip sample locations. + for sample in &mut profile.sample { + // First, find the uppermost function with child removal and truncate the stack. + if let Some(truncate) = sample + .location_id + .iter() + .rposition(|id| strip_locations.get(id) == Some(&true)) + { + sample.location_id.drain(..=truncate); + } + // Next, strip any individual frames without child removal. 
+ sample + .location_id + .retain(|id| !strip_locations.contains_key(id)); + } + + profile +} From 7dddbb9570ca52d174b7d53ca2495fb272796a7f Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 17 Dec 2024 12:36:55 -0600 Subject: [PATCH 062/583] Add pg_repack extension (#10100) Our solutions engineers and some customers would like to have this extension available. Link: https://github.com/neondatabase/cloud/issues/18890 Signed-off-by: Tristan Partin --- compute/compute-node.Dockerfile | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 1e11efeaf8..9f1f3b7343 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1185,6 +1185,25 @@ RUN case "${PG_VERSION}" in \ make BUILD_TYPE=release -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_mooncake.control +######################################################################################### +# +# Layer "pg_repack" +# compile pg_repack extension +# +######################################################################################### + +FROM build-deps AS pg-repack-build +ARG PG_VERSION +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +ENV PATH="/usr/local/pgsql/bin/:$PATH" + +RUN wget https://github.com/reorg/pg_repack/archive/refs/tags/ver_1.5.2.tar.gz -O pg_repack.tar.gz && \ + echo '4516cad42251ed3ad53ff619733004db47d5755acac83f75924cd94d1c4fb681 pg_repack.tar.gz' | sha256sum --check && \ + mkdir pg_repack-src && cd pg_repack-src && tar xzf ../pg_repack.tar.gz --strip-components=1 -C . && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install + ######################################################################################### # # Layer "neon-pg-ext-build" @@ -1230,6 +1249,7 @@ COPY --from=pg-anon-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-ivm-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-partman-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-mooncake-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg-repack-build /usr/local/pgsql/ /usr/local/pgsql/ COPY pgxn/ pgxn/ RUN make -j $(getconf _NPROCESSORS_ONLN) \ From 93e958341fc47317a0b430b40ae4906da7294d2f Mon Sep 17 00:00:00 2001 From: Ivan Efremov Date: Tue, 17 Dec 2024 21:26:54 +0200 Subject: [PATCH 063/583] [proxy]: Use TLS for cancellation queries (#10152) ## Problem pg_sni_router assumes that all the streams are upgradable to TLS. Cancellation requests were declined because of using NoTls config. ## Summary of changes Provide TLS client config for cancellation requests. Fixes [#21789](https://github.com/orgs/neondatabase/projects/65/views/1?pane=issue&itemId=90911361&issue=neondatabase%7Ccloud%7C21789) --- proxy/src/bin/pg_sni_router.rs | 2 +- proxy/src/cancellation.rs | 61 ++++++++++++++++++++++++++++++++-- proxy/src/compute.rs | 6 ++-- 3 files changed, 63 insertions(+), 6 deletions(-) diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index 623a0fd3b2..9538384b9e 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -229,7 +229,7 @@ async fn ssl_handshake( let (raw, read_buf) = stream.into_inner(); // TODO: Normally, client doesn't send any data before - // server says TLS handshake is ok and read_buf is empy. + // server says TLS handshake is ok and read_buf is empty. 
// However, you could imagine pipelining of postgres // SSLRequest + TLS ClientHello in one hunk similar to // pipelining in our node js driver. We should probably diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index dd3edd6abc..a58e3961da 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -3,8 +3,10 @@ use std::sync::Arc; use dashmap::DashMap; use ipnet::{IpNet, Ipv4Net, Ipv6Net}; -use postgres_client::{CancelToken, NoTls}; +use once_cell::sync::OnceCell; +use postgres_client::{tls::MakeTlsConnect, CancelToken}; use pq_proto::CancelKeyData; +use rustls::crypto::ring; use thiserror::Error; use tokio::net::TcpStream; use tokio::sync::Mutex; @@ -20,6 +22,9 @@ use crate::redis::cancellation_publisher::{ CancellationPublisher, CancellationPublisherMut, RedisPublisherClient, }; +use crate::compute::{load_certs, AcceptEverythingVerifier}; +use crate::postgres_rustls::MakeRustlsConnect; + pub type CancelMap = Arc>>; pub type CancellationHandlerMain = CancellationHandler>>>; pub(crate) type CancellationHandlerMainInternal = Option>>; @@ -174,7 +179,10 @@ impl CancellationHandler
{ source: self.from, kind: crate::metrics::CancellationOutcome::Found, }); - info!("cancelling query per user's request using key {key}"); + info!( + "cancelling query per user's request using key {key}, hostname {}, address: {}", + cancel_closure.hostname, cancel_closure.socket_addr + ); cancel_closure.try_cancel_query().await } @@ -221,6 +229,8 @@ impl CancellationHandler>>> { } } +static TLS_ROOTS: OnceCell> = OnceCell::new(); + /// This should've been a [`std::future::Future`], but /// it's impossible to name a type of an unboxed future /// (we'd need something like `#![feature(type_alias_impl_trait)]`). @@ -229,6 +239,8 @@ pub struct CancelClosure { socket_addr: SocketAddr, cancel_token: CancelToken, ip_allowlist: Vec, + hostname: String, // for pg_sni router + allow_self_signed_compute: bool, } impl CancelClosure { @@ -236,17 +248,60 @@ impl CancelClosure { socket_addr: SocketAddr, cancel_token: CancelToken, ip_allowlist: Vec, + hostname: String, + allow_self_signed_compute: bool, ) -> Self { Self { socket_addr, cancel_token, ip_allowlist, + hostname, + allow_self_signed_compute, } } /// Cancels the query running on user's compute node. pub(crate) async fn try_cancel_query(self) -> Result<(), CancelError> { let socket = TcpStream::connect(self.socket_addr).await?; - self.cancel_token.cancel_query_raw(socket, NoTls).await?; + + let client_config = if self.allow_self_signed_compute { + // Allow all certificates for creating the connection. Used only for tests + let verifier = Arc::new(AcceptEverythingVerifier); + rustls::ClientConfig::builder_with_provider(Arc::new(ring::default_provider())) + .with_safe_default_protocol_versions() + .expect("ring should support the default protocol versions") + .dangerous() + .with_custom_certificate_verifier(verifier) + } else { + let root_store = TLS_ROOTS + .get_or_try_init(load_certs) + .map_err(|_e| { + CancelError::IO(std::io::Error::new( + std::io::ErrorKind::Other, + "TLS root store initialization failed".to_string(), + )) + })? 
+ .clone(); + rustls::ClientConfig::builder_with_provider(Arc::new(ring::default_provider())) + .with_safe_default_protocol_versions() + .expect("ring should support the default protocol versions") + .with_root_certificates(root_store) + }; + + let client_config = client_config.with_no_client_auth(); + + let mut mk_tls = crate::postgres_rustls::MakeRustlsConnect::new(client_config); + let tls = >::make_tls_connect( + &mut mk_tls, + &self.hostname, + ) + .map_err(|e| { + CancelError::IO(std::io::Error::new( + std::io::ErrorKind::Other, + e.to_string(), + )) + })?; + + self.cancel_token.cancel_query_raw(socket, tls).await?; debug!("query was cancelled"); Ok(()) } diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 4113b5bb80..42df5ff5e3 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -319,6 +319,8 @@ impl ConnCfg { secret_key, }, vec![], + host.to_string(), + allow_self_signed_compute, ); let connection = PostgresConnection { @@ -350,7 +352,7 @@ fn filtered_options(options: &str) -> Option { Some(options) } -fn load_certs() -> Result, Vec> { +pub(crate) fn load_certs() -> Result, Vec> { let der_certs = rustls_native_certs::load_native_certs(); if !der_certs.errors.is_empty() { @@ -364,7 +366,7 @@ fn load_certs() -> Result, Vec> = OnceCell::new(); #[derive(Debug)] -struct AcceptEverythingVerifier; +pub(crate) struct AcceptEverythingVerifier; impl ServerCertVerifier for AcceptEverythingVerifier { fn supported_verify_schemes(&self) -> Vec { use rustls::SignatureScheme; From fd230227f2fd370821610f91601c9b02dda23e0d Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 17 Dec 2024 20:04:09 +0000 Subject: [PATCH 064/583] storcon: include preferred AZ in compute notifications (#9953) ## Problem It is unreliable for the control plane to infer the AZ for computes from where the tenant is currently attached, because if a tenant happens to be in a degraded state or a release is ongoing while a compute starts, then the tenant's attached AZ can be a different one to where it will run long-term, and the control plane doesn't check back later to restart the compute. 
This can land in parallel with https://github.com/neondatabase/neon/pull/9947 ## Summary of changes - Thread through the preferred AZ into the compute hook code via the reconciler - Include the preferred AZ in the body of compute hook notifications --- storage_controller/src/compute_hook.rs | 138 +++++++++++++----- storage_controller/src/reconciler.rs | 15 +- storage_controller/src/service.rs | 25 +++- storage_controller/src/tenant_shard.rs | 1 + test_runner/fixtures/neon_fixtures.py | 5 +- test_runner/regress/test_sharding.py | 3 + .../regress/test_storage_controller.py | 5 + 7 files changed, 139 insertions(+), 53 deletions(-) diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs index 2b2ece3f02..69db48f8d1 100644 --- a/storage_controller/src/compute_hook.rs +++ b/storage_controller/src/compute_hook.rs @@ -1,3 +1,4 @@ +use std::borrow::Cow; use std::error::Error as _; use std::sync::Arc; use std::{collections::HashMap, time::Duration}; @@ -6,6 +7,7 @@ use control_plane::endpoint::{ComputeControlPlane, EndpointStatus}; use control_plane::local_env::LocalEnv; use futures::StreamExt; use hyper::StatusCode; +use pageserver_api::controller_api::AvailabilityZone; use pageserver_api::shard::{ShardCount, ShardNumber, ShardStripeSize, TenantShardId}; use postgres_connection::parse_host_port; use serde::{Deserialize, Serialize}; @@ -28,6 +30,9 @@ struct UnshardedComputeHookTenant { // Which node is this tenant attached to node_id: NodeId, + // The tenant's preferred AZ, so that we may pass this on to the control plane + preferred_az: Option, + // Must hold this lock to send a notification. send_lock: Arc>>, } @@ -36,6 +41,9 @@ struct ShardedComputeHookTenant { shard_count: ShardCount, shards: Vec<(ShardNumber, NodeId)>, + // The tenant's preferred AZ, so that we may pass this on to the control plane + preferred_az: Option, + // Must hold this lock to send a notification. The contents represent // the last successfully sent notification, and are used to coalesce multiple // updates by only sending when there is a chance since our last successful send. @@ -64,17 +72,24 @@ enum ComputeHookTenant { impl ComputeHookTenant { /// Construct with at least one shard's information - fn new(tenant_shard_id: TenantShardId, stripe_size: ShardStripeSize, node_id: NodeId) -> Self { + fn new( + tenant_shard_id: TenantShardId, + stripe_size: ShardStripeSize, + preferred_az: Option, + node_id: NodeId, + ) -> Self { if tenant_shard_id.shard_count.count() > 1 { Self::Sharded(ShardedComputeHookTenant { shards: vec![(tenant_shard_id.shard_number, node_id)], stripe_size, shard_count: tenant_shard_id.shard_count, + preferred_az, send_lock: Arc::default(), }) } else { Self::Unsharded(UnshardedComputeHookTenant { node_id, + preferred_az, send_lock: Arc::default(), }) } @@ -120,15 +135,20 @@ impl ComputeHookTenant { /// Set one shard's location. If stripe size or shard count have changed, Self is reset /// and drops existing content. 
- fn update( - &mut self, - tenant_shard_id: TenantShardId, - stripe_size: ShardStripeSize, - node_id: NodeId, - ) { + fn update(&mut self, shard_update: ShardUpdate) { + let tenant_shard_id = shard_update.tenant_shard_id; + let node_id = shard_update.node_id; + let stripe_size = shard_update.stripe_size; + let preferred_az = shard_update.preferred_az; + match self { Self::Unsharded(unsharded_tenant) if tenant_shard_id.shard_count.count() == 1 => { - unsharded_tenant.node_id = node_id + unsharded_tenant.node_id = node_id; + if unsharded_tenant.preferred_az.as_ref() + != preferred_az.as_ref().map(|az| az.as_ref()) + { + unsharded_tenant.preferred_az = preferred_az.map(|az| az.as_ref().clone()); + } } Self::Sharded(sharded_tenant) if sharded_tenant.stripe_size == stripe_size @@ -146,10 +166,21 @@ impl ComputeHookTenant { .push((tenant_shard_id.shard_number, node_id)); sharded_tenant.shards.sort_by_key(|s| s.0) } + + if sharded_tenant.preferred_az.as_ref() + != preferred_az.as_ref().map(|az| az.as_ref()) + { + sharded_tenant.preferred_az = preferred_az.map(|az| az.as_ref().clone()); + } } _ => { // Shard count changed: reset struct. - *self = Self::new(tenant_shard_id, stripe_size, node_id); + *self = Self::new( + tenant_shard_id, + stripe_size, + preferred_az.map(|az| az.into_owned()), + node_id, + ); } } } @@ -165,6 +196,7 @@ struct ComputeHookNotifyRequestShard { #[derive(Serialize, Deserialize, Debug, Eq, PartialEq)] struct ComputeHookNotifyRequest { tenant_id: TenantId, + preferred_az: Option, stripe_size: Option, shards: Vec, } @@ -238,6 +270,10 @@ impl ComputeHookTenant { node_id: unsharded_tenant.node_id, }], stripe_size: None, + preferred_az: unsharded_tenant + .preferred_az + .as_ref() + .map(|az| az.0.clone()), }), Self::Sharded(sharded_tenant) if sharded_tenant.shards.len() == sharded_tenant.shard_count.count() as usize => @@ -253,6 +289,7 @@ impl ComputeHookTenant { }) .collect(), stripe_size: Some(sharded_tenant.stripe_size), + preferred_az: sharded_tenant.preferred_az.as_ref().map(|az| az.0.clone()), }) } Self::Sharded(sharded_tenant) => { @@ -313,6 +350,17 @@ pub(super) struct ComputeHook { client: reqwest::Client, } +/// Callers may give us a list of these when asking us to send a bulk batch +/// of notifications in the background. This is a 'notification' in the sense of +/// other code notifying us of a shard's status, rather than being the final notification +/// that we send upwards to the control plane for the whole tenant. 
+pub(crate) struct ShardUpdate<'a> { + pub(crate) tenant_shard_id: TenantShardId, + pub(crate) node_id: NodeId, + pub(crate) stripe_size: ShardStripeSize, + pub(crate) preferred_az: Option>, +} + impl ComputeHook { pub(super) fn new(config: Config) -> Self { let authorization_header = config @@ -363,6 +411,7 @@ impl ComputeHook { tenant_id, shards, stripe_size, + preferred_az: _preferred_az, } = reconfigure_request; let compute_pageservers = shards @@ -503,24 +552,30 @@ impl ComputeHook { } /// Synchronous phase: update the per-tenant state for the next intended notification - fn notify_prepare( - &self, - tenant_shard_id: TenantShardId, - node_id: NodeId, - stripe_size: ShardStripeSize, - ) -> MaybeSendResult { + fn notify_prepare(&self, shard_update: ShardUpdate) -> MaybeSendResult { let mut state_locked = self.state.lock().unwrap(); use std::collections::hash_map::Entry; + let tenant_shard_id = shard_update.tenant_shard_id; + let tenant = match state_locked.entry(tenant_shard_id.tenant_id) { - Entry::Vacant(e) => e.insert(ComputeHookTenant::new( - tenant_shard_id, - stripe_size, - node_id, - )), + Entry::Vacant(e) => { + let ShardUpdate { + tenant_shard_id, + node_id, + stripe_size, + preferred_az, + } = shard_update; + e.insert(ComputeHookTenant::new( + tenant_shard_id, + stripe_size, + preferred_az.map(|az| az.into_owned()), + node_id, + )) + } Entry::Occupied(e) => { let tenant = e.into_mut(); - tenant.update(tenant_shard_id, stripe_size, node_id); + tenant.update(shard_update); tenant } }; @@ -608,13 +663,14 @@ impl ComputeHook { /// if something failed. pub(super) fn notify_background( self: &Arc, - notifications: Vec<(TenantShardId, NodeId, ShardStripeSize)>, + notifications: Vec, result_tx: tokio::sync::mpsc::Sender>, cancel: &CancellationToken, ) { let mut maybe_sends = Vec::new(); - for (tenant_shard_id, node_id, stripe_size) in notifications { - let maybe_send_result = self.notify_prepare(tenant_shard_id, node_id, stripe_size); + for shard_update in notifications { + let tenant_shard_id = shard_update.tenant_shard_id; + let maybe_send_result = self.notify_prepare(shard_update); maybe_sends.push((tenant_shard_id, maybe_send_result)) } @@ -678,15 +734,14 @@ impl ComputeHook { /// periods, but we don't retry forever. The **caller** is responsible for handling failures and /// ensuring that they eventually call again to ensure that the compute is eventually notified of /// the proper pageserver nodes for a tenant. - #[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), node_id))] - pub(super) async fn notify( + #[tracing::instrument(skip_all, fields(tenant_id=%shard_update.tenant_shard_id.tenant_id, shard_id=%shard_update.tenant_shard_id.shard_slug(), node_id))] + pub(super) async fn notify<'a>( &self, - tenant_shard_id: TenantShardId, - node_id: NodeId, - stripe_size: ShardStripeSize, + shard_update: ShardUpdate<'a>, cancel: &CancellationToken, ) -> Result<(), NotifyError> { - let maybe_send_result = self.notify_prepare(tenant_shard_id, node_id, stripe_size); + let tenant_shard_id = shard_update.tenant_shard_id; + let maybe_send_result = self.notify_prepare(shard_update); self.notify_execute(maybe_send_result, tenant_shard_id, cancel) .await } @@ -739,6 +794,7 @@ pub(crate) mod tests { shard_number: ShardNumber(0), }, ShardStripeSize(12345), + None, NodeId(1), ); @@ -765,30 +821,32 @@ pub(crate) mod tests { // Writing the first shard of a multi-sharded situation (i.e. 
in a split) // resets the tenant state and puts it in an non-notifying state (need to // see all shards) - tenant_state.update( - TenantShardId { + tenant_state.update(ShardUpdate { + tenant_shard_id: TenantShardId { tenant_id, shard_count: ShardCount::new(2), shard_number: ShardNumber(1), }, - ShardStripeSize(32768), - NodeId(1), - ); + stripe_size: ShardStripeSize(32768), + preferred_az: None, + node_id: NodeId(1), + }); assert!(matches!( tenant_state.maybe_send(tenant_id, None), MaybeSendResult::Noop )); // Writing the second shard makes it ready to notify - tenant_state.update( - TenantShardId { + tenant_state.update(ShardUpdate { + tenant_shard_id: TenantShardId { tenant_id, shard_count: ShardCount::new(2), shard_number: ShardNumber(0), }, - ShardStripeSize(32768), - NodeId(1), - ); + stripe_size: ShardStripeSize(32768), + preferred_az: None, + node_id: NodeId(1), + }); let send_result = tenant_state.maybe_send(tenant_id, None); let MaybeSendResult::Transmit((request, mut guard)) = send_result else { diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 3ad386a95b..475f91eff4 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -1,13 +1,14 @@ use crate::pageserver_client::PageserverClient; use crate::persistence::Persistence; -use crate::service; -use pageserver_api::controller_api::PlacementPolicy; +use crate::{compute_hook, service}; +use pageserver_api::controller_api::{AvailabilityZone, PlacementPolicy}; use pageserver_api::models::{ LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig, }; use pageserver_api::shard::{ShardIdentity, TenantShardId}; use pageserver_client::mgmt_api; use reqwest::StatusCode; +use std::borrow::Cow; use std::collections::HashMap; use std::sync::Arc; use std::time::{Duration, Instant}; @@ -45,6 +46,7 @@ pub(super) struct Reconciler { pub(crate) reconciler_config: ReconcilerConfig, pub(crate) config: TenantConfig, + pub(crate) preferred_az: Option, /// Observed state from the point of view of the reconciler. /// This gets updated as the reconciliation makes progress. @@ -834,9 +836,12 @@ impl Reconciler { let result = self .compute_hook .notify( - self.tenant_shard_id, - node.get_id(), - self.shard.stripe_size, + compute_hook::ShardUpdate { + tenant_shard_id: self.tenant_shard_id, + node_id: node.get_id(), + stripe_size: self.shard.stripe_size, + preferred_az: self.preferred_az.as_ref().map(Cow::Borrowed), + }, &self.cancel, ) .await; diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index a89e4741f6..42b50835f8 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -18,7 +18,7 @@ use crate::{ background_node_operations::{ Drain, Fill, Operation, OperationError, OperationHandler, MAX_RECONCILES_PER_OPERATION, }, - compute_hook::NotifyError, + compute_hook::{self, NotifyError}, drain_utils::{self, TenantShardDrain, TenantShardIterator}, id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, TracingExclusiveGuard}, leadership::Leadership, @@ -656,11 +656,14 @@ impl Service { // emit a compute notification for this. In the case where our observed state does not // yet match our intent, we will eventually reconcile, and that will emit a compute notification. 
if let Some(attached_at) = tenant_shard.stably_attached() { - compute_notifications.push(( - *tenant_shard_id, - attached_at, - tenant_shard.shard.stripe_size, - )); + compute_notifications.push(compute_hook::ShardUpdate { + tenant_shard_id: *tenant_shard_id, + node_id: attached_at, + stripe_size: tenant_shard.shard.stripe_size, + preferred_az: tenant_shard + .preferred_az() + .map(|az| Cow::Owned(az.clone())), + }); } } } @@ -4786,7 +4789,15 @@ impl Service { for (child_id, child_ps, stripe_size) in child_locations { if let Err(e) = self .compute_hook - .notify(child_id, child_ps, stripe_size, &self.cancel) + .notify( + compute_hook::ShardUpdate { + tenant_shard_id: child_id, + node_id: child_ps, + stripe_size, + preferred_az: preferred_az_id.as_ref().map(Cow::Borrowed), + }, + &self.cancel, + ) .await { tracing::warn!("Failed to update compute of {}->{} during split, proceeding anyway to complete split ({e})", diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index f1b921646f..cba579e8a7 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -1198,6 +1198,7 @@ impl TenantShard { detach, reconciler_config, config: self.config.clone(), + preferred_az: self.preferred_az_id.clone(), observed: self.observed.clone(), original_observed: self.observed.clone(), compute_hook: compute_hook.clone(), diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 13ada1361e..2553a0c99a 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -134,6 +134,9 @@ DEFAULT_BRANCH_NAME: str = "main" BASE_PORT: int = 15000 +# By default we create pageservers with this phony AZ +DEFAULT_AZ_ID: str = "us-east-2a" + @pytest.fixture(scope="session") def neon_api_key() -> str: @@ -1093,7 +1096,7 @@ class NeonEnv: "pg_auth_type": pg_auth_type, "http_auth_type": http_auth_type, # Default which can be overriden with `NeonEnvBuilder.pageserver_config_override` - "availability_zone": "us-east-2a", + "availability_zone": DEFAULT_AZ_ID, # Disable pageserver disk syncs in tests: when running tests concurrently, this avoids # the pageserver taking a long time to start up due to syncfs flushing other tests' data "no_sync": True, diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 743ab0088b..4c381b563f 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -11,6 +11,7 @@ from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineArchival from fixtures.compute_reconfigure import ComputeReconfigure from fixtures.log_helper import log from fixtures.neon_fixtures import ( + DEFAULT_AZ_ID, NeonEnv, NeonEnvBuilder, StorageControllerApiException, @@ -793,6 +794,7 @@ def test_sharding_split_stripe_size( "tenant_id": str(env.initial_tenant), "stripe_size": None, "shards": [{"node_id": int(env.pageservers[0].id), "shard_number": 0}], + "preferred_az": DEFAULT_AZ_ID, } assert notifications[0] == expect @@ -812,6 +814,7 @@ def test_sharding_split_stripe_size( {"node_id": int(env.pageservers[0].id), "shard_number": 0}, {"node_id": int(env.pageservers[0].id), "shard_number": 1}, ], + "preferred_az": DEFAULT_AZ_ID, } log.info(f"Got notification: {notifications[1]}") assert notifications[1] == expect_after diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index ae9b596a1b..0be800d103 100644 --- 
a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -16,6 +16,7 @@ from fixtures.common_types import TenantId, TenantShardId, TimelineId from fixtures.compute_reconfigure import ComputeReconfigure from fixtures.log_helper import log from fixtures.neon_fixtures import ( + DEFAULT_AZ_ID, NeonEnv, NeonEnvBuilder, NeonPageserver, @@ -599,6 +600,7 @@ def test_storage_controller_compute_hook( "tenant_id": str(env.initial_tenant), "stripe_size": None, "shards": [{"node_id": int(env.pageservers[0].id), "shard_number": 0}], + "preferred_az": DEFAULT_AZ_ID, } assert notifications[0] == expect @@ -616,6 +618,7 @@ def test_storage_controller_compute_hook( "tenant_id": str(env.initial_tenant), "stripe_size": None, "shards": [{"node_id": int(env.pageservers[1].id), "shard_number": 0}], + "preferred_az": DEFAULT_AZ_ID, } def received_migration_notification(): @@ -643,6 +646,7 @@ def test_storage_controller_compute_hook( {"node_id": int(env.pageservers[1].id), "shard_number": 0}, {"node_id": int(env.pageservers[1].id), "shard_number": 1}, ], + "preferred_az": DEFAULT_AZ_ID, } def received_split_notification(): @@ -714,6 +718,7 @@ def test_storage_controller_stuck_compute_hook( "tenant_id": str(env.initial_tenant), "stripe_size": None, "shards": [{"node_id": int(env.pageservers[0].id), "shard_number": 0}], + "preferred_az": DEFAULT_AZ_ID, } assert notifications[0] == expect From 2ee6bc5ec42590e958fc246092f8e98202fc9173 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Tue, 17 Dec 2024 20:06:18 +0000 Subject: [PATCH 065/583] chore(proxy): update vendored postgres libs to edition 2021 (#10139) I ran `cargo fix --edition` in each project prior, and it found nothing that needed fixing. --- libs/proxy/postgres-protocol2/Cargo.toml | 2 +- libs/proxy/postgres-protocol2/src/lib.rs | 3 +-- libs/proxy/postgres-protocol2/src/message/frontend.rs | 1 - libs/proxy/postgres-types2/Cargo.toml | 2 +- libs/proxy/postgres-types2/src/lib.rs | 3 +-- libs/proxy/tokio-postgres2/Cargo.toml | 2 +- libs/proxy/tokio-postgres2/src/lib.rs | 2 +- 7 files changed, 6 insertions(+), 9 deletions(-) diff --git a/libs/proxy/postgres-protocol2/Cargo.toml b/libs/proxy/postgres-protocol2/Cargo.toml index f71c1599c7..f66a292d5e 100644 --- a/libs/proxy/postgres-protocol2/Cargo.toml +++ b/libs/proxy/postgres-protocol2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "postgres-protocol2" version = "0.1.0" -edition = "2018" +edition = "2021" license = "MIT/Apache-2.0" [dependencies] diff --git a/libs/proxy/postgres-protocol2/src/lib.rs b/libs/proxy/postgres-protocol2/src/lib.rs index 947f2f835d..6032440f9a 100644 --- a/libs/proxy/postgres-protocol2/src/lib.rs +++ b/libs/proxy/postgres-protocol2/src/lib.rs @@ -9,8 +9,7 @@ //! //! This library assumes that the `client_encoding` backend parameter has been //! set to `UTF8`. It will most likely not behave properly if that is not the case. 
-#![doc(html_root_url = "https://docs.rs/postgres-protocol/0.6")] -#![warn(missing_docs, rust_2018_idioms, clippy::all)] +#![warn(missing_docs, clippy::all)] use byteorder::{BigEndian, ByteOrder}; use bytes::{BufMut, BytesMut}; diff --git a/libs/proxy/postgres-protocol2/src/message/frontend.rs b/libs/proxy/postgres-protocol2/src/message/frontend.rs index bc6168f337..640f35ada3 100644 --- a/libs/proxy/postgres-protocol2/src/message/frontend.rs +++ b/libs/proxy/postgres-protocol2/src/message/frontend.rs @@ -3,7 +3,6 @@ use byteorder::{BigEndian, ByteOrder}; use bytes::{Buf, BufMut, BytesMut}; -use std::convert::TryFrom; use std::error::Error; use std::io; use std::marker; diff --git a/libs/proxy/postgres-types2/Cargo.toml b/libs/proxy/postgres-types2/Cargo.toml index 58cfb5571f..57efd94cd3 100644 --- a/libs/proxy/postgres-types2/Cargo.toml +++ b/libs/proxy/postgres-types2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "postgres-types2" version = "0.1.0" -edition = "2018" +edition = "2021" license = "MIT/Apache-2.0" [dependencies] diff --git a/libs/proxy/postgres-types2/src/lib.rs b/libs/proxy/postgres-types2/src/lib.rs index 18ba032151..d4f3afdfd4 100644 --- a/libs/proxy/postgres-types2/src/lib.rs +++ b/libs/proxy/postgres-types2/src/lib.rs @@ -2,8 +2,7 @@ //! //! This crate is used by the `tokio-postgres` and `postgres` crates. You normally don't need to depend directly on it //! unless you want to define your own `ToSql` or `FromSql` definitions. -#![doc(html_root_url = "https://docs.rs/postgres-types/0.2")] -#![warn(clippy::all, rust_2018_idioms, missing_docs)] +#![warn(clippy::all, missing_docs)] use fallible_iterator::FallibleIterator; use postgres_protocol2::types; diff --git a/libs/proxy/tokio-postgres2/Cargo.toml b/libs/proxy/tokio-postgres2/Cargo.toml index 7130c1b726..56e7c4da47 100644 --- a/libs/proxy/tokio-postgres2/Cargo.toml +++ b/libs/proxy/tokio-postgres2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "tokio-postgres2" version = "0.1.0" -edition = "2018" +edition = "2021" license = "MIT/Apache-2.0" [dependencies] diff --git a/libs/proxy/tokio-postgres2/src/lib.rs b/libs/proxy/tokio-postgres2/src/lib.rs index 901ed0c96c..9155dd8279 100644 --- a/libs/proxy/tokio-postgres2/src/lib.rs +++ b/libs/proxy/tokio-postgres2/src/lib.rs @@ -1,5 +1,5 @@ //! An asynchronous, pipelined, PostgreSQL client. -#![warn(rust_2018_idioms, clippy::all)] +#![warn(clippy::all)] pub use crate::cancel_token::CancelToken; pub use crate::client::{Client, SocketConfig}; From c52514ab02b96e273f8a99362e2401e4a7bc3982 Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Tue, 17 Dec 2024 21:47:44 +0100 Subject: [PATCH 066/583] Fix allure report creation on periodic `pg_regress` testing (#10171) ## Problem The allure report finishes with the error `HttpError: Resource not accessible by integration` while running the `pg_regress` test against a cloud staging project due to a lack of permissions. ## Summary of changes The permissions are added. 
--- .github/workflows/cloud-regress.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/cloud-regress.yml b/.github/workflows/cloud-regress.yml index 55f42ea533..09d6acd325 100644 --- a/.github/workflows/cloud-regress.yml +++ b/.github/workflows/cloud-regress.yml @@ -21,6 +21,8 @@ concurrency: permissions: id-token: write # aws-actions/configure-aws-credentials + statuses: write + contents: write jobs: regress: From aaf980f70d38d6a6a54494aea1b6b16e5328abf5 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 18 Dec 2024 11:34:38 +0200 Subject: [PATCH 067/583] Online checkpoint replication state (#9976) ## Problem See https://neondb.slack.com/archives/C04DGM6SMTM/p1733180965970089 Replication state is checkpointed only by shutdown checkpoint. It means that replication snapshots are not removed till compute shutdown. ## Summary of changes Checkpoint replication state during online checkpoint Related Postgres PR: https://github.com/neondatabase/postgres/pull/546 Co-authored-by: Konstantin Knizhnik --- vendor/postgres-v17 | 2 +- vendor/revisions.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index 65c4e46baf..7e3f3974bc 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit 65c4e46baf56ec05412c7dd63d62faff0b33dcfb +Subproject commit 7e3f3974bc8895938308f94d0e96879ffae638cd diff --git a/vendor/revisions.json b/vendor/revisions.json index c8db81c73f..bff2f70931 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,7 +1,7 @@ { "v17": [ "17.2", - "65c4e46baf56ec05412c7dd63d62faff0b33dcfb" + "7e3f3974bc8895938308f94d0e96879ffae638cd" ], "v16": [ "16.6", From 85696297c5dc15b74384441f318c9381d99086e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 18 Dec 2024 13:47:56 +0100 Subject: [PATCH 068/583] Add safekeepers command to storcon_cli for listing (#10151) Add a `safekeepers` subcommand to `storcon_cli` that allows listing the safekeepers. ``` $ curl -X POST --url http://localhost:1234/control/v1/safekeeper/42 --data \ '{"active":true, "id":42, "created_at":"2023-10-25T09:11:25Z", "updated_at":"2024-08-28T11:32:43Z","region_id":"neon_local","host":"localhost","port":5454,"http_port":0,"version":123,"availability_zone_id":"us-east-2b"}' $ cargo run --bin storcon_cli -- --api http://localhost:1234 safekeepers Finished `dev` profile [unoptimized + debuginfo] target(s) in 0.38s Running `target/debug/storcon_cli --api 'http://localhost:1234' safekeepers` +----+---------+-----------+------+-----------+------------+ | Id | Version | Host | Port | Http Port | AZ Id | +==========================================================+ | 42 | 123 | localhost | 5454 | 0 | us-east-2b | +----+---------+-----------+------+-----------+------------+ ``` Also: * Don't return the raw `SafekeeperPersistence` struct that contains the raw database presentation, but instead a new `SafekeeperDescribeResponse` struct. * The `SafekeeperPersistence` struct leaves out the `active` field on purpose because we want to deprecate it and replace it with a `scheduling_policy` one. 
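
For clarity, a single entry of the new listing endpoint would look roughly as sketched below, assuming the safekeeper registered in the `curl` example above. This is only an illustration based on the `SafekeeperDescribeResponse` fields introduced in this patch; `created_at`/`updated_at` and the deprecated `active` flag are deliberately absent.

```python
# One element of the GET /control/v1/safekeeper response (illustrative values).
safekeeper_entry = {
    "id": 42,
    "region_id": "neon_local",
    "version": 123,
    "host": "localhost",
    "port": 5454,
    "http_port": 0,
    "availability_zone_id": "us-east-2b",
}
```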
Part of https://github.com/neondatabase/neon/issues/9981 --- control_plane/storcon_cli/src/main.rs | 30 ++++++++++++++++++- libs/pageserver_api/src/controller_api.rs | 17 +++++++++++ storage_controller/src/persistence.rs | 13 ++++++++ storage_controller/src/service.rs | 26 +++++++++++----- .../regress/test_storage_controller.py | 2 +- 5 files changed, 78 insertions(+), 10 deletions(-) diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index df07216fde..6ee1044c18 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -5,7 +5,8 @@ use clap::{Parser, Subcommand}; use pageserver_api::{ controller_api::{ AvailabilityZone, NodeAvailabilityWrapper, NodeDescribeResponse, NodeShardResponse, - ShardSchedulingPolicy, TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest, + SafekeeperDescribeResponse, ShardSchedulingPolicy, TenantCreateRequest, + TenantDescribeResponse, TenantPolicyRequest, }, models::{ EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary, @@ -211,6 +212,8 @@ enum Command { #[arg(long)] timeout: humantime::Duration, }, + /// List safekeepers known to the storage controller + Safekeepers {}, } #[derive(Parser)] @@ -1020,6 +1023,31 @@ async fn main() -> anyhow::Result<()> { "Fill was cancelled for node {node_id}. Schedulling policy is now {final_policy:?}" ); } + Command::Safekeepers {} => { + let mut resp = storcon_client + .dispatch::<(), Vec>( + Method::GET, + "control/v1/safekeeper".to_string(), + None, + ) + .await?; + + resp.sort_by(|a, b| a.id.cmp(&b.id)); + + let mut table = comfy_table::Table::new(); + table.set_header(["Id", "Version", "Host", "Port", "Http Port", "AZ Id"]); + for sk in resp { + table.add_row([ + format!("{}", sk.id), + format!("{}", sk.version), + sk.host, + format!("{}", sk.port), + format!("{}", sk.http_port), + sk.availability_zone_id.to_string(), + ]); + } + println!("{table}"); + } } Ok(()) diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index ec7b81423a..faf11e487c 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -372,6 +372,23 @@ pub struct MetadataHealthListOutdatedResponse { pub health_records: Vec, } +/// Publicly exposed safekeeper description +/// +/// The `active` flag which we have in the DB is not included on purpose: it is deprecated. +#[derive(Serialize, Deserialize, Clone)] +pub struct SafekeeperDescribeResponse { + pub id: NodeId, + pub region_id: String, + /// 1 is special, it means just created (not currently posted to storcon). + /// Zero or negative is not really expected. + /// Otherwise the number from `release-$(number_of_commits_on_branch)` tag. 
+ pub version: i64, + pub host: String, + pub port: i32, + pub http_port: i32, + pub availability_zone_id: String, +} + #[cfg(test)] mod test { use super::*; diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index e17fe78d25..cc377e606e 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -11,6 +11,7 @@ use diesel::Connection; use itertools::Itertools; use pageserver_api::controller_api::AvailabilityZone; use pageserver_api::controller_api::MetadataHealthRecord; +use pageserver_api::controller_api::SafekeeperDescribeResponse; use pageserver_api::controller_api::ShardSchedulingPolicy; use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy}; use pageserver_api::models::TenantConfig; @@ -1241,6 +1242,18 @@ impl SafekeeperPersistence { availability_zone_id: &self.availability_zone_id, } } + pub(crate) fn as_describe_response(&self) -> SafekeeperDescribeResponse { + // omit the `active` flag on purpose: it is deprecated. + SafekeeperDescribeResponse { + id: NodeId(self.id as u64), + region_id: self.region_id.clone(), + version: self.version, + host: self.host.clone(), + port: self.port, + http_port: self.http_port, + availability_zone_id: self.availability_zone_id.clone(), + } + } } #[derive(Insertable, AsChangeset)] diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 42b50835f8..c0c5bc371a 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -46,10 +46,11 @@ use pageserver_api::{ controller_api::{ AvailabilityZone, MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, NodeShard, NodeShardResponse, PlacementPolicy, - ShardSchedulingPolicy, ShardsPreferredAzsRequest, ShardsPreferredAzsResponse, - TenantCreateRequest, TenantCreateResponse, TenantCreateResponseShard, - TenantDescribeResponse, TenantDescribeResponseShard, TenantLocateResponse, - TenantPolicyRequest, TenantShardMigrateRequest, TenantShardMigrateResponse, + SafekeeperDescribeResponse, ShardSchedulingPolicy, ShardsPreferredAzsRequest, + ShardsPreferredAzsResponse, TenantCreateRequest, TenantCreateResponse, + TenantCreateResponseShard, TenantDescribeResponse, TenantDescribeResponseShard, + TenantLocateResponse, TenantPolicyRequest, TenantShardMigrateRequest, + TenantShardMigrateResponse, }, models::{ SecondaryProgress, TenantConfigPatchRequest, TenantConfigRequest, @@ -7169,15 +7170,24 @@ impl Service { pub(crate) async fn safekeepers_list( &self, - ) -> Result, DatabaseError> { - self.persistence.list_safekeepers().await + ) -> Result, DatabaseError> { + Ok(self + .persistence + .list_safekeepers() + .await? 
+ .into_iter() + .map(|v| v.as_describe_response()) + .collect::>()) } pub(crate) async fn get_safekeeper( &self, id: i64, - ) -> Result { - self.persistence.safekeeper_get(id).await + ) -> Result { + self.persistence + .safekeeper_get(id) + .await + .map(|v| v.as_describe_response()) } pub(crate) async fn upsert_safekeeper( diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 0be800d103..7062c35e05 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -3009,7 +3009,7 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder): def eq_safekeeper_records(a: dict[str, Any], b: dict[str, Any]) -> bool: compared = [dict(a), dict(b)] - masked_keys = ["created_at", "updated_at"] + masked_keys = ["created_at", "updated_at", "active"] for d in compared: # keep deleting these in case we are comparing the body as it will be uploaded by real scripts From 1d12efc42886bcb204db4c764ea64413da5c8dba Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Wed, 18 Dec 2024 10:37:26 -0500 Subject: [PATCH 069/583] fix(pageserver): allow repartition errors during gc-compaction smoke tests (#10164) ## Problem part of https://github.com/neondatabase/neon/issues/9114 In https://github.com/neondatabase/neon/pull/10127 we fixed the race, but we didn't add the errors to the allowlist. ## Summary of changes * Allow repartition errors in the gc-compaction smoke test. I think it might be worth to refactor the code to allow multiple threads getting a copy of repartition status (i.e., using Rcu) in the future. Signed-off-by: Alex Chi Z --- pageserver/src/tenant/timeline/compaction.rs | 2 +- test_runner/regress/test_compaction.py | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 8b6cc8ed84..a4e8f39522 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -1823,7 +1823,7 @@ impl Timeline { // by estimating the amount of files read for a compaction job. We should also partition on LSN. 
let ((dense_ks, sparse_ks), _) = { let Ok(partition) = self.partitioning.try_lock() else { - bail!("failed to acquire partition lock"); + bail!("failed to acquire partition lock during gc-compaction"); }; partition.clone() }; diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index 88873c63c2..aef9a825ee 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -134,6 +134,10 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder): } env = neon_env_builder.init_start(initial_tenant_conf=SMOKE_CONF) + env.pageserver.allowed_errors.append( + r".*failed to acquire partition lock during gc-compaction.*" + ) + env.pageserver.allowed_errors.append(r".*repartition() called concurrently.*") tenant_id = env.initial_tenant timeline_id = env.initial_timeline From 1668d39b7cbe7d1bbe48152cb1b4024a6e2da90a Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Wed, 18 Dec 2024 16:51:53 +0100 Subject: [PATCH 070/583] safekeeper: fix typo in allowlist for `/profile/heap` (#10186) --- safekeeper/src/http/routes.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 9bc1bf3409..6186f4c3ba 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -564,7 +564,7 @@ pub fn make_router( if conf.http_auth.is_some() { router = router.middleware(auth_middleware(|request| { const ALLOWLIST_ROUTES: &[&str] = - &["/v1/status", "/metrics", "/profile/cpu", "profile/heap"]; + &["/v1/status", "/metrics", "/profile/cpu", "/profile/heap"]; if ALLOWLIST_ROUTES.contains(&request.uri().path()) { None } else { From d63602cc7822eb5f9670d7e7926bb412eba1ff3a Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 18 Dec 2024 16:03:14 +0000 Subject: [PATCH 071/583] chore(proxy): fully remove allow-self-signed-compute flag (#10168) When https://github.com/neondatabase/cloud/pull/21856 is merged, this flag is no longer necessary. --- proxy/src/bin/local_proxy.rs | 1 - proxy/src/bin/proxy.rs | 7 --- proxy/src/cancellation.rs | 44 +++++++---------- proxy/src/compute.rs | 69 +++------------------------ proxy/src/config.rs | 1 - proxy/src/console_redirect_proxy.rs | 1 - proxy/src/control_plane/mod.rs | 5 +- proxy/src/proxy/connect_compute.rs | 9 +--- proxy/src/proxy/mod.rs | 2 - test_runner/fixtures/neon_fixtures.py | 1 - 10 files changed, 25 insertions(+), 115 deletions(-) diff --git a/proxy/src/bin/local_proxy.rs b/proxy/src/bin/local_proxy.rs index 968682cf0f..56bbd94850 100644 --- a/proxy/src/bin/local_proxy.rs +++ b/proxy/src/bin/local_proxy.rs @@ -271,7 +271,6 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig Ok(Box::leak(Box::new(ProxyConfig { tls_config: None, metric_collection: None, - allow_self_signed_compute: false, http_config, authentication_config: AuthenticationConfig { jwks_cache: JwkCache::default(), diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index e90555e250..3dcf9ca060 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -129,9 +129,6 @@ struct ProxyCliArgs { /// lock for `connect_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable). 
#[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK)] connect_compute_lock: String, - /// Allow self-signed certificates for compute nodes (for testing) - #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] - allow_self_signed_compute: bool, #[clap(flatten)] sql_over_http: SqlOverHttpArgs, /// timeout for scram authentication protocol @@ -564,9 +561,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { _ => bail!("either both or neither tls-key and tls-cert must be specified"), }; - if args.allow_self_signed_compute { - warn!("allowing self-signed compute certificates"); - } let backup_metric_collection_config = config::MetricBackupCollectionConfig { interval: args.metric_backup_collection_interval, remote_storage_config: args.metric_backup_collection_remote_storage.clone(), @@ -641,7 +635,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { let config = ProxyConfig { tls_config, metric_collection, - allow_self_signed_compute: args.allow_self_signed_compute, http_config, authentication_config, proxy_protocol_v2: args.proxy_protocol_v2, diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index a58e3961da..ebaea173ae 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -4,7 +4,8 @@ use std::sync::Arc; use dashmap::DashMap; use ipnet::{IpNet, Ipv4Net, Ipv6Net}; use once_cell::sync::OnceCell; -use postgres_client::{tls::MakeTlsConnect, CancelToken}; +use postgres_client::tls::MakeTlsConnect; +use postgres_client::CancelToken; use pq_proto::CancelKeyData; use rustls::crypto::ring; use thiserror::Error; @@ -14,17 +15,16 @@ use tracing::{debug, info}; use uuid::Uuid; use crate::auth::{check_peer_addr_is_in_list, IpPattern}; +use crate::compute::load_certs; use crate::error::ReportableError; use crate::ext::LockExt; use crate::metrics::{CancellationRequest, CancellationSource, Metrics}; +use crate::postgres_rustls::MakeRustlsConnect; use crate::rate_limiter::LeakyBucketRateLimiter; use crate::redis::cancellation_publisher::{ CancellationPublisher, CancellationPublisherMut, RedisPublisherClient, }; -use crate::compute::{load_certs, AcceptEverythingVerifier}; -use crate::postgres_rustls::MakeRustlsConnect; - pub type CancelMap = Arc>>; pub type CancellationHandlerMain = CancellationHandler>>>; pub(crate) type CancellationHandlerMainInternal = Option>>; @@ -240,7 +240,6 @@ pub struct CancelClosure { cancel_token: CancelToken, ip_allowlist: Vec, hostname: String, // for pg_sni router - allow_self_signed_compute: bool, } impl CancelClosure { @@ -249,45 +248,34 @@ impl CancelClosure { cancel_token: CancelToken, ip_allowlist: Vec, hostname: String, - allow_self_signed_compute: bool, ) -> Self { Self { socket_addr, cancel_token, ip_allowlist, hostname, - allow_self_signed_compute, } } /// Cancels the query running on user's compute node. pub(crate) async fn try_cancel_query(self) -> Result<(), CancelError> { let socket = TcpStream::connect(self.socket_addr).await?; - let client_config = if self.allow_self_signed_compute { - // Allow all certificates for creating the connection. 
Used only for tests - let verifier = Arc::new(AcceptEverythingVerifier); - rustls::ClientConfig::builder_with_provider(Arc::new(ring::default_provider())) - .with_safe_default_protocol_versions() - .expect("ring should support the default protocol versions") - .dangerous() - .with_custom_certificate_verifier(verifier) - } else { - let root_store = TLS_ROOTS - .get_or_try_init(load_certs) - .map_err(|_e| { - CancelError::IO(std::io::Error::new( - std::io::ErrorKind::Other, - "TLS root store initialization failed".to_string(), - )) - })? - .clone(); + let root_store = TLS_ROOTS + .get_or_try_init(load_certs) + .map_err(|_e| { + CancelError::IO(std::io::Error::new( + std::io::ErrorKind::Other, + "TLS root store initialization failed".to_string(), + )) + })? + .clone(); + + let client_config = rustls::ClientConfig::builder_with_provider(Arc::new(ring::default_provider())) .with_safe_default_protocol_versions() .expect("ring should support the default protocol versions") .with_root_certificates(root_store) - }; - - let client_config = client_config.with_no_client_auth(); + .with_no_client_auth(); let mut mk_tls = crate::postgres_rustls::MakeRustlsConnect::new(client_config); let tls = >::make_tls_connect( diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 42df5ff5e3..8dc9b59e81 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -10,7 +10,6 @@ use postgres_client::tls::MakeTlsConnect; use postgres_client::{CancelToken, RawConnection}; use postgres_protocol::message::backend::NoticeResponseBody; use pq_proto::StartupMessageParams; -use rustls::client::danger::ServerCertVerifier; use rustls::crypto::ring; use rustls::pki_types::InvalidDnsNameError; use thiserror::Error; @@ -251,7 +250,6 @@ impl ConnCfg { pub(crate) async fn connect( &self, ctx: &RequestContext, - allow_self_signed_compute: bool, aux: MetricsAuxInfo, timeout: Duration, ) -> Result { @@ -259,25 +257,17 @@ impl ConnCfg { let (socket_addr, stream, host) = self.connect_raw(timeout).await?; drop(pause); - let client_config = if allow_self_signed_compute { - // Allow all certificates for creating the connection - let verifier = Arc::new(AcceptEverythingVerifier); - rustls::ClientConfig::builder_with_provider(Arc::new(ring::default_provider())) - .with_safe_default_protocol_versions() - .expect("ring should support the default protocol versions") - .dangerous() - .with_custom_certificate_verifier(verifier) - } else { - let root_store = TLS_ROOTS - .get_or_try_init(load_certs) - .map_err(ConnectionError::TlsCertificateError)? - .clone(); + let root_store = TLS_ROOTS + .get_or_try_init(load_certs) + .map_err(ConnectionError::TlsCertificateError)? 
+ .clone(); + + let client_config = rustls::ClientConfig::builder_with_provider(Arc::new(ring::default_provider())) .with_safe_default_protocol_versions() .expect("ring should support the default protocol versions") .with_root_certificates(root_store) - }; - let client_config = client_config.with_no_client_auth(); + .with_no_client_auth(); let mut mk_tls = crate::postgres_rustls::MakeRustlsConnect::new(client_config); let tls = >::make_tls_connect( @@ -320,7 +310,6 @@ impl ConnCfg { }, vec![], host.to_string(), - allow_self_signed_compute, ); let connection = PostgresConnection { @@ -365,50 +354,6 @@ pub(crate) fn load_certs() -> Result, Vec> = OnceCell::new(); -#[derive(Debug)] -pub(crate) struct AcceptEverythingVerifier; -impl ServerCertVerifier for AcceptEverythingVerifier { - fn supported_verify_schemes(&self) -> Vec { - use rustls::SignatureScheme; - // The schemes for which `SignatureScheme::supported_in_tls13` returns true. - vec![ - SignatureScheme::ECDSA_NISTP521_SHA512, - SignatureScheme::ECDSA_NISTP384_SHA384, - SignatureScheme::ECDSA_NISTP256_SHA256, - SignatureScheme::RSA_PSS_SHA512, - SignatureScheme::RSA_PSS_SHA384, - SignatureScheme::RSA_PSS_SHA256, - SignatureScheme::ED25519, - ] - } - fn verify_server_cert( - &self, - _end_entity: &rustls::pki_types::CertificateDer<'_>, - _intermediates: &[rustls::pki_types::CertificateDer<'_>], - _server_name: &rustls::pki_types::ServerName<'_>, - _ocsp_response: &[u8], - _now: rustls::pki_types::UnixTime, - ) -> Result { - Ok(rustls::client::danger::ServerCertVerified::assertion()) - } - fn verify_tls12_signature( - &self, - _message: &[u8], - _cert: &rustls::pki_types::CertificateDer<'_>, - _dss: &rustls::DigitallySignedStruct, - ) -> Result { - Ok(rustls::client::danger::HandshakeSignatureValid::assertion()) - } - fn verify_tls13_signature( - &self, - _message: &[u8], - _cert: &rustls::pki_types::CertificateDer<'_>, - _dss: &rustls::DigitallySignedStruct, - ) -> Result { - Ok(rustls::client::danger::HandshakeSignatureValid::assertion()) - } -} - #[cfg(test)] mod tests { use super::*; diff --git a/proxy/src/config.rs b/proxy/src/config.rs index debd77ac32..33d1d2e9e4 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -25,7 +25,6 @@ use crate::types::Host; pub struct ProxyConfig { pub tls_config: Option, pub metric_collection: Option, - pub allow_self_signed_compute: bool, pub http_config: HttpConfig, pub authentication_config: AuthenticationConfig, pub proxy_protocol_v2: ProxyProtocolV2, diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index 02398fb777..c477822e85 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -213,7 +213,6 @@ pub(crate) async fn handle_client( params_compat: true, params: ¶ms, locks: &config.connect_compute_locks, - allow_self_signed_compute: config.allow_self_signed_compute, }, &user_info, config.wake_compute_retry_config, diff --git a/proxy/src/control_plane/mod.rs b/proxy/src/control_plane/mod.rs index c0718920b4..0ca1a6aae0 100644 --- a/proxy/src/control_plane/mod.rs +++ b/proxy/src/control_plane/mod.rs @@ -73,12 +73,9 @@ impl NodeInfo { pub(crate) async fn connect( &self, ctx: &RequestContext, - allow_self_signed_compute: bool, timeout: Duration, ) -> Result { - self.config - .connect(ctx, allow_self_signed_compute, self.aux.clone(), timeout) - .await + self.config.connect(ctx, self.aux.clone(), timeout).await } pub(crate) fn reuse_settings(&mut self, other: Self) { diff --git a/proxy/src/proxy/connect_compute.rs 
b/proxy/src/proxy/connect_compute.rs index 6da4c90a53..4a30d23985 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -73,9 +73,6 @@ pub(crate) struct TcpMechanism<'a> { /// connect_to_compute concurrency lock pub(crate) locks: &'static ApiLocks, - - /// Whether we should accept self-signed certificates (for testing) - pub(crate) allow_self_signed_compute: bool, } #[async_trait] @@ -93,11 +90,7 @@ impl ConnectMechanism for TcpMechanism<'_> { ) -> Result { let host = node_info.config.get_host(); let permit = self.locks.get_permit(&host).await?; - permit.release_result( - node_info - .connect(ctx, self.allow_self_signed_compute, timeout) - .await, - ) + permit.release_result(node_info.connect(ctx, timeout).await) } fn update_connect_config(&self, config: &mut compute::ConnCfg) { diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index 4e5ecda237..dbe174cab7 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -348,8 +348,6 @@ pub(crate) async fn handle_client( params_compat, params: ¶ms, locks: &config.connect_compute_locks, - // only used for console redirect testing. - allow_self_signed_compute: false, }, &user_info, config.wake_compute_retry_config, diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 2553a0c99a..9f78ad120b 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3222,7 +3222,6 @@ class NeonProxy(PgProtocol): # Link auth backend params *["--auth-backend", "link"], *["--uri", NeonProxy.link_auth_uri], - *["--allow-self-signed-compute", "true"], ] class ProxyV1(AuthBackend): From 835287ba3aa1e6a4fea2c1929fbd601de9354218 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 18 Dec 2024 16:29:47 +0000 Subject: [PATCH 072/583] neon_local: add a `flock` to protect against concurrent execution (#10185) ## Problem `neon_local` has always been unsafe to run concurrently with itself: it uses simple text files for persistent state, and concurrent runs will step on each other. In some test environments we intentionally handle this with mutexes in python land, but it's fragile to try and always remember to do that. ## Summary of changes - Add a `flock` based mutex around the `main` function of neon_local, using the repo directory as the file to lock - Clean up an Option<> around control_plane_api, this is a drive-by change because it was one of the fields that had a weird effect when previous concurrent stuff stamped on it. 
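
As a side note, the guarantee the new guard relies on is plain advisory `flock` semantics: a second process that tries to take the exclusive lock on the repo directory blocks until the first one releases it, which happens automatically when its descriptor is closed or the process exits. A minimal sketch of that behaviour (the repo path is illustrative, not taken from the patch):

```python
import fcntl
import os

# Open the repository directory itself and take an exclusive advisory lock;
# a concurrent invocation doing the same will block here until we are done.
fd = os.open(".neon", os.O_RDONLY)  # illustrative repo path
fcntl.flock(fd, fcntl.LOCK_EX)
try:
    pass  # run the neon_local command while holding the lock
finally:
    os.close(fd)  # closing the descriptor releases the lock
```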
--- control_plane/src/bin/neon_local.rs | 59 ++++++++++++++++--------- control_plane/src/local_env.rs | 10 ++--- control_plane/src/pageserver.rs | 26 +++++------ control_plane/src/storage_controller.rs | 4 +- 4 files changed, 57 insertions(+), 42 deletions(-) diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 1ea443b026..c73debae4c 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -19,6 +19,7 @@ use control_plane::storage_controller::{ NeonStorageControllerStartArgs, NeonStorageControllerStopArgs, StorageController, }; use control_plane::{broker, local_env}; +use nix::fcntl::{flock, FlockArg}; use pageserver_api::config::{ DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT, DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT, @@ -36,6 +37,8 @@ use safekeeper_api::{ }; use std::borrow::Cow; use std::collections::{BTreeSet, HashMap}; +use std::fs::File; +use std::os::fd::AsRawFd; use std::path::PathBuf; use std::process::exit; use std::str::FromStr; @@ -689,6 +692,21 @@ struct TimelineTreeEl { pub children: BTreeSet, } +/// A flock-based guard over the neon_local repository directory +struct RepoLock { + _file: File, +} + +impl RepoLock { + fn new() -> Result { + let repo_dir = File::open(local_env::base_path())?; + let repo_dir_fd = repo_dir.as_raw_fd(); + flock(repo_dir_fd, FlockArg::LockExclusive)?; + + Ok(Self { _file: repo_dir }) + } +} + // Main entry point for the 'neon_local' CLI utility // // This utility helps to manage neon installation. That includes following: @@ -700,9 +718,14 @@ fn main() -> Result<()> { let cli = Cli::parse(); // Check for 'neon init' command first. - let subcommand_result = if let NeonLocalCmd::Init(args) = cli.command { - handle_init(&args).map(|env| Some(Cow::Owned(env))) + let (subcommand_result, _lock) = if let NeonLocalCmd::Init(args) = cli.command { + (handle_init(&args).map(|env| Some(Cow::Owned(env))), None) } else { + // This tool uses a collection of simple files to store its state, and consequently + // it is not generally safe to run multiple commands concurrently. Rather than expect + // all callers to know this, use a lock file to protect against concurrent execution. 
+ let _repo_lock = RepoLock::new().unwrap(); + // all other commands need an existing config let env = LocalEnv::load_config(&local_env::base_path()).context("Error loading config")?; let original_env = env.clone(); @@ -728,11 +751,12 @@ fn main() -> Result<()> { NeonLocalCmd::Mappings(subcmd) => handle_mappings(&subcmd, env), }; - if &original_env != env { + let subcommand_result = if &original_env != env { subcommand_result.map(|()| Some(Cow::Borrowed(env))) } else { subcommand_result.map(|()| None) - } + }; + (subcommand_result, Some(_repo_lock)) }; match subcommand_result { @@ -922,7 +946,7 @@ fn handle_init(args: &InitCmdArgs) -> anyhow::Result { } else { // User (likely interactive) did not provide a description of the environment, give them the default NeonLocalInitConf { - control_plane_api: Some(Some(DEFAULT_PAGESERVER_CONTROL_PLANE_API.parse().unwrap())), + control_plane_api: Some(DEFAULT_PAGESERVER_CONTROL_PLANE_API.parse().unwrap()), broker: NeonBroker { listen_addr: DEFAULT_BROKER_ADDR.parse().unwrap(), }, @@ -1718,18 +1742,15 @@ async fn handle_start_all_impl( broker::start_broker_process(env, &retry_timeout).await }); - // Only start the storage controller if the pageserver is configured to need it - if env.control_plane_api.is_some() { - js.spawn(async move { - let storage_controller = StorageController::from_env(env); - storage_controller - .start(NeonStorageControllerStartArgs::with_default_instance_id( - retry_timeout, - )) - .await - .map_err(|e| e.context("start storage_controller")) - }); - } + js.spawn(async move { + let storage_controller = StorageController::from_env(env); + storage_controller + .start(NeonStorageControllerStartArgs::with_default_instance_id( + retry_timeout, + )) + .await + .map_err(|e| e.context("start storage_controller")) + }); for ps_conf in &env.pageservers { js.spawn(async move { @@ -1774,10 +1795,6 @@ async fn neon_start_status_check( const RETRY_INTERVAL: Duration = Duration::from_millis(100); const NOTICE_AFTER_RETRIES: Duration = Duration::from_secs(5); - if env.control_plane_api.is_none() { - return Ok(()); - } - let storcon = StorageController::from_env(env); let retries = retry_timeout.as_millis() / RETRY_INTERVAL.as_millis(); diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 032c88a829..489f9c8509 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -76,7 +76,7 @@ pub struct LocalEnv { // Control plane upcall API for pageserver: if None, we will not run storage_controller If set, this will // be propagated into each pageserver's configuration. - pub control_plane_api: Option, + pub control_plane_api: Url, // Control plane upcall API for storage controller. If set, this will be propagated into the // storage controller's configuration. 
@@ -133,7 +133,7 @@ pub struct NeonLocalInitConf { pub storage_controller: Option, pub pageservers: Vec, pub safekeepers: Vec, - pub control_plane_api: Option>, + pub control_plane_api: Option, pub control_plane_compute_hook_api: Option>, } @@ -535,7 +535,7 @@ impl LocalEnv { storage_controller, pageservers, safekeepers, - control_plane_api, + control_plane_api: control_plane_api.unwrap(), control_plane_compute_hook_api, branch_name_mappings, } @@ -638,7 +638,7 @@ impl LocalEnv { storage_controller: self.storage_controller.clone(), pageservers: vec![], // it's skip_serializing anyway safekeepers: self.safekeepers.clone(), - control_plane_api: self.control_plane_api.clone(), + control_plane_api: Some(self.control_plane_api.clone()), control_plane_compute_hook_api: self.control_plane_compute_hook_api.clone(), branch_name_mappings: self.branch_name_mappings.clone(), }, @@ -768,7 +768,7 @@ impl LocalEnv { storage_controller: storage_controller.unwrap_or_default(), pageservers: pageservers.iter().map(Into::into).collect(), safekeepers, - control_plane_api: control_plane_api.unwrap_or_default(), + control_plane_api: control_plane_api.unwrap(), control_plane_compute_hook_api: control_plane_compute_hook_api.unwrap_or_default(), branch_name_mappings: Default::default(), }; diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 9d3f018345..ef5b3d6593 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -95,21 +95,19 @@ impl PageServerNode { let mut overrides = vec![pg_distrib_dir_param, broker_endpoint_param]; - if let Some(control_plane_api) = &self.env.control_plane_api { - overrides.push(format!( - "control_plane_api='{}'", - control_plane_api.as_str() - )); + overrides.push(format!( + "control_plane_api='{}'", + self.env.control_plane_api.as_str() + )); - // Storage controller uses the same auth as pageserver: if JWT is enabled - // for us, we will also need it to talk to them. - if matches!(conf.http_auth_type, AuthType::NeonJWT) { - let jwt_token = self - .env - .generate_auth_token(&Claims::new(None, Scope::GenerationsApi)) - .unwrap(); - overrides.push(format!("control_plane_api_token='{}'", jwt_token)); - } + // Storage controller uses the same auth as pageserver: if JWT is enabled + // for us, we will also need it to talk to them. + if matches!(conf.http_auth_type, AuthType::NeonJWT) { + let jwt_token = self + .env + .generate_auth_token(&Claims::new(None, Scope::GenerationsApi)) + .unwrap(); + overrides.push(format!("control_plane_api_token='{}'", jwt_token)); } if !conf.other.contains_key("remote_storage") { diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index b70bd2e1b5..22d2420ed4 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -338,7 +338,7 @@ impl StorageController { .port(), ) } else { - let listen_url = self.env.control_plane_api.clone().unwrap(); + let listen_url = self.env.control_plane_api.clone(); let listen = format!( "{}:{}", @@ -708,7 +708,7 @@ impl StorageController { } else { // The configured URL has the /upcall path prefix for pageservers to use: we will strip that out // for general purpose API access. - let listen_url = self.env.control_plane_api.clone().unwrap(); + let listen_url = self.env.control_plane_api.clone(); Url::from_str(&format!( "http://{}:{}/{path}", listen_url.host_str().unwrap(), From 3d1c3a80ae0fd2babe42d7fc2293cb2a058ff8cb Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." 
<4198311+skyzh@users.noreply.github.com> Date: Wed, 18 Dec 2024 13:09:02 -0500 Subject: [PATCH 073/583] feat(pageserver): add compact queue http endpoint (#10173) ## Problem We cannot get the size of the compaction queue and access the info. Part of #9114 ## Summary of changes * Add an API endpoint to get the compaction queue. * gc_compaction test case now waits until the compaction finishes. --------- Signed-off-by: Alex Chi Z --- libs/pageserver_api/src/models.rs | 64 +++++++++++++++++++++++++ pageserver/src/http/routes.rs | 36 +++++++++++++- pageserver/src/tenant.rs | 23 +++++++-- pageserver/src/tenant/timeline.rs | 63 ++---------------------- test_runner/fixtures/pageserver/http.py | 13 ++++- test_runner/regress/test_compaction.py | 6 +++ 6 files changed, 139 insertions(+), 66 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 5690b643f0..f3fc9fad76 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -6,6 +6,7 @@ pub mod utilization; use camino::Utf8PathBuf; pub use utilization::PageserverUtilization; +use core::ops::Range; use std::{ collections::HashMap, fmt::Display, @@ -28,6 +29,7 @@ use utils::{ }; use crate::{ + key::Key, reltag::RelTag, shard::{ShardCount, ShardStripeSize, TenantShardId}, }; @@ -210,6 +212,68 @@ pub enum TimelineState { Broken { reason: String, backtrace: String }, } +#[serde_with::serde_as] +#[derive(Debug, Clone, serde::Deserialize, serde::Serialize)] +pub struct CompactLsnRange { + pub start: Lsn, + pub end: Lsn, +} + +#[serde_with::serde_as] +#[derive(Debug, Clone, serde::Deserialize, serde::Serialize)] +pub struct CompactKeyRange { + #[serde_as(as = "serde_with::DisplayFromStr")] + pub start: Key, + #[serde_as(as = "serde_with::DisplayFromStr")] + pub end: Key, +} + +impl From> for CompactLsnRange { + fn from(range: Range) -> Self { + Self { + start: range.start, + end: range.end, + } + } +} + +impl From> for CompactKeyRange { + fn from(range: Range) -> Self { + Self { + start: range.start, + end: range.end, + } + } +} + +impl From for Range { + fn from(range: CompactLsnRange) -> Self { + range.start..range.end + } +} + +impl From for Range { + fn from(range: CompactKeyRange) -> Self { + range.start..range.end + } +} + +impl CompactLsnRange { + pub fn above(lsn: Lsn) -> Self { + Self { + start: lsn, + end: Lsn::MAX, + } + } +} + +#[derive(Debug, Clone, Serialize)] +pub struct CompactInfoResponse { + pub compact_key_range: Option, + pub compact_lsn_range: Option, + pub sub_compaction: bool, +} + #[derive(Serialize, Deserialize, Clone)] pub struct TimelineCreateRequest { pub new_timeline_id: TimelineId, diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index db7d293856..60ef4c3702 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -97,8 +97,8 @@ use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError}; use crate::DEFAULT_PG_VERSION; use crate::{disk_usage_eviction_task, tenant}; use pageserver_api::models::{ - StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest, TimelineGcRequest, - TimelineInfo, + CompactInfoResponse, StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest, + TimelineGcRequest, TimelineInfo, }; use utils::{ auth::SwappableJwtAuth, @@ -2039,6 +2039,34 @@ async fn timeline_cancel_compact_handler( .await } +// Get compact info of a timeline +async fn timeline_compact_info_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { 
+ let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let state = get_state(&request); + async { + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + let res = tenant.get_scheduled_compaction_tasks(timeline_id); + let mut resp = Vec::new(); + for item in res { + resp.push(CompactInfoResponse { + compact_key_range: item.compact_key_range, + compact_lsn_range: item.compact_lsn_range, + sub_compaction: item.sub_compaction, + }); + } + json_response(StatusCode::OK, resp) + } + .instrument(info_span!("timeline_compact_info", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) + .await +} + // Run compaction immediately on given timeline. async fn timeline_compact_handler( mut request: Request, @@ -3400,6 +3428,10 @@ pub fn make_router( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/do_gc", |r| api_handler(r, timeline_gc_handler), ) + .get( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact", + |r| api_handler(r, timeline_compact_info_handler), + ) .put( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact", |r| api_handler(r, timeline_compact_handler), diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 99289d5f15..2e4c47c6e4 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3122,6 +3122,23 @@ impl Tenant { } } + pub(crate) fn get_scheduled_compaction_tasks( + &self, + timeline_id: TimelineId, + ) -> Vec { + use itertools::Itertools; + let guard = self.scheduled_compaction_tasks.lock().unwrap(); + guard + .get(&timeline_id) + .map(|tline_pending_tasks| { + tline_pending_tasks + .iter() + .map(|x| x.options.clone()) + .collect_vec() + }) + .unwrap_or_default() + } + /// Schedule a compaction task for a timeline. 
pub(crate) async fn schedule_compaction( &self, @@ -5759,13 +5776,13 @@ mod tests { use timeline::{CompactOptions, DeltaLayerTestDesc}; use utils::id::TenantId; + #[cfg(feature = "testing")] + use models::CompactLsnRange; #[cfg(feature = "testing")] use pageserver_api::record::NeonWalRecord; #[cfg(feature = "testing")] use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn}; #[cfg(feature = "testing")] - use timeline::CompactLsnRange; - #[cfg(feature = "testing")] use timeline::GcInfo; static TEST_KEY: Lazy = @@ -9634,7 +9651,7 @@ mod tests { #[cfg(feature = "testing")] #[tokio::test] async fn test_simple_bottom_most_compaction_on_branch() -> anyhow::Result<()> { - use timeline::CompactLsnRange; + use models::CompactLsnRange; let harness = TenantHarness::create("test_simple_bottom_most_compaction_on_branch").await?; let (tenant, ctx) = harness.load().await; diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 87f5a03382..e71cb4db80 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -31,9 +31,9 @@ use pageserver_api::{ }, keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning}, models::{ - CompactionAlgorithm, CompactionAlgorithmSettings, DownloadRemoteLayersTaskInfo, - DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, InMemoryLayerInfo, LayerMapInfo, - LsnLease, TimelineState, + CompactKeyRange, CompactLsnRange, CompactionAlgorithm, CompactionAlgorithmSettings, + DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, + InMemoryLayerInfo, LayerMapInfo, LsnLease, TimelineState, }, reltag::BlockNumber, shard::{ShardIdentity, ShardNumber, TenantShardId}, @@ -788,63 +788,6 @@ pub(crate) struct CompactRequest { pub sub_compaction_max_job_size_mb: Option, } -#[serde_with::serde_as] -#[derive(Debug, Clone, serde::Deserialize)] -pub(crate) struct CompactLsnRange { - pub start: Lsn, - pub end: Lsn, -} - -#[serde_with::serde_as] -#[derive(Debug, Clone, serde::Deserialize)] -pub(crate) struct CompactKeyRange { - #[serde_as(as = "serde_with::DisplayFromStr")] - pub start: Key, - #[serde_as(as = "serde_with::DisplayFromStr")] - pub end: Key, -} - -impl From> for CompactLsnRange { - fn from(range: Range) -> Self { - Self { - start: range.start, - end: range.end, - } - } -} - -impl From> for CompactKeyRange { - fn from(range: Range) -> Self { - Self { - start: range.start, - end: range.end, - } - } -} - -impl From for Range { - fn from(range: CompactLsnRange) -> Self { - range.start..range.end - } -} - -impl From for Range { - fn from(range: CompactKeyRange) -> Self { - range.start..range.end - } -} - -impl CompactLsnRange { - #[cfg(test)] - #[cfg(feature = "testing")] - pub fn above(lsn: Lsn) -> Self { - Self { - start: lsn, - end: Lsn::MAX, - } - } -} - #[derive(Debug, Clone, Default)] pub(crate) struct CompactOptions { pub flags: EnumSet, diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index eabdeb1053..378e568622 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -738,6 +738,18 @@ class PageserverHttpClient(requests.Session, MetricsGetter): res_json = res.json() assert res_json is None + def timeline_compact_info( + self, + tenant_id: TenantId | TenantShardId, + timeline_id: TimelineId, + ) -> Any: + res = self.get( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/compact", + ) + self.verbose_error(res) + res_json = res.json() + return res_json + def 
timeline_compact( self, tenant_id: TenantId | TenantShardId, @@ -749,7 +761,6 @@ class PageserverHttpClient(requests.Session, MetricsGetter): enhanced_gc_bottom_most_compaction=False, body: dict[str, Any] | None = None, ): - self.is_testing_enabled_or_skip() query = {} if force_repartition: query["force_repartition"] = "true" diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index aef9a825ee..ae48a8fc27 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -176,6 +176,12 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder): workload.churn_rows(row_count, env.pageserver.id) + def compaction_finished(): + queue_depth = len(ps_http.timeline_compact_info(tenant_id, timeline_id)) + assert queue_depth == 0 + + wait_until(compaction_finished, timeout=60) + # ensure gc_compaction is scheduled and it's actually running (instead of skipping due to no layers picked) env.pageserver.assert_log_contains( "scheduled_compact_timeline.*picked .* layers for compaction" From 6d3e8096fcad394d387b59fa300f07fb19613760 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Wed, 18 Dec 2024 13:10:05 -0500 Subject: [PATCH 074/583] refactor(test): tighten up test_gc_feedback (#10126) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem In https://github.com/neondatabase/neon/pull/8103 we changed the test case to have more test coverage of gc_compaction. Now that we have `test_gc_compaction_smoke`, we can revert this test case to serve its original purpose and revert the parameter changes. part of https://github.com/neondatabase/neon/issues/9114 ## Summary of changes * Revert pitr_interval from 60s to 10s. * Assert the physical/logical size ratio in the benchmark. 
--------- Signed-off-by: Alex Chi Z Co-authored-by: Arpad Müller --- test_runner/performance/test_gc_feedback.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/test_runner/performance/test_gc_feedback.py b/test_runner/performance/test_gc_feedback.py index 07f244da0c..acb7b56fd0 100644 --- a/test_runner/performance/test_gc_feedback.py +++ b/test_runner/performance/test_gc_feedback.py @@ -22,7 +22,7 @@ def gc_feedback_impl(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma "checkpoint_distance": f"{1024 ** 2}", "compaction_target_size": f"{1024 ** 2}", # set PITR interval to be small, so we can do GC - "pitr_interval": "60 s", + "pitr_interval": "10 s", # "compaction_threshold": "3", # "image_creation_threshold": "2", } @@ -32,6 +32,7 @@ def gc_feedback_impl(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma n_steps = 10 n_update_iters = 100 step_size = 10000 + branch_created = 0 with endpoint.cursor() as cur: cur.execute("SET statement_timeout='1000s'") cur.execute( @@ -66,6 +67,7 @@ def gc_feedback_impl(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma if mode == "with_snapshots": if step == n_steps / 2: env.create_branch("child") + branch_created += 1 max_num_of_deltas_above_image = 0 max_total_num_of_deltas = 0 @@ -142,6 +144,15 @@ def gc_feedback_impl(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma with layer_map_path.open("w") as f: f.write(json.dumps(client.timeline_layer_map_info(tenant_id, timeline_id))) + # We should have collected all garbage + if mode == "normal": + # in theory we should get physical size ~= logical size, but given that gc interval is 10s, + # and the layer has indexes that might contribute to the fluctuation, we allow a small margin + # of 1 here, and the end ratio we are asserting is 1 (margin) + 1 (expected) = 2. + assert physical_size / logical_size < 2 + elif mode == "with_snapshots": + assert physical_size / logical_size < (2 + branch_created) + @pytest.mark.timeout(10000) def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker): From 61fcf64c22b7464aa9beb5f22ec9ded96891a12f Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 18 Dec 2024 21:15:38 +0200 Subject: [PATCH 075/583] Fix flukyness of test_physical_and_logical_replicaiton.py (#10176) ## Problem See https://github.com/neondatabase/neon/issues/10037 test_physical_and_logical_replication.py sometimes failed. 
## Summary of changes Add `wait_replica_caughtup` to wait for replica sync Co-authored-by: Konstantin Knizhnik --- test_runner/regress/test_physical_and_logical_replicaiton.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test_runner/regress/test_physical_and_logical_replicaiton.py b/test_runner/regress/test_physical_and_logical_replicaiton.py index ad2d0871b8..3f9824ee67 100644 --- a/test_runner/regress/test_physical_and_logical_replicaiton.py +++ b/test_runner/regress/test_physical_and_logical_replicaiton.py @@ -2,7 +2,7 @@ from __future__ import annotations import time -from fixtures.neon_fixtures import NeonEnv, logical_replication_sync +from fixtures.neon_fixtures import NeonEnv, logical_replication_sync, wait_replica_caughtup def test_physical_and_logical_replication_slot_not_copied(neon_simple_env: NeonEnv, vanilla_pg): @@ -38,6 +38,8 @@ def test_physical_and_logical_replication_slot_not_copied(neon_simple_env: NeonE for pk in range(n_records): p_cur.execute("insert into t (pk) values (%s)", (pk,)) + wait_replica_caughtup(primary, secondary) + s_cur.execute("select count(*) from t") assert s_cur.fetchall()[0][0] == n_records From cc138b56f983c8fc66e737c5e02898121fdd72ec Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Thu, 19 Dec 2024 04:45:06 -0500 Subject: [PATCH 076/583] fix(pageserver): run psql in thread to avoid blocking (#10177) ## Problem ref https://github.com/neondatabase/neon/issues/10170 ref https://github.com/neondatabase/neon/issues/9994 The psql command will block the main thread, causing other async tasks to timeout (i.e., HTTP connect). Therefore, we need to move it to an I/O executor thread. ## Summary of changes * run psql connection in a thread --------- Signed-off-by: Alex Chi Z Co-authored-by: John Spray --- .../regress/test_pageserver_layer_rolling.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/test_runner/regress/test_pageserver_layer_rolling.py b/test_runner/regress/test_pageserver_layer_rolling.py index 706da1e35e..fcc465f90a 100644 --- a/test_runner/regress/test_pageserver_layer_rolling.py +++ b/test_runner/regress/test_pageserver_layer_rolling.py @@ -22,7 +22,10 @@ CHECKPOINT_TIMEOUT_SECONDS = 60 async def run_worker_for_tenant( - env: NeonEnv, entries: int, tenant: TenantId, offset: int | None = None + env: NeonEnv, + entries: int, + tenant: TenantId, + offset: int | None = None, ) -> Lsn: if offset is None: offset = 0 @@ -37,12 +40,20 @@ async def run_worker_for_tenant( finally: await conn.close(timeout=10) - last_flush_lsn = Lsn(ep.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + loop = asyncio.get_running_loop() + sql = await loop.run_in_executor( + None, lambda ep: ep.safe_psql("SELECT pg_current_wal_flush_lsn()"), ep + ) + last_flush_lsn = Lsn(sql[0][0]) return last_flush_lsn async def run_worker(env: NeonEnv, tenant_conf, entries: int) -> tuple[TenantId, TimelineId, Lsn]: - tenant, timeline = env.create_tenant(conf=tenant_conf) + loop = asyncio.get_running_loop() + # capture tenant_conf by specifying `tenant_conf=tenant_conf`, otherwise it will be evaluated to some random value + tenant, timeline = await loop.run_in_executor( + None, lambda tenant_conf, env: env.create_tenant(conf=tenant_conf), tenant_conf, env + ) last_flush_lsn = await run_worker_for_tenant(env, entries, tenant) return tenant, timeline, last_flush_lsn From a1b0558493930dca4f5c86db658ad295b9beabd6 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 19 Dec 2024 
11:04:17 +0100 Subject: [PATCH 077/583] fast import: importer: use aws s3 cli (#10162) ## Problem s5cmd doesn't pick up the pod service account ``` 2024/12/16 16:26:01 Ignoring, HTTP credential provider invalid endpoint host, "169.254.170.23", only loopback hosts are allowed. ERROR "ls s3://neon-dev-bulk-import-us-east-2/import-pgdata/fast-import/v1/br-wandering-hall-w2xobawv": NoCredentialProviders: no valid providers in chain. Deprecated. For verbose messaging see aws.Config.CredentialsChainVerboseErrors ``` ## Summary of changes Switch to offical CLI. ## Testing Tested the pre-merge image in staging, using `job_image` override in project settings. https://neondb.slack.com/archives/C033RQ5SPDH/p1734554944391949?thread_ts=1734368383.258759&cid=C033RQ5SPDH ## Future Work Switch back to s5cmd once https://github.com/peak/s5cmd/pull/769 gets merged. ## Refs - fixes https://github.com/neondatabase/cloud/issues/21876 --------- Co-authored-by: Gleb Novikov --- compute/compute-node.Dockerfile | 24 ++++++++++--------- compute_tools/src/bin/fast_import.rs | 10 ++++---- .../fast_import/{s5cmd.rs => aws_s3_sync.rs} | 13 ++++------ 3 files changed, 23 insertions(+), 24 deletions(-) rename compute_tools/src/bin/fast_import/{s5cmd.rs => aws_s3_sync.rs} (50%) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 9f1f3b7343..5e7b4e8287 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1556,28 +1556,30 @@ RUN apt update && \ locales \ procps \ ca-certificates \ + curl \ + unzip \ $VERSION_INSTALLS && \ apt clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8 -# s5cmd 2.2.2 from https://github.com/peak/s5cmd/releases/tag/v2.2.2 -# used by fast_import +# aws cli is used by fast_import (curl and unzip above are at this time only used for this installation step) ARG TARGETARCH -ADD https://github.com/peak/s5cmd/releases/download/v2.2.2/s5cmd_2.2.2_linux_$TARGETARCH.deb /tmp/s5cmd.deb RUN set -ex; \ - \ - # Determine the expected checksum based on TARGETARCH if [ "${TARGETARCH}" = "amd64" ]; then \ - CHECKSUM="392c385320cd5ffa435759a95af77c215553d967e4b1c0fffe52e4f14c29cf85"; \ + TARGETARCH_ALT="x86_64"; \ + CHECKSUM="c9a9df3770a3ff9259cb469b6179e02829687a464e0824d5c32d378820b53a00"; \ elif [ "${TARGETARCH}" = "arm64" ]; then \ - CHECKSUM="939bee3cf4b5604ddb00e67f8c157b91d7c7a5b553d1fbb6890fad32894b7b46"; \ + TARGETARCH_ALT="aarch64"; \ + CHECKSUM="8181730be7891582b38b028112e81b4899ca817e8c616aad807c9e9d1289223a"; \ else \ echo "Unsupported architecture: ${TARGETARCH}"; exit 1; \ fi; \ - \ - # Compute and validate the checksum - echo "${CHECKSUM} /tmp/s5cmd.deb" | sha256sum -c - -RUN dpkg -i /tmp/s5cmd.deb && rm /tmp/s5cmd.deb + curl -L "https://awscli.amazonaws.com/awscli-exe-linux-${TARGETARCH_ALT}-2.17.5.zip" -o /tmp/awscliv2.zip; \ + echo "${CHECKSUM} /tmp/awscliv2.zip" | sha256sum -c -; \ + unzip /tmp/awscliv2.zip -d /tmp/awscliv2; \ + /tmp/awscliv2/aws/install; \ + rm -rf /tmp/awscliv2.zip /tmp/awscliv2; \ + true ENV LANG=en_US.utf8 USER postgres diff --git a/compute_tools/src/bin/fast_import.rs b/compute_tools/src/bin/fast_import.rs index b6db3eb11a..793ec4cf10 100644 --- a/compute_tools/src/bin/fast_import.rs +++ b/compute_tools/src/bin/fast_import.rs @@ -34,12 +34,12 @@ use nix::unistd::Pid; use tracing::{info, info_span, warn, Instrument}; use utils::fs_ext::is_directory_empty; +#[path = "fast_import/aws_s3_sync.rs"] +mod aws_s3_sync; #[path = 
"fast_import/child_stdio_to_log.rs"] mod child_stdio_to_log; #[path = "fast_import/s3_uri.rs"] mod s3_uri; -#[path = "fast_import/s5cmd.rs"] -mod s5cmd; #[derive(clap::Parser)] struct Args { @@ -326,7 +326,7 @@ pub(crate) async fn main() -> anyhow::Result<()> { } info!("upload pgdata"); - s5cmd::sync(Utf8Path::new(&pgdata_dir), &s3_prefix.append("/")) + aws_s3_sync::sync(Utf8Path::new(&pgdata_dir), &s3_prefix.append("/pgdata/")) .await .context("sync dump directory to destination")?; @@ -334,10 +334,10 @@ pub(crate) async fn main() -> anyhow::Result<()> { { let status_dir = working_directory.join("status"); std::fs::create_dir(&status_dir).context("create status directory")?; - let status_file = status_dir.join("status"); + let status_file = status_dir.join("pgdata"); std::fs::write(&status_file, serde_json::json!({"done": true}).to_string()) .context("write status file")?; - s5cmd::sync(&status_file, &s3_prefix.append("/status/pgdata")) + aws_s3_sync::sync(&status_dir, &s3_prefix.append("/status/")) .await .context("sync status directory to destination")?; } diff --git a/compute_tools/src/bin/fast_import/s5cmd.rs b/compute_tools/src/bin/fast_import/aws_s3_sync.rs similarity index 50% rename from compute_tools/src/bin/fast_import/s5cmd.rs rename to compute_tools/src/bin/fast_import/aws_s3_sync.rs index d2d9a79736..5fa58c8f87 100644 --- a/compute_tools/src/bin/fast_import/s5cmd.rs +++ b/compute_tools/src/bin/fast_import/aws_s3_sync.rs @@ -4,24 +4,21 @@ use camino::Utf8Path; use super::s3_uri::S3Uri; pub(crate) async fn sync(local: &Utf8Path, remote: &S3Uri) -> anyhow::Result<()> { - let mut builder = tokio::process::Command::new("s5cmd"); - // s5cmd uses aws-sdk-go v1, hence doesn't support AWS_ENDPOINT_URL - if let Some(val) = std::env::var_os("AWS_ENDPOINT_URL") { - builder.arg("--endpoint-url").arg(val); - } + let mut builder = tokio::process::Command::new("aws"); builder + .arg("s3") .arg("sync") .arg(local.as_str()) .arg(remote.to_string()); let st = builder .spawn() - .context("spawn s5cmd")? + .context("spawn aws s3 sync")? .wait() .await - .context("wait for s5cmd")?; + .context("wait for aws s3 sync")?; if st.success() { Ok(()) } else { - Err(anyhow::anyhow!("s5cmd failed")) + Err(anyhow::anyhow!("aws s3 sync failed")) } } From 43dc03459d42ec4c7ca028e48f5a0d8994ecf983 Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Thu, 19 Dec 2024 11:25:44 +0100 Subject: [PATCH 078/583] Run pgbench on 10 GB scale factor on database with n relations (e.g. 10k) (#10172) ## Problem We want to verify how much / if pgbench throughput and latency on Neon suffers if the database contains many other relations, too. ## Summary of changes Modify the benchmarking.yml pgbench-compare job to - create an addiitional project at scale factor 10 GiB - before running pgbench add n tables (initially 10k) to the database - then compare the pgbench throughput and latency to the existing pgbench-compare at 10 Gib scale factor We use a realistic template for the n relations that is a partitioned table with some realistic data types, indexes and constraints - similar to a table that we use internally. 
Example run: https://github.com/neondatabase/neon/actions/runs/12377565956/job/34547386959 --- .github/workflows/benchmarking.yml | 25 ++- .../many_relations/create_many_relations.sql | 199 ++++++++++++++++++ .../performance/test_perf_many_relations.py | 66 ++++++ 3 files changed, 288 insertions(+), 2 deletions(-) create mode 100644 test_runner/performance/many_relations/create_many_relations.sql create mode 100644 test_runner/performance/test_perf_many_relations.py diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index bbdcf5ef49..ab0f2a6155 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -308,6 +308,7 @@ jobs: "image": [ "'"$image_default"'" ], "include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_default"', "image": "'"$image_default"'" }, { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, + { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new-many-tables","db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned-bookworm" }, { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb","runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned-bookworm" }, @@ -410,7 +411,7 @@ jobs: aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create Neon Project - if: contains(fromJson('["neonvm-captest-new", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform) + if: contains(fromJson('["neonvm-captest-new", "neonvm-captest-new-many-tables", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform) id: create-neon-project uses: ./.github/actions/neon-project-create with: @@ -429,7 +430,7 @@ jobs: neonvm-captest-sharding-reuse) CONNSTR=${{ secrets.BENCHMARK_CAPTEST_SHARDING_CONNSTR }} ;; - neonvm-captest-new | neonvm-captest-freetier | neonvm-azure-captest-new | neonvm-azure-captest-freetier) + neonvm-captest-new | neonvm-captest-new-many-tables | neonvm-captest-freetier | neonvm-azure-captest-new | neonvm-azure-captest-freetier) CONNSTR=${{ steps.create-neon-project.outputs.dsn }} ;; rds-aurora) @@ -446,6 +447,26 @@ jobs: echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT + # we want to compare Neon project OLTP throughput and latency at scale factor 10 GB + # without (neonvm-captest-new) + # and with (neonvm-captest-new-many-tables) many relations in the database + - name: Create many relations before the run + if: contains(fromJson('["neonvm-captest-new-many-tables"]'), matrix.platform) + uses: ./.github/actions/run-python-test-set + with: + build_type: ${{ env.BUILD_TYPE }} + test_selection: performance + run_in_parallel: false + save_perf_report: ${{ env.SAVE_PERF_REPORT }} + extra_params: -m remote_cluster --timeout 21600 -k test_perf_many_relations + pg_version: ${{ env.DEFAULT_PG_VERSION }} + aws-oicd-role-arn: ${{ 
vars.DEV_AWS_OIDC_ROLE_ARN }} + env: + BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} + VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" + PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" + TEST_NUM_RELATIONS: 10000 + - name: Benchmark init uses: ./.github/actions/run-python-test-set with: diff --git a/test_runner/performance/many_relations/create_many_relations.sql b/test_runner/performance/many_relations/create_many_relations.sql new file mode 100644 index 0000000000..1b3673c9e1 --- /dev/null +++ b/test_runner/performance/many_relations/create_many_relations.sql @@ -0,0 +1,199 @@ +-- create a schema that simulates Neon control plane operations table +-- however use partitioned operations tables with many (e.g. 500) child partition tables per table +-- in summary we create multiple of these partitioned operations tables (with 500 childs each) - until we reach the requested number of tables + + +-- first we need some other tables that can be referenced by the operations table + +-- Table for branches +CREATE TABLE public.branches ( + id text PRIMARY KEY +); + +-- Table for endpoints +CREATE TABLE public.endpoints ( + id text PRIMARY KEY +); + +-- Table for projects +CREATE TABLE public.projects ( + id text PRIMARY KEY +); + +INSERT INTO public.branches (id) +VALUES ('branch_1'); + +-- Insert one row into endpoints +INSERT INTO public.endpoints (id) +VALUES ('endpoint_1'); + +-- Insert one row into projects +INSERT INTO public.projects (id) +VALUES ('project_1'); + +-- now we create a procedure that can create n operations tables +-- we do that in a procedure to save roundtrip latency when scaling the test to many tables +-- prefix is the base table name, e.g. 'operations_scale_1000' if we create 1000 tables +CREATE OR REPLACE PROCEDURE create_partitioned_tables(prefix text, n INT) +LANGUAGE plpgsql AS $$ +DECLARE + table_name TEXT; -- Variable to hold table names dynamically + i INT; -- Counter for the loop +BEGIN + -- Loop to create n partitioned tables + FOR i IN 1..n LOOP + table_name := format('%s_%s', prefix, i); + + -- Create the partitioned table + EXECUTE format( + 'CREATE TABLE public.%s ( + project_id character varying NOT NULL, + id uuid NOT NULL, + status integer, + action character varying NOT NULL, + error character varying, + created_at timestamp with time zone NOT NULL DEFAULT now(), + updated_at timestamp with time zone NOT NULL DEFAULT now(), + spec jsonb, + retry_at timestamp with time zone, + failures_count integer DEFAULT 0, + metadata jsonb NOT NULL DEFAULT ''{}''::jsonb, + executor_id text NOT NULL, + attempt_duration_ms integer, + metrics jsonb DEFAULT ''{}''::jsonb, + branch_id text, + endpoint_id text, + next_operation_id uuid, + compute_id text, + connection_attempt_at timestamp with time zone, + concurrency_key text, + queue_id text, + CONSTRAINT %s_pkey PRIMARY KEY (id, created_at), + CONSTRAINT %s_branch_id_fk FOREIGN KEY (branch_id) REFERENCES branches(id) ON DELETE CASCADE, + CONSTRAINT %s_endpoint_id_fk FOREIGN KEY (endpoint_id) REFERENCES endpoints(id) ON DELETE CASCADE, + CONSTRAINT %s_next_operation_id_fk FOREIGN KEY (next_operation_id, created_at) REFERENCES %s(id, created_at), + CONSTRAINT %s_project_id_fk FOREIGN KEY (project_id) REFERENCES projects(id) ON DELETE CASCADE + ) PARTITION BY RANGE (created_at)', + table_name, table_name, table_name, table_name, table_name, table_name, table_name + ); + + -- Add indexes for the partitioned table + EXECUTE format('CREATE INDEX index_%s_on_next_operation_id ON 
public.%s (next_operation_id)', table_name, table_name); + EXECUTE format('CREATE INDEX index_%s_on_project_id ON public.%s (project_id)', table_name, table_name); + EXECUTE format('CREATE INDEX %s_branch_id ON public.%s (branch_id)', table_name, table_name); + EXECUTE format('CREATE INDEX %s_branch_id_created_idx ON public.%s (branch_id, created_at)', table_name, table_name); + EXECUTE format('CREATE INDEX %s_created_at_idx ON public.%s (created_at)', table_name, table_name); + EXECUTE format('CREATE INDEX %s_created_at_project_id_id_cond_idx ON public.%s (created_at, project_id, id)', table_name, table_name); + EXECUTE format('CREATE INDEX %s_endpoint_id ON public.%s (endpoint_id)', table_name, table_name); + EXECUTE format( + 'CREATE INDEX %s_for_redo_worker_idx ON public.%s (executor_id) WHERE status <> 1', + table_name, table_name + ); + EXECUTE format( + 'CREATE INDEX %s_project_id_status_index ON public.%s ((project_id::text), status)', + table_name, table_name + ); + EXECUTE format( + 'CREATE INDEX %s_status_not_finished ON public.%s (status) WHERE status <> 1', + table_name, table_name + ); + EXECUTE format('CREATE INDEX %s_updated_at_desc_idx ON public.%s (updated_at DESC)', table_name, table_name); + EXECUTE format( + 'CREATE INDEX %s_with_failures ON public.%s (failures_count) WHERE failures_count > 0', + table_name, table_name + ); + END LOOP; +END; +$$; + +-- next we create a procedure that can add the child partitions (one per day) to each of the operations tables +CREATE OR REPLACE PROCEDURE create_operations_partitions( + table_name TEXT, + start_date DATE, + end_date DATE +) +LANGUAGE plpgsql AS $$ +DECLARE + partition_date DATE; + partition_name TEXT; + counter INT := 0; -- Counter to track the number of tables created in the current transaction +BEGIN + partition_date := start_date; + + -- Create partitions in batches + WHILE partition_date < end_date LOOP + partition_name := format('%s_%s', table_name, to_char(partition_date,'YYYY_MM_DD')); + + EXECUTE format( + 'CREATE TABLE IF NOT EXISTS public.%s PARTITION OF public.%s + FOR VALUES FROM (''%s'') TO (''%s'')', + partition_name, + table_name, + partition_date, + partition_date + INTERVAL '1 day' + ); + + counter := counter + 1; + + -- Commit and reset counter after every 100 partitions + IF counter >= 100 THEN + COMMIT; + counter := 0; -- Reset the counter + END IF; + + -- Advance to the next day + partition_date := partition_date + INTERVAL '1 day'; + END LOOP; + + -- Final commit for remaining partitions + IF counter > 0 THEN + COMMIT; + END IF; + + -- Insert synthetic rows into each partition + EXECUTE format( + 'INSERT INTO %I ( + project_id, + branch_id, + endpoint_id, + id, + status, + action, + created_at, + updated_at, + spec, + metadata, + executor_id, + failures_count + ) + SELECT + ''project_1'', -- project_id + ''branch_1'', -- branch_id + ''endpoint_1'', -- endpoint_id + ''e8bba687-0df9-4291-bfcd-7d5f6aa7c158'', -- unique id + 1, -- status + ''SYNTHETIC_ACTION'', -- action + gs::timestamp + interval ''0 ms'', -- created_at + gs::timestamp + interval ''1 minute'', -- updated_at + ''{"key": "value"}'', -- spec (JSONB) + ''{"metadata_key": "metadata_value"}'', -- metadata (JSONB) + ''executor_1'', -- executor_id + 0 -- failures_count + FROM generate_series(%L, %L::DATE - INTERVAL ''1 day'', INTERVAL ''1 day'') AS gs', + table_name, start_date, end_date + ); + + -- Commit the inserted rows + COMMIT; +END; +$$; + +-- we can now create partitioned tables using something like +-- CALL 
create_partitioned_tables('operations_scale_1000' ,10); + +-- and we can create the child partitions for a table using something like +-- CALL create_operations_partitions( +-- 'operations_scale_1000_1', +-- '2000-01-01', -- Start date +-- ('2000-01-01'::DATE + INTERVAL '1 day' * 500)::DATE -- End date (start date + number of days) +-- ); diff --git a/test_runner/performance/test_perf_many_relations.py b/test_runner/performance/test_perf_many_relations.py new file mode 100644 index 0000000000..0ee0efe8b9 --- /dev/null +++ b/test_runner/performance/test_perf_many_relations.py @@ -0,0 +1,66 @@ +import os +from pathlib import Path + +import pytest +from fixtures.compare_fixtures import RemoteCompare +from fixtures.log_helper import log + + +def get_num_relations(default: int = 1000) -> list[int]: + # We parametrize each run with scale specifying the number of wanted child partitions. + # Databases are pre-created and passed through BENCHMARK_CONNSTR env variable. + scales = os.getenv("TEST_NUM_RELATIONS", default=str(default)) + rv = [] + for s in scales.split(","): + scale = int(s) + rv.append(scale) + return rv + + +@pytest.mark.parametrize("num_relations", get_num_relations()) +@pytest.mark.remote_cluster +def test_perf_many_relations(remote_compare: RemoteCompare, num_relations: int): + """ + Test creating many relations in a single database. + We use partitioned tables with child tables, indexes and constraints to have a realistic schema. + Also we include some common data types like text, uuid, timestamp, JSONB, etc. + + see many_relations/create_many_relations.sql + """ + env = remote_compare + + # prepare some base tables and the plpgsql procedures that we use to create the tables + sql_file = Path(__file__).parent / "many_relations" / "create_many_relations.sql" + env.pg_bin.run_capture(["psql", env.pg.connstr(), "-f", str(sql_file)]) + + num_parent_tables = num_relations // 500 + 1 + log.info(f"Creating {num_relations} relations in {num_parent_tables} parent tables") + + log.info(f"Creating {num_parent_tables} parent tables") + sql = f"CALL create_partitioned_tables('operations_scale_{num_relations}', {num_parent_tables})" + log.info(sql) + env.pg_bin.run_capture(["psql", env.pg.connstr(), "-c", sql]) + + current_table = 0 + num_relations_remaining = num_relations + + # now run and measure the actual relation creation + while num_relations_remaining > 0: + current_table += 1 + parent_table_name = f"operations_scale_{num_relations}_{current_table}" + if num_relations_remaining > 500: + num_relations_to_create = 500 + else: + num_relations_to_create = num_relations_remaining + num_relations_remaining -= num_relations_to_create + log.info( + f"Creating {num_relations_to_create} child tables in partitioned parent table '{parent_table_name}'" + ) + sql = f"CALL create_operations_partitions( '{parent_table_name}', '2000-01-01', ('2000-01-01'::DATE + INTERVAL '1 day' * {num_relations_to_create})::DATE)" + log.info(sql) + with env.zenbenchmark.record_duration( + f"CREATE_TABLE/{current_table}/{num_relations_to_create}" + ): + env.pg_bin.run_capture( + ["psql", env.pg.connstr(options="-cstatement_timeout=1000s "), "-c", sql] + ) From b135194090369d8e5452c9ee1c6e7c37cc9ba8bd Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Thu, 19 Dec 2024 11:37:08 +0100 Subject: [PATCH 079/583] proxy: Delay SASL complete message until auth is done (#10189) The final SASL complete message can be bundled with the remainder of the auth flow messages until ReadyForQuery. 
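The change is essentially "queue now, flush once at the end of the auth flow". As a rough, self-contained sketch of that pattern (a plain tokio buffered writer with made-up message strings, not the proxy's actual `SaslStream`/`ServerMessage` types):

```
use tokio::io::{AsyncWrite, AsyncWriteExt, BufWriter};

async fn finish_auth<W: AsyncWrite + Unpin>(stream: &mut BufWriter<W>) -> std::io::Result<()> {
    // Previously the SASL "complete" message was written and flushed on its own.
    // Now it is only buffered here...
    stream.write_all(b"SASL complete (buffered, not flushed)\n").await?;
    // ...and leaves together with the rest of the auth flow messages...
    stream.write_all(b"AuthenticationOk ... ReadyForQuery\n").await?;
    // ...in a single flush at the end.
    stream.flush().await
}

#[tokio::main]
async fn main() -> std::io::Result<()> {
    let mut out = BufWriter::new(tokio::io::sink());
    finish_auth(&mut out).await
}
```

The effect is simply one fewer flush (and typically one fewer packet) per SCRAM handshake; the actual proxy change follows in the diff below.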
neondatabase/cloud#19184 --- proxy/src/auth/backend/mod.rs | 3 +++ proxy/src/sasl/stream.rs | 8 +++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs index 50cb94bfa0..0c9a7f7825 100644 --- a/proxy/src/auth/backend/mod.rs +++ b/proxy/src/auth/backend/mod.rs @@ -678,6 +678,9 @@ mod tests { .await .unwrap(); + // flush the final server message + stream.flush().await.unwrap(); + handle.await.unwrap(); } diff --git a/proxy/src/sasl/stream.rs b/proxy/src/sasl/stream.rs index f1c916daa2..ac77556566 100644 --- a/proxy/src/sasl/stream.rs +++ b/proxy/src/sasl/stream.rs @@ -50,6 +50,12 @@ impl SaslStream<'_, S> { self.stream.write_message(&msg.to_reply()).await?; Ok(()) } + + // Queue a SASL message for the client. + fn send_noflush(&mut self, msg: &ServerMessage<&str>) -> io::Result<()> { + self.stream.write_message_noflush(&msg.to_reply())?; + Ok(()) + } } /// SASL authentication outcome. @@ -85,7 +91,7 @@ impl SaslStream<'_, S> { continue; } Step::Success(result, reply) => { - self.send(&ServerMessage::Final(&reply)).await?; + self.send_noflush(&ServerMessage::Final(&reply))?; Outcome::Success(result) } Step::Failure(reason) => Outcome::Failure(reason), From 65042cbadd0426c43499bb7675e671b5c6e980e9 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 19 Dec 2024 10:58:49 +0000 Subject: [PATCH 080/583] tests: use high IO concurrency in `test_pgdata_import_smoke`, use `effective_io_concurrency=2` in tests by default (#10114) ## Problem `test_pgdata_import_smoke` writes two gigabytes of pages and then reads them back serially. This is CPU bottlenecked and results in a long runtime, and sensitivity to CPU load from other tests on the same machine. Closes: https://github.com/neondatabase/neon/issues/10071 ## Summary of changes - Use effective_io_concurrency=32 when doing sequential scans through 2GiB of pages in test_pgdata_import_smoke. This is a ~10x runtime decrease in the parts of the test that do sequential scans. - Also set `effective_io_concurrency=2` for tests, as I noticed while debugging that we were doing all getpage requests serially, which is bad for checking the stability of the batching code. --- control_plane/src/endpoint.rs | 4 ++++ test_runner/regress/test_import_pgdata.py | 14 +++++++++++--- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 1fdf326051..5ebf842813 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -316,6 +316,10 @@ impl Endpoint { // and can cause errors like 'no unpinned buffers available', see // conf.append("shared_buffers", "1MB"); + // Postgres defaults to effective_io_concurrency=1, which does not exercise the pageserver's + // batching logic. 
Set this to 2 so that we exercise the code a bit without letting + // individual tests do a lot of concurrent work on underpowered test machines + conf.append("effective_io_concurrency", "2"); conf.append("fsync", "off"); conf.append("max_connections", "100"); conf.append("wal_level", "logical"); diff --git a/test_runner/regress/test_import_pgdata.py b/test_runner/regress/test_import_pgdata.py index 29229b73c1..6ea2393a9d 100644 --- a/test_runner/regress/test_import_pgdata.py +++ b/test_runner/regress/test_import_pgdata.py @@ -84,6 +84,8 @@ def test_pgdata_import_smoke( elif rel_block_size == RelBlockSize.TWO_STRPES_PER_SHARD: target_relblock_size = (shard_count or 1) * stripe_size * 8192 * 2 elif rel_block_size == RelBlockSize.MULTIPLE_RELATION_SEGMENTS: + # Postgres uses a 1GiB segment size, fixed at compile time, so we must use >2GB of data + # to exercise multiple segments. target_relblock_size = int(((2.333 * 1024 * 1024 * 1024) // 8192) * 8192) else: raise ValueError @@ -111,9 +113,15 @@ def test_pgdata_import_smoke( def validate_vanilla_equivalence(ep): # TODO: would be nicer to just compare pgdump - assert ep.safe_psql("select count(*), sum(data::bigint)::bigint from t") == [ - (expect_nrows, expect_sum) - ] + + # Enable IO concurrency for batching on large sequential scan, to avoid making + # this test unnecessarily onerous on CPU + assert ep.safe_psql_many( + [ + "set effective_io_concurrency=32;", + "select count(*), sum(data::bigint)::bigint from t", + ] + ) == [[], [(expect_nrows, expect_sum)]] validate_vanilla_equivalence(vanilla_pg) From afda6d4700ca0b521ecb412f0bc81e80e5903dbd Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 19 Dec 2024 12:55:05 +0000 Subject: [PATCH 081/583] storage_scrubber: don't report half-created timelines as corruption (#10198) ## Problem test_timeline_archival_chaos does timeline creation with failure injection, and thereby sometimes leaves timelines in a part created state. This was being reported as corruption by the scrubber on test teardown, because it considered a layer without an index to be an invalid state. This was incorrect: the scrubber should accept this state, it occurs legitimately during timeline creation. Closes: https://github.com/neondatabase/neon/issues/9988 ## Summary of changes - Report a timeline with layers but no index as Relic rather than MissingIndexPart. - We retain the MissingIndexPart variant for the case where an index _was_ found in the listing, but was not found by a subsequent GET, i.e. racing with deletion. --- storage_scrubber/src/checks.rs | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs index f759f54d19..32c86052ef 100644 --- a/storage_scrubber/src/checks.rs +++ b/storage_scrubber/src/checks.rs @@ -310,7 +310,7 @@ pub(crate) enum BlobDataParseResult { index_part_generation: Generation, s3_layers: HashSet<(LayerName, Generation)>, }, - /// The remains of a deleted Timeline (i.e. an initdb archive only) + /// The remains of an uncleanly deleted Timeline or aborted timeline creation(e.g. an initdb archive only, or some layer without an index) Relic, Incorrect { errors: Vec, @@ -346,7 +346,7 @@ pub(crate) async fn list_timeline_blobs( match res { ListTimelineBlobsResult::Ready(data) => Ok(data), ListTimelineBlobsResult::MissingIndexPart(_) => { - // Retry if index is missing. + // Retry if listing raced with removal of an index let data = list_timeline_blobs_impl(remote_client, id, root_target) .await? 
.into_data(); @@ -358,7 +358,7 @@ pub(crate) async fn list_timeline_blobs( enum ListTimelineBlobsResult { /// Blob data is ready to be intepreted. Ready(RemoteTimelineBlobData), - /// List timeline blobs has layer files but is missing [`IndexPart`]. + /// The listing contained an index but when we tried to fetch it, we couldn't MissingIndexPart(RemoteTimelineBlobData), } @@ -467,19 +467,19 @@ async fn list_timeline_blobs_impl( match index_part_object.as_ref() { Some(selected) => index_part_keys.retain(|k| k != selected), None => { - // It is possible that the branch gets deleted after we got some layer files listed - // and we no longer have the index file in the listing. - errors.push( + // This case does not indicate corruption, but it should be very unusual. It can + // happen if: + // - timeline creation is in progress (first layer is written before index is written) + // - timeline deletion happened while a stale pageserver was still attached, it might upload + // a layer after the deletion is done. + tracing::info!( "S3 list response got no index_part.json file but still has layer files" - .to_string(), ); - return Ok(ListTimelineBlobsResult::MissingIndexPart( - RemoteTimelineBlobData { - blob_data: BlobDataParseResult::Incorrect { errors, s3_layers }, - unused_index_keys: index_part_keys, - unknown_keys, - }, - )); + return Ok(ListTimelineBlobsResult::Ready(RemoteTimelineBlobData { + blob_data: BlobDataParseResult::Relic, + unused_index_keys: index_part_keys, + unknown_keys, + })); } } From 502d512fe2ca9f07392428d70d5262cf3f5103e2 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 19 Dec 2024 14:04:42 +0000 Subject: [PATCH 082/583] safekeeper: lift benchmarking utils into safekeeper crate (#10200) ## Problem The benchmarking utilities are also useful for testing. We want to write tests in the safekeeper crate. ## Summary of changes This commit lifts the utils to the safekeeper crate. They are compiled if the benchmarking features is enabled or if in test mode. --- libs/postgres_ffi/src/wal_generator.rs | 6 ++-- safekeeper/Cargo.toml | 2 ++ safekeeper/benches/receive_wal.rs | 23 +++++++-------- safekeeper/src/lib.rs | 3 ++ .../benchutils.rs => src/test_utils.rs} | 28 ++++++++++--------- .../tests/walproposer_sim/walproposer_disk.rs | 2 +- 6 files changed, 36 insertions(+), 28 deletions(-) rename safekeeper/{benches/benchutils.rs => src/test_utils.rs} (78%) diff --git a/libs/postgres_ffi/src/wal_generator.rs b/libs/postgres_ffi/src/wal_generator.rs index 69cc4b771f..a72b035e17 100644 --- a/libs/postgres_ffi/src/wal_generator.rs +++ b/libs/postgres_ffi/src/wal_generator.rs @@ -106,11 +106,11 @@ impl WalGenerator { const TIMELINE_ID: u32 = 1; /// Creates a new WAL generator with the given record generator. - pub fn new(record_generator: R) -> WalGenerator { + pub fn new(record_generator: R, start_lsn: Lsn) -> WalGenerator { Self { record_generator, - lsn: Lsn(0), - prev_lsn: Lsn(0), + lsn: start_lsn, + prev_lsn: start_lsn, } } diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 086407603f..3ebb7097f2 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -9,6 +9,7 @@ default = [] # Enables test-only APIs, incuding failpoints. 
In particular, enables the `fail_point!` macro, # which adds some runtime cost to run tests on outage conditions testing = ["fail/failpoints"] +benchmarking = [] [dependencies] async-stream.workspace = true @@ -77,3 +78,4 @@ tracing-subscriber = { workspace = true, features = ["json"] } [[bench]] name = "receive_wal" harness = false +required-features = ["benchmarking"] diff --git a/safekeeper/benches/receive_wal.rs b/safekeeper/benches/receive_wal.rs index 313d945b94..996c4d9b8c 100644 --- a/safekeeper/benches/receive_wal.rs +++ b/safekeeper/benches/receive_wal.rs @@ -1,11 +1,7 @@ //! WAL ingestion benchmarks. -#[path = "benchutils.rs"] -mod benchutils; - use std::io::Write as _; -use benchutils::Env; use bytes::BytesMut; use camino_tempfile::tempfile; use criterion::{criterion_group, criterion_main, BatchSize, Bencher, Criterion}; @@ -16,6 +12,7 @@ use safekeeper::receive_wal::{self, WalAcceptor}; use safekeeper::safekeeper::{ AcceptorProposerMessage, AppendRequest, AppendRequestHeader, ProposerAcceptorMessage, }; +use safekeeper::test_utils::Env; use tokio::io::AsyncWriteExt as _; use utils::id::{NodeId, TenantTimelineId}; use utils::lsn::Lsn; @@ -76,12 +73,15 @@ fn bench_process_msg(c: &mut Criterion) { assert!(size >= prefixlen); let message = vec![0; size - prefixlen]; - let walgen = &mut WalGenerator::new(LogicalMessageGenerator::new(prefix, &message)); + let walgen = &mut WalGenerator::new(LogicalMessageGenerator::new(prefix, &message), Lsn(0)); // Set up the Safekeeper. let env = Env::new(fsync)?; - let mut safekeeper = - runtime.block_on(env.make_safekeeper(NodeId(1), TenantTimelineId::generate()))?; + let mut safekeeper = runtime.block_on(env.make_safekeeper( + NodeId(1), + TenantTimelineId::generate(), + Lsn(0), + ))?; b.iter_batched_ref( // Pre-construct WAL records and requests. Criterion will batch them. @@ -134,7 +134,8 @@ fn bench_wal_acceptor(c: &mut Criterion) { let runtime = tokio::runtime::Runtime::new()?; // needs multithreaded let env = Env::new(fsync)?; - let walgen = &mut WalGenerator::new(LogicalMessageGenerator::new(c"prefix", b"message")); + let walgen = + &mut WalGenerator::new(LogicalMessageGenerator::new(c"prefix", b"message"), Lsn(0)); // Create buffered channels that can fit all requests, to avoid blocking on channels. let (msg_tx, msg_rx) = tokio::sync::mpsc::channel(n); @@ -145,7 +146,7 @@ fn bench_wal_acceptor(c: &mut Criterion) { // TODO: WalAcceptor doesn't actually need a full timeline, only // Safekeeper::process_msg(). Consider decoupling them to simplify the setup. let tli = env - .make_timeline(NodeId(1), TenantTimelineId::generate()) + .make_timeline(NodeId(1), TenantTimelineId::generate(), Lsn(0)) .await? .wal_residence_guard() .await?; @@ -239,7 +240,7 @@ fn bench_wal_acceptor_throughput(c: &mut Criterion) { assert!(size >= prefixlen); let message = vec![0; size - prefixlen]; - let walgen = &mut WalGenerator::new(LogicalMessageGenerator::new(prefix, &message)); + let walgen = &mut WalGenerator::new(LogicalMessageGenerator::new(prefix, &message), Lsn(0)); // Construct and spawn the WalAcceptor task. let env = Env::new(fsync)?; @@ -249,7 +250,7 @@ fn bench_wal_acceptor_throughput(c: &mut Criterion) { runtime.block_on(async { let tli = env - .make_timeline(NodeId(1), TenantTimelineId::generate()) + .make_timeline(NodeId(1), TenantTimelineId::generate(), Lsn(0)) .await? 
.wal_residence_guard() .await?; diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index abe6e00a66..7acf355e6a 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -43,6 +43,9 @@ pub mod wal_reader_stream; pub mod wal_service; pub mod wal_storage; +#[cfg(any(test, feature = "benchmarking"))] +pub mod test_utils; + mod timelines_global_map; use std::sync::Arc; pub use timelines_global_map::GlobalTimelines; diff --git a/safekeeper/benches/benchutils.rs b/safekeeper/src/test_utils.rs similarity index 78% rename from safekeeper/benches/benchutils.rs rename to safekeeper/src/test_utils.rs index 48d796221b..c40a8bae5a 100644 --- a/safekeeper/benches/benchutils.rs +++ b/safekeeper/src/test_utils.rs @@ -1,18 +1,18 @@ use std::sync::Arc; +use crate::rate_limit::RateLimiter; +use crate::safekeeper::{ProposerAcceptorMessage, ProposerElected, SafeKeeper, TermHistory}; +use crate::state::{TimelinePersistentState, TimelineState}; +use crate::timeline::{get_timeline_dir, SharedState, StateSK, Timeline}; +use crate::timelines_set::TimelinesSet; +use crate::wal_backup::remote_timeline_path; +use crate::{control_file, wal_storage, SafeKeeperConf}; use camino_tempfile::Utf8TempDir; -use safekeeper::rate_limit::RateLimiter; -use safekeeper::safekeeper::{ProposerAcceptorMessage, ProposerElected, SafeKeeper, TermHistory}; -use safekeeper::state::{TimelinePersistentState, TimelineState}; -use safekeeper::timeline::{get_timeline_dir, SharedState, StateSK, Timeline}; -use safekeeper::timelines_set::TimelinesSet; -use safekeeper::wal_backup::remote_timeline_path; -use safekeeper::{control_file, wal_storage, SafeKeeperConf}; use tokio::fs::create_dir_all; use utils::id::{NodeId, TenantTimelineId}; use utils::lsn::Lsn; -/// A Safekeeper benchmarking environment. Uses a tempdir for storage, removed on drop. +/// A Safekeeper testing or benchmarking environment. Uses a tempdir for storage, removed on drop. pub struct Env { /// Whether to enable fsync. pub fsync: bool, @@ -21,7 +21,7 @@ pub struct Env { } impl Env { - /// Creates a new benchmarking environment in a temporary directory. fsync controls whether to + /// Creates a new test or benchmarking environment in a temporary directory. fsync controls whether to /// enable fsyncing. 
pub fn new(fsync: bool) -> anyhow::Result { let tempdir = camino_tempfile::tempdir()?; @@ -47,6 +47,7 @@ impl Env { &self, node_id: NodeId, ttid: TenantTimelineId, + start_lsn: Lsn, ) -> anyhow::Result> { let conf = self.make_conf(node_id); @@ -67,9 +68,9 @@ impl Env { safekeeper .process_msg(&ProposerAcceptorMessage::Elected(ProposerElected { term: 1, - start_streaming_at: Lsn(0), - term_history: TermHistory(vec![(1, Lsn(0)).into()]), - timeline_start_lsn: Lsn(0), + start_streaming_at: start_lsn, + term_history: TermHistory(vec![(1, start_lsn).into()]), + timeline_start_lsn: start_lsn, })) .await?; @@ -82,12 +83,13 @@ impl Env { &self, node_id: NodeId, ttid: TenantTimelineId, + start_lsn: Lsn, ) -> anyhow::Result> { let conf = Arc::new(self.make_conf(node_id)); let timeline_dir = get_timeline_dir(&conf, &ttid); let remote_path = remote_timeline_path(&ttid)?; - let safekeeper = self.make_safekeeper(node_id, ttid).await?; + let safekeeper = self.make_safekeeper(node_id, ttid, start_lsn).await?; let shared_state = SharedState::new(StateSK::Loaded(safekeeper)); let timeline = Timeline::new( diff --git a/safekeeper/tests/walproposer_sim/walproposer_disk.rs b/safekeeper/tests/walproposer_sim/walproposer_disk.rs index aefb3919a1..7dc7f48548 100644 --- a/safekeeper/tests/walproposer_sim/walproposer_disk.rs +++ b/safekeeper/tests/walproposer_sim/walproposer_disk.rs @@ -18,7 +18,7 @@ impl DiskWalProposer { internal_available_lsn: Lsn(0), prev_lsn: Lsn(0), disk: BlockStorage::new(), - wal_generator: WalGenerator::new(LogicalMessageGenerator::new(c"", &[])), + wal_generator: WalGenerator::new(LogicalMessageGenerator::new(c"", &[]), Lsn(0)), }), }) } From 628451d68ec30c0fb591e14a64f696b031d7ca88 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 19 Dec 2024 14:04:46 +0000 Subject: [PATCH 083/583] safekeeper: short-circuit interpreted wal sender (#10202) ## Problem Safekeeper may currently send a batch to the pageserver even if it hasn't decoded a new record. I think this is quite unlikely in the field, but worth adressing. ## Summary of changes Don't send anything if we haven't decoded a full record. Once this merges and releases, the `InterpretedWalRecords` struct can be updated to remove the Option wrapper for `next_record_lsn`. 
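In isolation, the guard amounts to "no fully decoded record, no batch". A standalone sketch of that control flow, with stand-in types instead of the real `Lsn`/`InterpretedWalRecord` (the actual change to `send_interpreted_wal.rs` is in the diff below):

```
// Stand-in types so the sketch compiles on its own.
type Lsn = u64;
struct Record {
    next_record_lsn: Lsn,
}

/// Build a batch only if at least one complete record was decoded.
fn build_batch(records: Vec<Record>) -> Option<(Vec<Record>, Lsn)> {
    // LSN of the last fully decoded record in this cycle, if any.
    let max_next_record_lsn = records.iter().map(|r| r.next_record_lsn).max();

    // Nothing fully decoded: skip this send cycle instead of emitting an empty batch.
    let max_next_record_lsn = max_next_record_lsn?;

    Some((records, max_next_record_lsn))
}

fn main() {
    assert!(build_batch(vec![]).is_none()); // nothing decoded -> nothing sent
    let batch = build_batch(vec![Record { next_record_lsn: 42 }]).unwrap();
    assert_eq!(batch.1, 42);
}
```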
--- safekeeper/src/send_interpreted_wal.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/safekeeper/src/send_interpreted_wal.rs b/safekeeper/src/send_interpreted_wal.rs index 2589030422..7d215176dd 100644 --- a/safekeeper/src/send_interpreted_wal.rs +++ b/safekeeper/src/send_interpreted_wal.rs @@ -94,9 +94,14 @@ impl InterpretedWalSender<'_, IO> { } } + let max_next_record_lsn = match max_next_record_lsn { + Some(lsn) => lsn, + None => { continue; } + }; + let batch = InterpretedWalRecords { records, - next_record_lsn: max_next_record_lsn + next_record_lsn: Some(max_next_record_lsn), }; tx.send(Batch {wal_end_lsn, available_wal_end_lsn, records: batch}).await.unwrap(); From 04517c6ff3db53b7145aff41a9de208648194a6d Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 19 Dec 2024 17:22:39 +0200 Subject: [PATCH 084/583] Do not reload config file on PS reconnect (#10204) ## Problem See https://github.com/neondatabase/neon/issues/10184 and https://neondb.slack.com/archives/C04DGM6SMTM/p1733997259898819 Reloading the config file inside a parallel worker causes its termination. ## Summary of changes Remove the call to `HandleMainLoopInterrupts()`. The page server URL update is propagated by the postmaster through shared memory, so we should not reload the config for it. Co-authored-by: Konstantin Knizhnik --- pgxn/neon/libpagestore.c | 1 - 1 file changed, 1 deletion(-) diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 6513ba4dd6..88d0a5292b 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -827,7 +827,6 @@ pageserver_send(shardno_t shard_no, NeonRequest *request) { while (!pageserver_connect(shard_no, shard->n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR)) { - HandleMainLoopInterrupts(); shard->n_reconnect_attempts += 1; } shard->n_reconnect_attempts = 0; From b89e02f3e8efccfb900685f2d6c1fe18d13cb956 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Thu, 19 Dec 2024 13:04:53 -0500 Subject: [PATCH 085/583] fix(pageserver): consider partial compaction layer map in layer check (#10044) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem In https://github.com/neondatabase/neon/pull/9897 we temporarily disabled the layer valid check because the current one only considers the end result of all compaction algorithms, but partial gc-compaction would temporarily produce an "invalid" layer map. Part of https://github.com/neondatabase/neon/issues/9114 ## Summary of changes Allow LSN splits to overlap in the slow path check. Currently, the valid check is only used in the storage scrubber (a background job) and during gc-compaction (without taking the layer lock). Therefore, it's fine for such checks to be a little bit inefficient but more accurate.
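The relaxed slow path boils down to a pairwise range-overlap test: a layer map is only rejected when two delta layers overlap in both key space and LSN space. A self-contained sketch of that predicate, using integer ranges as stand-ins for key/LSN ranges (the actual implementation over `LayerName`, reusing `pageserver_compaction::helpers::overlaps_with`, is in the diff below):

```
use std::ops::Range;

// Two half-open ranges overlap iff each one starts before the other ends.
fn overlaps_with<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {
    a.start < b.end && b.start < a.end
}

// Reject a layer map only if two delta layers overlap in *both* key and LSN space.
fn find_conflict(layers: &[(Range<u64>, Range<u64>)]) -> Option<(usize, usize)> {
    for (i, (key_a, lsn_a)) in layers.iter().enumerate() {
        for (j, (key_b, lsn_b)) in layers.iter().enumerate() {
            if i != j && overlaps_with(key_a, key_b) && overlaps_with(lsn_a, lsn_b) {
                return Some((i, j));
            }
        }
    }
    None
}

fn main() {
    // Stacked LSN ranges over the same keys are fine (what partial gc-compaction leaves behind)...
    assert!(find_conflict(&[(0..100, 0..10), (0..100, 10..20)]).is_none());
    // ...but overlapping in both dimensions is still flagged.
    assert!(find_conflict(&[(0..100, 0..15), (50..150, 10..20)]).is_some());
}
```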
--------- Signed-off-by: Alex Chi Z Co-authored-by: Arpad Müller --- pageserver/src/tenant/checks.rs | 47 +++++++++++++------ pageserver/src/tenant/timeline/compaction.rs | 48 ++++++++++++++++---- 2 files changed, 71 insertions(+), 24 deletions(-) diff --git a/pageserver/src/tenant/checks.rs b/pageserver/src/tenant/checks.rs index 1e8fa8d1d6..f98356242e 100644 --- a/pageserver/src/tenant/checks.rs +++ b/pageserver/src/tenant/checks.rs @@ -1,12 +1,15 @@ use std::collections::BTreeSet; use itertools::Itertools; +use pageserver_compaction::helpers::overlaps_with; use super::storage_layer::LayerName; /// Checks whether a layer map is valid (i.e., is a valid result of the current compaction algorithm if nothing goes wrong). /// -/// The function checks if we can split the LSN range of a delta layer only at the LSNs of the delta layers. For example, +/// The function implements a fast path check and a slow path check. +/// +/// The fast path checks if we can split the LSN range of a delta layer only at the LSNs of the delta layers. For example, /// /// ```plain /// | | | | @@ -25,31 +28,47 @@ use super::storage_layer::LayerName; /// | | | 4 | | | /// /// If layer 2 and 4 contain the same single key, this is also a valid layer map. +/// +/// However, if a partial compaction is still going on, it is possible that we get a layer map not satisfying the above condition. +/// Therefore, we fallback to simply check if any of the two delta layers overlap. (See "A slow path...") pub fn check_valid_layermap(metadata: &[LayerName]) -> Option { let mut lsn_split_point = BTreeSet::new(); // TODO: use a better data structure (range tree / range set?) let mut all_delta_layers = Vec::new(); for name in metadata { if let LayerName::Delta(layer) = name { - if layer.key_range.start.next() != layer.key_range.end { - all_delta_layers.push(layer.clone()); - } + all_delta_layers.push(layer.clone()); } } for layer in &all_delta_layers { - let lsn_range = &layer.lsn_range; - lsn_split_point.insert(lsn_range.start); - lsn_split_point.insert(lsn_range.end); + if layer.key_range.start.next() != layer.key_range.end { + let lsn_range = &layer.lsn_range; + lsn_split_point.insert(lsn_range.start); + lsn_split_point.insert(lsn_range.end); + } } - for layer in &all_delta_layers { + for (idx, layer) in all_delta_layers.iter().enumerate() { + if layer.key_range.start.next() == layer.key_range.end { + continue; + } let lsn_range = layer.lsn_range.clone(); let intersects = lsn_split_point.range(lsn_range).collect_vec(); if intersects.len() > 1 { - let err = format!( - "layer violates the layer map LSN split assumption: layer {} intersects with LSN [{}]", - layer, - intersects.into_iter().map(|lsn| lsn.to_string()).join(", ") - ); - return Some(err); + // A slow path to check if the layer intersects with any other delta layer. 
+ for (other_idx, other_layer) in all_delta_layers.iter().enumerate() { + if other_idx == idx { + // do not check self intersects with self + continue; + } + if overlaps_with(&layer.lsn_range, &other_layer.lsn_range) + && overlaps_with(&layer.key_range, &other_layer.key_range) + { + let err = format!( + "layer violates the layer map LSN split assumption: layer {} intersects with layer {}", + layer, other_layer + ); + return Some(err); + } + } } } None diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index a4e8f39522..01f2a5b170 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -29,6 +29,7 @@ use utils::id::TimelineId; use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}; use crate::page_cache; use crate::statvfs::Statvfs; +use crate::tenant::checks::check_valid_layermap; use crate::tenant::remote_timeline_client::WaitCompletionError; use crate::tenant::storage_layer::batch_split_writer::{ BatchWriterResult, SplitDeltaLayerWriter, SplitImageLayerWriter, @@ -2156,15 +2157,14 @@ impl Timeline { // Step 1: construct a k-merge iterator over all layers. // Also, verify if the layer map can be split by drawing a horizontal line at every LSN start/end split point. - // disable the check for now because we need to adjust the check for partial compactions, will enable later. - // let layer_names = job_desc - // .selected_layers - // .iter() - // .map(|layer| layer.layer_desc().layer_name()) - // .collect_vec(); - // if let Some(err) = check_valid_layermap(&layer_names) { - // warn!("gc-compaction layer map check failed because {}, this is normal if partial compaction is not finished yet", err); - // } + let layer_names = job_desc + .selected_layers + .iter() + .map(|layer| layer.layer_desc().layer_name()) + .collect_vec(); + if let Some(err) = check_valid_layermap(&layer_names) { + bail!("gc-compaction layer map check failed because {}, cannot proceed with compaction due to potential data loss", err); + } // The maximum LSN we are processing in this compaction loop let end_lsn = job_desc .selected_layers @@ -2546,8 +2546,36 @@ impl Timeline { ); // Step 3: Place back to the layer map. + + // First, do a sanity check to ensure the newly-created layer map does not contain overlaps. + let all_layers = { + let guard = self.layers.read().await; + let layer_map = guard.layer_map()?; + layer_map.iter_historic_layers().collect_vec() + }; + + let mut final_layers = all_layers + .iter() + .map(|layer| layer.layer_name()) + .collect::>(); + for layer in &layer_selection { + final_layers.remove(&layer.layer_desc().layer_name()); + } + for layer in &compact_to { + final_layers.insert(layer.layer_desc().layer_name()); + } + let final_layers = final_layers.into_iter().collect_vec(); + + // TODO: move this check before we call `finish` on image layer writers. However, this will require us to get the layer name before we finish + // the writer, so potentially, we will need a function like `ImageLayerBatchWriter::get_all_pending_layer_keys` to get all the keys that are + // in the writer before finalizing the persistent layers. Now we would leave some dangling layers on the disk if the check fails. 
+ if let Some(err) = check_valid_layermap(&final_layers) { + bail!("gc-compaction layer map check failed after compaction because {}, compaction result not applied to the layer map due to potential data loss", err); + } + + // Between the sanity check and this compaction update, there could be new layers being flushed, but it should be fine because we only + // operate on L1 layers. { - // TODO: sanity check if the layer map is valid (i.e., should not have overlaps) let mut guard = self.layers.write().await; guard .open_mut()? From 197a89ab3dee0ff90b060c92032a1a8e0b3213a8 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 19 Dec 2024 20:32:32 +0200 Subject: [PATCH 086/583] =?UTF-8?q?Increase=20default=20stotrage=20control?= =?UTF-8?q?ler=20heartbeat=20interval=20from=20100msec=20=E2=80=A6=20(#102?= =?UTF-8?q?06)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem Currently default value of storage controller heartbeat interval is 100msec. It means that 10 times per second it establish connection to PS. And it seems to be quite expensive. At MacOS right now storage_controller consumes 70% CPU and trusts - 30%. So together they completely utilize one core. A lot of us has Macs. Let's save environment a little bit and do not waste electricity and contribute to global warming. By the way, on prod we have interval 10seconds ## Summary of changes Increase heartbeat interval from 100msec to 1 second. Co-authored-by: Konstantin Knizhnik --- control_plane/src/local_env.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 489f9c8509..5b82acb3a5 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -180,7 +180,7 @@ impl NeonStorageControllerConf { const DEFAULT_MAX_WARMING_UP_INTERVAL: std::time::Duration = std::time::Duration::from_secs(30); // Very tight heartbeat interval to speed up tests - const DEFAULT_HEARTBEAT_INTERVAL: std::time::Duration = std::time::Duration::from_millis(100); + const DEFAULT_HEARTBEAT_INTERVAL: std::time::Duration = std::time::Duration::from_millis(1000); } impl Default for NeonStorageControllerConf { From 9c53b41245e3aecba30c2e05df4eeabe45fd39ac Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Thu, 19 Dec 2024 13:40:20 -0500 Subject: [PATCH 087/583] fix(pageserver): update remote latest_gc_cutoff after gc-compaction (#10209) ## Problem close https://github.com/neondatabase/neon/issues/10208 part of #9114 ## Summary of changes * Ensure remote `latest_gc_cutoff` is up-to-date before removing any files for gc-compaction. Signed-off-by: Alex Chi Z --- pageserver/src/tenant/timeline/compaction.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 01f2a5b170..94c65631b2 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -2581,6 +2581,13 @@ impl Timeline { .open_mut()? .finish_gc_compaction(&layer_selection, &compact_to, &self.metrics) }; + + // Schedule an index-only upload to update the `latest_gc_cutoff` in the index_part.json. + // Otherwise, after restart, the index_part only contains the old `latest_gc_cutoff` and + // find_gc_cutoffs will try accessing things below the cutoff. TODO: ideally, this should + // be batched into `schedule_compaction_update`. 
+ let disk_consistent_lsn = self.disk_consistent_lsn.load(); + self.schedule_uploads(disk_consistent_lsn, None)?; self.remote_client .schedule_compaction_update(&layer_selection, &compact_to)?; From f94248a5941dc0ba38ea8fc94ebc49016caef162 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Thu, 2 Jan 2025 09:35:28 +0000 Subject: [PATCH 088/583] chore(libs/proxy): refactor tokio-postgres connection control flow (#10247) In #10207 it was clear there was some confusion with the current connection logic. To analyse the flow to make sure there was no poll stalling, I ended up with the following refactor. Notable changes: 1. Now all functions called `poll_xyz` and that have a `cx: &mut Context` argument must return a `Poll<_>` type, and can only return `Pending` iff an internal poll call also returned `Pending` 2. State management is handled entirely by `poll_messages`. There are now only 2 states which makes it much easier to keep track of. Each commit should be self-reviewable and should be simple to verify that it keeps the same behaviour --- libs/proxy/tokio-postgres2/src/connection.rs | 147 ++++++++++--------- 1 file changed, 81 insertions(+), 66 deletions(-) diff --git a/libs/proxy/tokio-postgres2/src/connection.rs b/libs/proxy/tokio-postgres2/src/connection.rs index 0aa5c77e22..f478717e0d 100644 --- a/libs/proxy/tokio-postgres2/src/connection.rs +++ b/libs/proxy/tokio-postgres2/src/connection.rs @@ -33,10 +33,14 @@ pub struct Response { #[derive(PartialEq, Debug)] enum State { Active, - Terminating, Closing, } +enum WriteReady { + Terminating, + WaitingOnRead, +} + /// A connection to a PostgreSQL database. /// /// This is one half of what is returned when a new connection is established. It performs the actual IO with the @@ -51,7 +55,6 @@ pub struct Connection { /// HACK: we need this in the Neon Proxy to forward params. pub parameters: HashMap, receiver: mpsc::UnboundedReceiver, - pending_request: Option, pending_responses: VecDeque, responses: VecDeque, state: State, @@ -72,7 +75,6 @@ where stream, parameters, receiver, - pending_request: None, pending_responses, responses: VecDeque::new(), state: State::Active, @@ -93,26 +95,23 @@ where .map(|o| o.map(|r| r.map_err(Error::io))) } - fn poll_read(&mut self, cx: &mut Context<'_>) -> Result, Error> { - if self.state != State::Active { - trace!("poll_read: done"); - return Ok(None); - } - + /// Read and process messages from the connection to postgres. + /// client <- postgres + fn poll_read(&mut self, cx: &mut Context<'_>) -> Poll> { loop { let message = match self.poll_response(cx)? 
{ Poll::Ready(Some(message)) => message, - Poll::Ready(None) => return Err(Error::closed()), + Poll::Ready(None) => return Poll::Ready(Err(Error::closed())), Poll::Pending => { trace!("poll_read: waiting on response"); - return Ok(None); + return Poll::Pending; } }; let (mut messages, request_complete) = match message { BackendMessage::Async(Message::NoticeResponse(body)) => { let error = DbError::parse(&mut body.fields()).map_err(Error::parse)?; - return Ok(Some(AsyncMessage::Notice(error))); + return Poll::Ready(Ok(AsyncMessage::Notice(error))); } BackendMessage::Async(Message::NotificationResponse(body)) => { let notification = Notification { @@ -120,7 +119,7 @@ where channel: body.channel().map_err(Error::parse)?.to_string(), payload: body.message().map_err(Error::parse)?.to_string(), }; - return Ok(Some(AsyncMessage::Notification(notification))); + return Poll::Ready(Ok(AsyncMessage::Notification(notification))); } BackendMessage::Async(Message::ParameterStatus(body)) => { self.parameters.insert( @@ -139,8 +138,10 @@ where let mut response = match self.responses.pop_front() { Some(response) => response, None => match messages.next().map_err(Error::parse)? { - Some(Message::ErrorResponse(error)) => return Err(Error::db(error)), - _ => return Err(Error::unexpected_message()), + Some(Message::ErrorResponse(error)) => { + return Poll::Ready(Err(Error::db(error))) + } + _ => return Poll::Ready(Err(Error::unexpected_message())), }, }; @@ -164,18 +165,14 @@ where request_complete, }); trace!("poll_read: waiting on sender"); - return Ok(None); + return Poll::Pending; } } } } + /// Fetch the next client request and enqueue the response sender. fn poll_request(&mut self, cx: &mut Context<'_>) -> Poll> { - if let Some(messages) = self.pending_request.take() { - trace!("retrying pending request"); - return Poll::Ready(Some(messages)); - } - if self.receiver.is_closed() { return Poll::Ready(None); } @@ -193,74 +190,80 @@ where } } - fn poll_write(&mut self, cx: &mut Context<'_>) -> Result { + /// Process client requests and write them to the postgres connection, flushing if necessary. + /// client -> postgres + fn poll_write(&mut self, cx: &mut Context<'_>) -> Poll> { loop { - if self.state == State::Closing { - trace!("poll_write: done"); - return Ok(false); - } - if Pin::new(&mut self.stream) .poll_ready(cx) .map_err(Error::io)? .is_pending() { trace!("poll_write: waiting on socket"); - return Ok(false); + + // poll_ready is self-flushing. + return Poll::Pending; } - let request = match self.poll_request(cx) { - Poll::Ready(Some(request)) => request, - Poll::Ready(None) if self.responses.is_empty() && self.state == State::Active => { + match self.poll_request(cx) { + // send the message to postgres + Poll::Ready(Some(RequestMessages::Single(request))) => { + Pin::new(&mut self.stream) + .start_send(request) + .map_err(Error::io)?; + } + // No more messages from the client, and no more responses to wait for. 
+ // Send a terminate message to postgres + Poll::Ready(None) if self.responses.is_empty() => { trace!("poll_write: at eof, terminating"); - self.state = State::Terminating; let mut request = BytesMut::new(); frontend::terminate(&mut request); - RequestMessages::Single(FrontendMessage::Raw(request.freeze())) + let request = FrontendMessage::Raw(request.freeze()); + + Pin::new(&mut self.stream) + .start_send(request) + .map_err(Error::io)?; + + trace!("poll_write: sent eof, closing"); + trace!("poll_write: done"); + return Poll::Ready(Ok(WriteReady::Terminating)); } + // No more messages from the client, but there are still some responses to wait for. Poll::Ready(None) => { trace!( "poll_write: at eof, pending responses {}", self.responses.len() ); - return Ok(true); + ready!(self.poll_flush(cx))?; + return Poll::Ready(Ok(WriteReady::WaitingOnRead)); } + // Still waiting for a message from the client. Poll::Pending => { trace!("poll_write: waiting on request"); - return Ok(true); - } - }; - - match request { - RequestMessages::Single(request) => { - Pin::new(&mut self.stream) - .start_send(request) - .map_err(Error::io)?; - if self.state == State::Terminating { - trace!("poll_write: sent eof, closing"); - self.state = State::Closing; - } + ready!(self.poll_flush(cx))?; + return Poll::Pending; } } } } - fn poll_flush(&mut self, cx: &mut Context<'_>) -> Result<(), Error> { + fn poll_flush(&mut self, cx: &mut Context<'_>) -> Poll> { match Pin::new(&mut self.stream) .poll_flush(cx) .map_err(Error::io)? { - Poll::Ready(()) => trace!("poll_flush: flushed"), - Poll::Pending => trace!("poll_flush: waiting on socket"), + Poll::Ready(()) => { + trace!("poll_flush: flushed"); + Poll::Ready(Ok(())) + } + Poll::Pending => { + trace!("poll_flush: waiting on socket"); + Poll::Pending + } } - Ok(()) } fn poll_shutdown(&mut self, cx: &mut Context<'_>) -> Poll> { - if self.state != State::Closing { - return Poll::Pending; - } - match Pin::new(&mut self.stream) .poll_close(cx) .map_err(Error::io)? @@ -289,18 +292,30 @@ where &mut self, cx: &mut Context<'_>, ) -> Poll>> { - let message = self.poll_read(cx)?; - let want_flush = self.poll_write(cx)?; - if want_flush { - self.poll_flush(cx)?; + if self.state != State::Closing { + // if the state is still active, try read from and write to postgres. + let message = self.poll_read(cx)?; + let closing = self.poll_write(cx)?; + if let Poll::Ready(WriteReady::Terminating) = closing { + self.state = State::Closing; + } + + if let Poll::Ready(message) = message { + return Poll::Ready(Some(Ok(message))); + } + + // poll_read returned Pending. + // poll_write returned Pending or Ready(WriteReady::WaitingOnRead). + // if poll_write returned Ready(WriteReady::WaitingOnRead), then we are waiting to read more data from postgres. 
+ if self.state != State::Closing { + return Poll::Pending; + } } - match message { - Some(message) => Poll::Ready(Some(Ok(message))), - None => match self.poll_shutdown(cx) { - Poll::Ready(Ok(())) => Poll::Ready(None), - Poll::Ready(Err(e)) => Poll::Ready(Some(Err(e))), - Poll::Pending => Poll::Pending, - }, + + match self.poll_shutdown(cx) { + Poll::Ready(Ok(())) => Poll::Ready(None), + Poll::Ready(Err(e)) => Poll::Ready(Some(Err(e))), + Poll::Pending => Poll::Pending, } } } From 38c7a2abfc40bbbe2c952ed634aaab32fb100fcc Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Thu, 2 Jan 2025 09:36:13 +0000 Subject: [PATCH 089/583] chore(proxy): pre-load native tls certificates and propagate compute client config (#10182) Now that we construct the TLS client config for cancellation as well as connect, it feels appropriate to construct the same config once and re-use it elsewhere. It might also help should #7500 require any extra setup, so we can easily add it to all the appropriate call sites. --- proxy/src/auth/flow.rs | 2 +- proxy/src/bin/local_proxy.rs | 16 +- proxy/src/bin/pg_sni_router.rs | 2 +- proxy/src/bin/proxy.rs | 17 +- proxy/src/cancellation.rs | 77 +++-- proxy/src/compute.rs | 41 +-- proxy/src/config.rs | 288 +----------------- proxy/src/console_redirect_proxy.rs | 4 +- proxy/src/control_plane/mod.rs | 6 +- proxy/src/lib.rs | 2 +- proxy/src/proxy/connect_compute.rs | 28 +- proxy/src/proxy/handshake.rs | 3 +- proxy/src/proxy/mod.rs | 4 +- proxy/src/proxy/passthrough.rs | 13 +- proxy/src/proxy/tests/mod.rs | 91 +++--- proxy/src/redis/notifications.rs | 3 + proxy/src/scram/exchange.rs | 13 +- proxy/src/scram/mod.rs | 7 +- proxy/src/serverless/backend.rs | 14 +- proxy/src/serverless/websocket.rs | 2 +- proxy/src/stream.rs | 2 +- proxy/src/tls/client_config.rs | 42 +++ proxy/src/tls/mod.rs | 72 +++++ .../mod.rs => tls/postgres_rustls.rs} | 10 +- proxy/src/tls/server_config.rs | 218 +++++++++++++ 25 files changed, 509 insertions(+), 468 deletions(-) create mode 100644 proxy/src/tls/client_config.rs create mode 100644 proxy/src/tls/mod.rs rename proxy/src/{postgres_rustls/mod.rs => tls/postgres_rustls.rs} (96%) create mode 100644 proxy/src/tls/server_config.rs diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs index 60d1962d7f..0992c6d875 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -10,7 +10,6 @@ use tracing::info; use super::backend::ComputeCredentialKeys; use super::{AuthError, PasswordHackPayload}; -use crate::config::TlsServerEndPoint; use crate::context::RequestContext; use crate::control_plane::AuthSecret; use crate::intern::EndpointIdInt; @@ -18,6 +17,7 @@ use crate::sasl; use crate::scram::threadpool::ThreadPool; use crate::scram::{self}; use crate::stream::{PqStream, Stream}; +use crate::tls::TlsServerEndPoint; /// Every authentication selector is supposed to implement this trait. 
pub(crate) trait AuthMethod { diff --git a/proxy/src/bin/local_proxy.rs b/proxy/src/bin/local_proxy.rs index 56bbd94850..644f670f88 100644 --- a/proxy/src/bin/local_proxy.rs +++ b/proxy/src/bin/local_proxy.rs @@ -13,7 +13,9 @@ use proxy::auth::backend::jwt::JwkCache; use proxy::auth::backend::local::{LocalBackend, JWKS_ROLE_MAP}; use proxy::auth::{self}; use proxy::cancellation::CancellationHandlerMain; -use proxy::config::{self, AuthenticationConfig, HttpConfig, ProxyConfig, RetryConfig}; +use proxy::config::{ + self, AuthenticationConfig, ComputeConfig, HttpConfig, ProxyConfig, RetryConfig, +}; use proxy::control_plane::locks::ApiLocks; use proxy::control_plane::messages::{EndpointJwksResponse, JwksSettings}; use proxy::http::health_server::AppMetrics; @@ -25,6 +27,7 @@ use proxy::rate_limiter::{ use proxy::scram::threadpool::ThreadPool; use proxy::serverless::cancel_set::CancelSet; use proxy::serverless::{self, GlobalConnPoolOptions}; +use proxy::tls::client_config::compute_client_config_with_root_certs; use proxy::types::RoleName; use proxy::url::ApiUrl; @@ -209,6 +212,7 @@ async fn main() -> anyhow::Result<()> { http_listener, shutdown.clone(), Arc::new(CancellationHandlerMain::new( + &config.connect_to_compute, Arc::new(DashMap::new()), None, proxy::metrics::CancellationSource::Local, @@ -268,6 +272,12 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig max_response_size_bytes: args.sql_over_http.sql_over_http_max_response_size_bytes, }; + let compute_config = ComputeConfig { + retry: RetryConfig::parse(RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES)?, + tls: Arc::new(compute_client_config_with_root_certs()?), + timeout: Duration::from_secs(2), + }; + Ok(Box::leak(Box::new(ProxyConfig { tls_config: None, metric_collection: None, @@ -289,9 +299,7 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig region: "local".into(), wake_compute_retry_config: RetryConfig::parse(RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)?, connect_compute_locks, - connect_to_compute_retry_config: RetryConfig::parse( - RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES, - )?, + connect_to_compute: compute_config, }))) } diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index 9538384b9e..97d870a83a 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -10,12 +10,12 @@ use clap::Arg; use futures::future::Either; use futures::TryFutureExt; use itertools::Itertools; -use proxy::config::TlsServerEndPoint; use proxy::context::RequestContext; use proxy::metrics::{Metrics, ThreadPoolMetrics}; use proxy::protocol2::ConnectionInfo; use proxy::proxy::{copy_bidirectional_client_compute, run_until_cancelled, ErrorSource}; use proxy::stream::{PqStream, Stream}; +use proxy::tls::TlsServerEndPoint; use rustls::crypto::ring; use rustls::pki_types::PrivateKeyDer; use tokio::io::{AsyncRead, AsyncWrite}; diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 3dcf9ca060..3b122d771c 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -1,6 +1,7 @@ use std::net::SocketAddr; use std::pin::pin; use std::sync::Arc; +use std::time::Duration; use anyhow::bail; use futures::future::Either; @@ -8,7 +9,7 @@ use proxy::auth::backend::jwt::JwkCache; use proxy::auth::backend::{AuthRateLimiter, ConsoleRedirectBackend, MaybeOwned}; use proxy::cancellation::{CancelMap, CancellationHandler}; use proxy::config::{ - self, remote_storage_from_toml, AuthenticationConfig, CacheOptions, HttpConfig, + self, 
remote_storage_from_toml, AuthenticationConfig, CacheOptions, ComputeConfig, HttpConfig, ProjectInfoCacheOptions, ProxyConfig, ProxyProtocolV2, }; use proxy::context::parquet::ParquetUploadArgs; @@ -23,6 +24,7 @@ use proxy::redis::{elasticache, notifications}; use proxy::scram::threadpool::ThreadPool; use proxy::serverless::cancel_set::CancelSet; use proxy::serverless::GlobalConnPoolOptions; +use proxy::tls::client_config::compute_client_config_with_root_certs; use proxy::{auth, control_plane, http, serverless, usage_metrics}; use remote_storage::RemoteStorageConfig; use tokio::net::TcpListener; @@ -397,6 +399,7 @@ async fn main() -> anyhow::Result<()> { let cancellation_handler = Arc::new(CancellationHandler::< Option>>, >::new( + &config.connect_to_compute, cancel_map.clone(), redis_publisher, proxy::metrics::CancellationSource::FromClient, @@ -492,6 +495,7 @@ async fn main() -> anyhow::Result<()> { let cache = api.caches.project_info.clone(); if let Some(client) = client1 { maintenance_tasks.spawn(notifications::task_main( + config, client, cache.clone(), cancel_map.clone(), @@ -500,6 +504,7 @@ async fn main() -> anyhow::Result<()> { } if let Some(client) = client2 { maintenance_tasks.spawn(notifications::task_main( + config, client, cache.clone(), cancel_map.clone(), @@ -632,6 +637,12 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { console_redirect_confirmation_timeout: args.webauth_confirmation_timeout, }; + let compute_config = ComputeConfig { + retry: config::RetryConfig::parse(&args.connect_to_compute_retry)?, + tls: Arc::new(compute_client_config_with_root_certs()?), + timeout: Duration::from_secs(2), + }; + let config = ProxyConfig { tls_config, metric_collection, @@ -642,9 +653,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { region: args.region.clone(), wake_compute_retry_config: config::RetryConfig::parse(&args.wake_compute_retry)?, connect_compute_locks, - connect_to_compute_retry_config: config::RetryConfig::parse( - &args.connect_to_compute_retry, - )?, + connect_to_compute: compute_config, }; let config = Box::leak(Box::new(config)); diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index ebaea173ae..df618cf242 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -3,11 +3,9 @@ use std::sync::Arc; use dashmap::DashMap; use ipnet::{IpNet, Ipv4Net, Ipv6Net}; -use once_cell::sync::OnceCell; use postgres_client::tls::MakeTlsConnect; use postgres_client::CancelToken; use pq_proto::CancelKeyData; -use rustls::crypto::ring; use thiserror::Error; use tokio::net::TcpStream; use tokio::sync::Mutex; @@ -15,15 +13,15 @@ use tracing::{debug, info}; use uuid::Uuid; use crate::auth::{check_peer_addr_is_in_list, IpPattern}; -use crate::compute::load_certs; +use crate::config::ComputeConfig; use crate::error::ReportableError; use crate::ext::LockExt; use crate::metrics::{CancellationRequest, CancellationSource, Metrics}; -use crate::postgres_rustls::MakeRustlsConnect; use crate::rate_limiter::LeakyBucketRateLimiter; use crate::redis::cancellation_publisher::{ CancellationPublisher, CancellationPublisherMut, RedisPublisherClient, }; +use crate::tls::postgres_rustls::MakeRustlsConnect; pub type CancelMap = Arc>>; pub type CancellationHandlerMain = CancellationHandler>>>; @@ -35,6 +33,7 @@ type IpSubnetKey = IpNet; /// /// If `CancellationPublisher` is available, cancel request will be used to publish the cancellation key to other proxy instances. pub struct CancellationHandler
<P>
{ + compute_config: &'static ComputeConfig, map: CancelMap, client: P, /// This field used for the monitoring purposes. @@ -183,7 +182,7 @@ impl CancellationHandler
<P>
{ "cancelling query per user's request using key {key}, hostname {}, address: {}", cancel_closure.hostname, cancel_closure.socket_addr ); - cancel_closure.try_cancel_query().await + cancel_closure.try_cancel_query(self.compute_config).await } #[cfg(test)] @@ -198,8 +197,13 @@ impl CancellationHandler
<P>
{ } impl CancellationHandler<()> { - pub fn new(map: CancelMap, from: CancellationSource) -> Self { + pub fn new( + compute_config: &'static ComputeConfig, + map: CancelMap, + from: CancellationSource, + ) -> Self { Self { + compute_config, map, client: (), from, @@ -214,8 +218,14 @@ impl CancellationHandler<()> { } impl CancellationHandler>>> { - pub fn new(map: CancelMap, client: Option>>, from: CancellationSource) -> Self { + pub fn new( + compute_config: &'static ComputeConfig, + map: CancelMap, + client: Option>>, + from: CancellationSource, + ) -> Self { Self { + compute_config, map, client, from, @@ -229,8 +239,6 @@ impl CancellationHandler>>> { } } -static TLS_ROOTS: OnceCell> = OnceCell::new(); - /// This should've been a [`std::future::Future`], but /// it's impossible to name a type of an unboxed future /// (we'd need something like `#![feature(type_alias_impl_trait)]`). @@ -257,27 +265,14 @@ impl CancelClosure { } } /// Cancels the query running on user's compute node. - pub(crate) async fn try_cancel_query(self) -> Result<(), CancelError> { + pub(crate) async fn try_cancel_query( + self, + compute_config: &ComputeConfig, + ) -> Result<(), CancelError> { let socket = TcpStream::connect(self.socket_addr).await?; - let root_store = TLS_ROOTS - .get_or_try_init(load_certs) - .map_err(|_e| { - CancelError::IO(std::io::Error::new( - std::io::ErrorKind::Other, - "TLS root store initialization failed".to_string(), - )) - })? - .clone(); - - let client_config = - rustls::ClientConfig::builder_with_provider(Arc::new(ring::default_provider())) - .with_safe_default_protocol_versions() - .expect("ring should support the default protocol versions") - .with_root_certificates(root_store) - .with_no_client_auth(); - - let mut mk_tls = crate::postgres_rustls::MakeRustlsConnect::new(client_config); + let mut mk_tls = + crate::tls::postgres_rustls::MakeRustlsConnect::new(compute_config.tls.clone()); let tls = >::make_tls_connect( &mut mk_tls, &self.hostname, @@ -329,11 +324,30 @@ impl
<P> Drop for Session<P>
{ #[cfg(test)] #[expect(clippy::unwrap_used)] mod tests { + use std::time::Duration; + use super::*; + use crate::config::RetryConfig; + use crate::tls::client_config::compute_client_config_with_certs; + + fn config() -> ComputeConfig { + let retry = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 5, + backoff_factor: 2.0, + }; + + ComputeConfig { + retry, + tls: Arc::new(compute_client_config_with_certs(std::iter::empty())), + timeout: Duration::from_secs(2), + } + } #[tokio::test] async fn check_session_drop() -> anyhow::Result<()> { let cancellation_handler = Arc::new(CancellationHandler::<()>::new( + Box::leak(Box::new(config())), CancelMap::default(), CancellationSource::FromRedis, )); @@ -349,8 +363,11 @@ mod tests { #[tokio::test] async fn cancel_session_noop_regression() { - let handler = - CancellationHandler::<()>::new(CancelMap::default(), CancellationSource::Local); + let handler = CancellationHandler::<()>::new( + Box::leak(Box::new(config())), + CancelMap::default(), + CancellationSource::Local, + ); handler .cancel_session( CancelKeyData { diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 8dc9b59e81..d60dfd0f80 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -1,16 +1,13 @@ use std::io; use std::net::SocketAddr; -use std::sync::Arc; use std::time::Duration; use futures::{FutureExt, TryFutureExt}; use itertools::Itertools; -use once_cell::sync::OnceCell; use postgres_client::tls::MakeTlsConnect; use postgres_client::{CancelToken, RawConnection}; use postgres_protocol::message::backend::NoticeResponseBody; use pq_proto::StartupMessageParams; -use rustls::crypto::ring; use rustls::pki_types::InvalidDnsNameError; use thiserror::Error; use tokio::net::TcpStream; @@ -18,14 +15,15 @@ use tracing::{debug, error, info, warn}; use crate::auth::parse_endpoint_param; use crate::cancellation::CancelClosure; +use crate::config::ComputeConfig; use crate::context::RequestContext; use crate::control_plane::client::ApiLockError; use crate::control_plane::errors::WakeComputeError; use crate::control_plane::messages::MetricsAuxInfo; use crate::error::{ReportableError, UserFacingError}; use crate::metrics::{Metrics, NumDbConnectionsGuard}; -use crate::postgres_rustls::MakeRustlsConnect; use crate::proxy::neon_option; +use crate::tls::postgres_rustls::MakeRustlsConnect; use crate::types::Host; pub const COULD_NOT_CONNECT: &str = "Couldn't connect to compute node"; @@ -40,9 +38,6 @@ pub(crate) enum ConnectionError { #[error("{COULD_NOT_CONNECT}: {0}")] CouldNotConnect(#[from] io::Error), - #[error("Couldn't load native TLS certificates: {0:?}")] - TlsCertificateError(Vec), - #[error("{COULD_NOT_CONNECT}: {0}")] TlsError(#[from] InvalidDnsNameError), @@ -89,7 +84,6 @@ impl ReportableError for ConnectionError { } ConnectionError::Postgres(_) => crate::error::ErrorKind::Compute, ConnectionError::CouldNotConnect(_) => crate::error::ErrorKind::Compute, - ConnectionError::TlsCertificateError(_) => crate::error::ErrorKind::Service, ConnectionError::TlsError(_) => crate::error::ErrorKind::Compute, ConnectionError::WakeComputeError(e) => e.get_error_kind(), ConnectionError::TooManyConnectionAttempts(e) => e.get_error_kind(), @@ -251,25 +245,13 @@ impl ConnCfg { &self, ctx: &RequestContext, aux: MetricsAuxInfo, - timeout: Duration, + config: &ComputeConfig, ) -> Result { let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); - let (socket_addr, stream, host) = self.connect_raw(timeout).await?; + let (socket_addr, stream, host) = 
self.connect_raw(config.timeout).await?; drop(pause); - let root_store = TLS_ROOTS - .get_or_try_init(load_certs) - .map_err(ConnectionError::TlsCertificateError)? - .clone(); - - let client_config = - rustls::ClientConfig::builder_with_provider(Arc::new(ring::default_provider())) - .with_safe_default_protocol_versions() - .expect("ring should support the default protocol versions") - .with_root_certificates(root_store) - .with_no_client_auth(); - - let mut mk_tls = crate::postgres_rustls::MakeRustlsConnect::new(client_config); + let mut mk_tls = crate::tls::postgres_rustls::MakeRustlsConnect::new(config.tls.clone()); let tls = >::make_tls_connect( &mut mk_tls, host, @@ -341,19 +323,6 @@ fn filtered_options(options: &str) -> Option { Some(options) } -pub(crate) fn load_certs() -> Result, Vec> { - let der_certs = rustls_native_certs::load_native_certs(); - - if !der_certs.errors.is_empty() { - return Err(der_certs.errors); - } - - let mut store = rustls::RootCertStore::empty(); - store.add_parsable_certificates(der_certs.certs); - Ok(Arc::new(store)) -} -static TLS_ROOTS: OnceCell> = OnceCell::new(); - #[cfg(test)] mod tests { use super::*; diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 33d1d2e9e4..8502edcfab 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -1,17 +1,10 @@ -use std::collections::{HashMap, HashSet}; use std::str::FromStr; use std::sync::Arc; use std::time::Duration; use anyhow::{bail, ensure, Context, Ok}; use clap::ValueEnum; -use itertools::Itertools; use remote_storage::RemoteStorageConfig; -use rustls::crypto::ring::{self, sign}; -use rustls::pki_types::{CertificateDer, PrivateKeyDer}; -use sha2::{Digest, Sha256}; -use tracing::{error, info}; -use x509_parser::oid_registry; use crate::auth::backend::jwt::JwkCache; use crate::auth::backend::AuthRateLimiter; @@ -20,6 +13,7 @@ use crate::rate_limiter::{RateBucketInfo, RateLimitAlgorithm, RateLimiterConfig} use crate::scram::threadpool::ThreadPool; use crate::serverless::cancel_set::CancelSet; use crate::serverless::GlobalConnPoolOptions; +pub use crate::tls::server_config::{configure_tls, TlsConfig}; use crate::types::Host; pub struct ProxyConfig { @@ -32,7 +26,13 @@ pub struct ProxyConfig { pub handshake_timeout: Duration, pub wake_compute_retry_config: RetryConfig, pub connect_compute_locks: ApiLocks, - pub connect_to_compute_retry_config: RetryConfig, + pub connect_to_compute: ComputeConfig, +} + +pub struct ComputeConfig { + pub retry: RetryConfig, + pub tls: Arc, + pub timeout: Duration, } #[derive(Copy, Clone, Debug, ValueEnum, PartialEq)] @@ -52,12 +52,6 @@ pub struct MetricCollectionConfig { pub backup_metric_collection_config: MetricBackupCollectionConfig, } -pub struct TlsConfig { - pub config: Arc, - pub common_names: HashSet, - pub cert_resolver: Arc, -} - pub struct HttpConfig { pub accept_websockets: bool, pub pool_options: GlobalConnPoolOptions, @@ -80,272 +74,6 @@ pub struct AuthenticationConfig { pub console_redirect_confirmation_timeout: tokio::time::Duration, } -impl TlsConfig { - pub fn to_server_config(&self) -> Arc { - self.config.clone() - } -} - -/// -pub const PG_ALPN_PROTOCOL: &[u8] = b"postgresql"; - -/// Configure TLS for the main endpoint. 
-pub fn configure_tls( - key_path: &str, - cert_path: &str, - certs_dir: Option<&String>, - allow_tls_keylogfile: bool, -) -> anyhow::Result { - let mut cert_resolver = CertResolver::new(); - - // add default certificate - cert_resolver.add_cert_path(key_path, cert_path, true)?; - - // add extra certificates - if let Some(certs_dir) = certs_dir { - for entry in std::fs::read_dir(certs_dir)? { - let entry = entry?; - let path = entry.path(); - if path.is_dir() { - // file names aligned with default cert-manager names - let key_path = path.join("tls.key"); - let cert_path = path.join("tls.crt"); - if key_path.exists() && cert_path.exists() { - cert_resolver.add_cert_path( - &key_path.to_string_lossy(), - &cert_path.to_string_lossy(), - false, - )?; - } - } - } - } - - let common_names = cert_resolver.get_common_names(); - - let cert_resolver = Arc::new(cert_resolver); - - // allow TLS 1.2 to be compatible with older client libraries - let mut config = - rustls::ServerConfig::builder_with_provider(Arc::new(ring::default_provider())) - .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12]) - .context("ring should support TLS1.2 and TLS1.3")? - .with_no_client_auth() - .with_cert_resolver(cert_resolver.clone()); - - config.alpn_protocols = vec![PG_ALPN_PROTOCOL.to_vec()]; - - if allow_tls_keylogfile { - // KeyLogFile will check for the SSLKEYLOGFILE environment variable. - config.key_log = Arc::new(rustls::KeyLogFile::new()); - } - - Ok(TlsConfig { - config: Arc::new(config), - common_names, - cert_resolver, - }) -} - -/// Channel binding parameter -/// -/// -/// Description: The hash of the TLS server's certificate as it -/// appears, octet for octet, in the server's Certificate message. Note -/// that the Certificate message contains a certificate_list, in which -/// the first element is the server's certificate. -/// -/// The hash function is to be selected as follows: -/// -/// * if the certificate's signatureAlgorithm uses a single hash -/// function, and that hash function is either MD5 or SHA-1, then use SHA-256; -/// -/// * if the certificate's signatureAlgorithm uses a single hash -/// function and that hash function neither MD5 nor SHA-1, then use -/// the hash function associated with the certificate's -/// signatureAlgorithm; -/// -/// * if the certificate's signatureAlgorithm uses no hash functions or -/// uses multiple hash functions, then this channel binding type's -/// channel bindings are undefined at this time (updates to is channel -/// binding type may occur to address this issue if it ever arises). -#[derive(Debug, Clone, Copy)] -pub enum TlsServerEndPoint { - Sha256([u8; 32]), - Undefined, -} - -impl TlsServerEndPoint { - pub fn new(cert: &CertificateDer<'_>) -> anyhow::Result { - let sha256_oids = [ - // I'm explicitly not adding MD5 or SHA1 here... They're bad. - oid_registry::OID_SIG_ECDSA_WITH_SHA256, - oid_registry::OID_PKCS1_SHA256WITHRSA, - ]; - - let pem = x509_parser::parse_x509_certificate(cert) - .context("Failed to parse PEM object from cerficiate")? 
- .1; - - info!(subject = %pem.subject, "parsing TLS certificate"); - - let reg = oid_registry::OidRegistry::default().with_all_crypto(); - let oid = pem.signature_algorithm.oid(); - let alg = reg.get(oid); - if sha256_oids.contains(oid) { - let tls_server_end_point: [u8; 32] = Sha256::new().chain_update(cert).finalize().into(); - info!(subject = %pem.subject, signature_algorithm = alg.map(|a| a.description()), tls_server_end_point = %base64::encode(tls_server_end_point), "determined channel binding"); - Ok(Self::Sha256(tls_server_end_point)) - } else { - error!(subject = %pem.subject, signature_algorithm = alg.map(|a| a.description()), "unknown channel binding"); - Ok(Self::Undefined) - } - } - - pub fn supported(&self) -> bool { - !matches!(self, TlsServerEndPoint::Undefined) - } -} - -#[derive(Default, Debug)] -pub struct CertResolver { - certs: HashMap, TlsServerEndPoint)>, - default: Option<(Arc, TlsServerEndPoint)>, -} - -impl CertResolver { - pub fn new() -> Self { - Self::default() - } - - fn add_cert_path( - &mut self, - key_path: &str, - cert_path: &str, - is_default: bool, - ) -> anyhow::Result<()> { - let priv_key = { - let key_bytes = std::fs::read(key_path) - .with_context(|| format!("Failed to read TLS keys at '{key_path}'"))?; - rustls_pemfile::private_key(&mut &key_bytes[..]) - .with_context(|| format!("Failed to parse TLS keys at '{key_path}'"))? - .with_context(|| format!("Failed to parse TLS keys at '{key_path}'"))? - }; - - let cert_chain_bytes = std::fs::read(cert_path) - .context(format!("Failed to read TLS cert file at '{cert_path}.'"))?; - - let cert_chain = { - rustls_pemfile::certs(&mut &cert_chain_bytes[..]) - .try_collect() - .with_context(|| { - format!("Failed to read TLS certificate chain from bytes from file at '{cert_path}'.") - })? - }; - - self.add_cert(priv_key, cert_chain, is_default) - } - - pub fn add_cert( - &mut self, - priv_key: PrivateKeyDer<'static>, - cert_chain: Vec>, - is_default: bool, - ) -> anyhow::Result<()> { - let key = sign::any_supported_type(&priv_key).context("invalid private key")?; - - let first_cert = &cert_chain[0]; - let tls_server_end_point = TlsServerEndPoint::new(first_cert)?; - let pem = x509_parser::parse_x509_certificate(first_cert) - .context("Failed to parse PEM object from cerficiate")? - .1; - - let common_name = pem.subject().to_string(); - - // We need to get the canonical name for this certificate so we can match them against any domain names - // seen within the proxy codebase. - // - // In scram-proxy we use wildcard certificates only, with the database endpoint as the wildcard subdomain, taken from SNI. - // We need to remove the wildcard prefix for the purposes of certificate selection. - // - // auth-broker does not use SNI and instead uses the Neon-Connection-String header. - // Auth broker has the subdomain `apiauth` we need to remove for the purposes of validating the Neon-Connection-String. 
- // - // Console Redirect proxy does not use any wildcard domains and does not need any certificate selection or conn string - // validation, so let's we can continue with any common-name - let common_name = if let Some(s) = common_name.strip_prefix("CN=*.") { - s.to_string() - } else if let Some(s) = common_name.strip_prefix("CN=apiauth.") { - s.to_string() - } else if let Some(s) = common_name.strip_prefix("CN=") { - s.to_string() - } else { - bail!("Failed to parse common name from certificate") - }; - - let cert = Arc::new(rustls::sign::CertifiedKey::new(cert_chain, key)); - - if is_default { - self.default = Some((cert.clone(), tls_server_end_point)); - } - - self.certs.insert(common_name, (cert, tls_server_end_point)); - - Ok(()) - } - - pub fn get_common_names(&self) -> HashSet { - self.certs.keys().map(|s| s.to_string()).collect() - } -} - -impl rustls::server::ResolvesServerCert for CertResolver { - fn resolve( - &self, - client_hello: rustls::server::ClientHello<'_>, - ) -> Option> { - self.resolve(client_hello.server_name()).map(|x| x.0) - } -} - -impl CertResolver { - pub fn resolve( - &self, - server_name: Option<&str>, - ) -> Option<(Arc, TlsServerEndPoint)> { - // loop here and cut off more and more subdomains until we find - // a match to get a proper wildcard support. OTOH, we now do not - // use nested domains, so keep this simple for now. - // - // With the current coding foo.com will match *.foo.com and that - // repeats behavior of the old code. - if let Some(mut sni_name) = server_name { - loop { - if let Some(cert) = self.certs.get(sni_name) { - return Some(cert.clone()); - } - if let Some((_, rest)) = sni_name.split_once('.') { - sni_name = rest; - } else { - return None; - } - } - } else { - // No SNI, use the default certificate, otherwise we can't get to - // options parameter which can be used to set endpoint name too. - // That means that non-SNI flow will not work for CNAME domains in - // verify-full mode. - // - // If that will be a problem we can: - // - // a) Instead of multi-cert approach use single cert with extra - // domains listed in Subject Alternative Name (SAN). - // b) Deploy separate proxy instances for extra domains. - self.default.clone() - } - } -} - #[derive(Debug)] pub struct EndpointCacheConfig { /// Batch size to receive all endpoints on the startup. 
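As an aside on the shape this refactor settles on (illustrative stand-ins only; the real `ComputeConfig`/`RetryConfig` are defined in `proxy/src/config.rs` and the root store comes from `compute_client_config_with_root_certs`), the point is that the client TLS config is built once at startup and every caller just clones the `Arc`:

```rust
use std::sync::Arc;
use std::time::Duration;

// Illustrative stand-ins mirroring the fields introduced by this patch.
struct RetryConfig {
    base_delay: Duration,
    max_retries: u32,
    backoff_factor: f64,
}

struct ComputeConfig {
    retry: RetryConfig,
    tls: Arc<rustls::ClientConfig>,
    timeout: Duration,
}

fn build_compute_config(tls: Arc<rustls::ClientConfig>) -> &'static ComputeConfig {
    // Built once and leaked, so connect, cancellation and the Redis
    // notification task can all hold a &'static reference to the same config.
    Box::leak(Box::new(ComputeConfig {
        retry: RetryConfig {
            base_delay: Duration::from_secs(1),
            max_retries: 5,
            backoff_factor: 2.0,
        },
        tls, // cloning this Arc later is cheap: no per-connection cert parsing
        timeout: Duration::from_secs(2),
    }))
}
```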
diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index c477822e85..25a549039c 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -115,7 +115,7 @@ pub async fn task_main( Ok(Some(p)) => { ctx.set_success(); let _disconnect = ctx.log_connect(); - match p.proxy_pass().await { + match p.proxy_pass(&config.connect_to_compute).await { Ok(()) => {} Err(ErrorSource::Client(e)) => { error!(?session_id, "per-client task finished with an IO error from the client: {e:#}"); @@ -216,7 +216,7 @@ pub(crate) async fn handle_client( }, &user_info, config.wake_compute_retry_config, - config.connect_to_compute_retry_config, + &config.connect_to_compute, ) .or_else(|e| stream.throw_error(e)) .await?; diff --git a/proxy/src/control_plane/mod.rs b/proxy/src/control_plane/mod.rs index 0ca1a6aae0..c65041df0e 100644 --- a/proxy/src/control_plane/mod.rs +++ b/proxy/src/control_plane/mod.rs @@ -10,13 +10,13 @@ pub mod client; pub(crate) mod errors; use std::sync::Arc; -use std::time::Duration; use crate::auth::backend::jwt::AuthRule; use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo}; use crate::auth::IpPattern; use crate::cache::project_info::ProjectInfoCacheImpl; use crate::cache::{Cached, TimedLru}; +use crate::config::ComputeConfig; use crate::context::RequestContext; use crate::control_plane::messages::{ControlPlaneErrorMessage, MetricsAuxInfo}; use crate::intern::ProjectIdInt; @@ -73,9 +73,9 @@ impl NodeInfo { pub(crate) async fn connect( &self, ctx: &RequestContext, - timeout: Duration, + config: &ComputeConfig, ) -> Result { - self.config.connect(ctx, self.aux.clone(), timeout).await + self.config.connect(ctx, self.aux.clone(), config).await } pub(crate) fn reuse_settings(&mut self, other: Self) { diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index a5a72f26d9..c56474edd7 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -89,7 +89,6 @@ pub mod jemalloc; pub mod logging; pub mod metrics; pub mod parse; -pub mod postgres_rustls; pub mod protocol2; pub mod proxy; pub mod rate_limiter; @@ -99,6 +98,7 @@ pub mod scram; pub mod serverless; pub mod signals; pub mod stream; +pub mod tls; pub mod types; pub mod url; pub mod usage_metrics; diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index 4a30d23985..8a80494860 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -6,7 +6,7 @@ use tracing::{debug, info, warn}; use super::retry::ShouldRetryWakeCompute; use crate::auth::backend::ComputeCredentialKeys; use crate::compute::{self, PostgresConnection, COULD_NOT_CONNECT}; -use crate::config::RetryConfig; +use crate::config::{ComputeConfig, RetryConfig}; use crate::context::RequestContext; use crate::control_plane::errors::WakeComputeError; use crate::control_plane::locks::ApiLocks; @@ -19,8 +19,6 @@ use crate::proxy::retry::{retry_after, should_retry, CouldRetry}; use crate::proxy::wake_compute::wake_compute; use crate::types::Host; -const CONNECT_TIMEOUT: time::Duration = time::Duration::from_secs(2); - /// If we couldn't connect, a cached connection info might be to blame /// (e.g. the compute node's address might've changed at the wrong time). /// Invalidate the cache entry (if any) to prevent subsequent errors. 
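For context on the `compute.retry` values threaded through the calls below (a hedged sketch with hypothetical helper names, not the proxy's actual `retry_after`/`should_retry` from `proxy/src/proxy/retry.rs`), a `RetryConfig { base_delay, max_retries, backoff_factor }` usually translates into an exponential backoff along these lines:

```rust
use std::time::Duration;

// Hypothetical helpers; shown only to illustrate what the RetryConfig fields mean.
fn backoff_after(base_delay: Duration, backoff_factor: f64, num_retries: u32) -> Duration {
    // The first retry waits base_delay; each further retry multiplies by
    // backoff_factor, e.g. 1s, 2s, 4s, ... for base_delay = 1s, factor = 2.0.
    base_delay.mul_f64(backoff_factor.powi(num_retries.saturating_sub(1) as i32))
}

fn within_retry_budget(num_retries: u32, max_retries: u32) -> bool {
    num_retries < max_retries
}
```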
@@ -49,7 +47,7 @@ pub(crate) trait ConnectMechanism { &self, ctx: &RequestContext, node_info: &control_plane::CachedNodeInfo, - timeout: time::Duration, + config: &ComputeConfig, ) -> Result; fn update_connect_config(&self, conf: &mut compute::ConnCfg); @@ -86,11 +84,11 @@ impl ConnectMechanism for TcpMechanism<'_> { &self, ctx: &RequestContext, node_info: &control_plane::CachedNodeInfo, - timeout: time::Duration, + config: &ComputeConfig, ) -> Result { let host = node_info.config.get_host(); let permit = self.locks.get_permit(&host).await?; - permit.release_result(node_info.connect(ctx, timeout).await) + permit.release_result(node_info.connect(ctx, config).await) } fn update_connect_config(&self, config: &mut compute::ConnCfg) { @@ -105,7 +103,7 @@ pub(crate) async fn connect_to_compute Result where M::ConnectError: CouldRetry + ShouldRetryWakeCompute + std::fmt::Debug, @@ -119,10 +117,7 @@ where mechanism.update_connect_config(&mut node_info.config); // try once - let err = match mechanism - .connect_once(ctx, &node_info, CONNECT_TIMEOUT) - .await - { + let err = match mechanism.connect_once(ctx, &node_info, compute).await { Ok(res) => { ctx.success(); Metrics::get().proxy.retries_metric.observe( @@ -142,7 +137,7 @@ where let node_info = if !node_info.cached() || !err.should_retry_wake_compute() { // If we just recieved this from cplane and didn't get it from cache, we shouldn't retry. // Do not need to retrieve a new node_info, just return the old one. - if should_retry(&err, num_retries, connect_to_compute_retry_config) { + if should_retry(&err, num_retries, compute.retry) { Metrics::get().proxy.retries_metric.observe( RetriesMetricGroup { outcome: ConnectOutcome::Failed, @@ -172,10 +167,7 @@ where debug!("wake_compute success. attempting to connect"); num_retries = 1; loop { - match mechanism - .connect_once(ctx, &node_info, CONNECT_TIMEOUT) - .await - { + match mechanism.connect_once(ctx, &node_info, compute).await { Ok(res) => { ctx.success(); Metrics::get().proxy.retries_metric.observe( @@ -190,7 +182,7 @@ where return Ok(res); } Err(e) => { - if !should_retry(&e, num_retries, connect_to_compute_retry_config) { + if !should_retry(&e, num_retries, compute.retry) { // Don't log an error here, caller will print the error Metrics::get().proxy.retries_metric.observe( RetriesMetricGroup { @@ -206,7 +198,7 @@ where } }; - let wait_duration = retry_after(num_retries, connect_to_compute_retry_config); + let wait_duration = retry_after(num_retries, compute.retry); num_retries += 1; let pause = ctx.latency_timer_pause(crate::metrics::Waiting::RetryTimeout); diff --git a/proxy/src/proxy/handshake.rs b/proxy/src/proxy/handshake.rs index e27c211932..955f754497 100644 --- a/proxy/src/proxy/handshake.rs +++ b/proxy/src/proxy/handshake.rs @@ -8,12 +8,13 @@ use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{debug, info, warn}; use crate::auth::endpoint_sni; -use crate::config::{TlsConfig, PG_ALPN_PROTOCOL}; +use crate::config::TlsConfig; use crate::context::RequestContext; use crate::error::ReportableError; use crate::metrics::Metrics; use crate::proxy::ERR_INSECURE_CONNECTION; use crate::stream::{PqStream, Stream, StreamUpgradeError}; +use crate::tls::PG_ALPN_PROTOCOL; #[derive(Error, Debug)] pub(crate) enum HandshakeError { diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index dbe174cab7..3926c56fec 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -152,7 +152,7 @@ pub async fn task_main( Ok(Some(p)) => { ctx.set_success(); let _disconnect = ctx.log_connect(); - 
match p.proxy_pass().await { + match p.proxy_pass(&config.connect_to_compute).await { Ok(()) => {} Err(ErrorSource::Client(e)) => { warn!(?session_id, "per-client task finished with an IO error from the client: {e:#}"); @@ -351,7 +351,7 @@ pub(crate) async fn handle_client( }, &user_info, config.wake_compute_retry_config, - config.connect_to_compute_retry_config, + &config.connect_to_compute, ) .or_else(|e| stream.throw_error(e)) .await?; diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs index dcaa81e5cd..a42f9aad39 100644 --- a/proxy/src/proxy/passthrough.rs +++ b/proxy/src/proxy/passthrough.rs @@ -5,6 +5,7 @@ use utils::measured_stream::MeasuredStream; use super::copy_bidirectional::ErrorSource; use crate::cancellation; use crate::compute::PostgresConnection; +use crate::config::ComputeConfig; use crate::control_plane::messages::MetricsAuxInfo; use crate::metrics::{Direction, Metrics, NumClientConnectionsGuard, NumConnectionRequestsGuard}; use crate::stream::Stream; @@ -67,9 +68,17 @@ pub(crate) struct ProxyPassthrough { } impl ProxyPassthrough { - pub(crate) async fn proxy_pass(self) -> Result<(), ErrorSource> { + pub(crate) async fn proxy_pass( + self, + compute_config: &ComputeConfig, + ) -> Result<(), ErrorSource> { let res = proxy_pass(self.client, self.compute.stream, self.aux).await; - if let Err(err) = self.compute.cancel_closure.try_cancel_query().await { + if let Err(err) = self + .compute + .cancel_closure + .try_cancel_query(compute_config) + .await + { tracing::warn!(session_id = ?self.session_id, ?err, "could not cancel the query in the database"); } res diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index 95c518fed9..10db2bcb30 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -22,14 +22,16 @@ use super::*; use crate::auth::backend::{ ComputeCredentialKeys, ComputeCredentials, ComputeUserInfo, MaybeOwned, }; -use crate::config::{CertResolver, RetryConfig}; +use crate::config::{ComputeConfig, RetryConfig}; use crate::control_plane::client::{ControlPlaneClient, TestControlPlaneClient}; use crate::control_plane::messages::{ControlPlaneErrorMessage, Details, MetricsAuxInfo, Status}; use crate::control_plane::{ self, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, NodeInfo, NodeInfoCache, }; use crate::error::ErrorKind; -use crate::postgres_rustls::MakeRustlsConnect; +use crate::tls::client_config::compute_client_config_with_certs; +use crate::tls::postgres_rustls::MakeRustlsConnect; +use crate::tls::server_config::CertResolver; use crate::types::{BranchId, EndpointId, ProjectId}; use crate::{sasl, scram}; @@ -67,7 +69,7 @@ fn generate_certs( } struct ClientConfig<'a> { - config: rustls::ClientConfig, + config: Arc, hostname: &'a str, } @@ -110,16 +112,7 @@ fn generate_tls_config<'a>( }; let client_config = { - let config = - rustls::ClientConfig::builder_with_provider(Arc::new(ring::default_provider())) - .with_safe_default_protocol_versions() - .context("ring should support the default protocol versions")? 
- .with_root_certificates({ - let mut store = rustls::RootCertStore::empty(); - store.add(ca)?; - store - }) - .with_no_client_auth(); + let config = Arc::new(compute_client_config_with_certs([ca])); ClientConfig { config, hostname } }; @@ -468,7 +461,7 @@ impl ConnectMechanism for TestConnectMechanism { &self, _ctx: &RequestContext, _node_info: &control_plane::CachedNodeInfo, - _timeout: std::time::Duration, + _config: &ComputeConfig, ) -> Result { let mut counter = self.counter.lock().unwrap(); let action = self.sequence[*counter]; @@ -576,6 +569,20 @@ fn helper_create_connect_info( user_info } +fn config() -> ComputeConfig { + let retry = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 5, + backoff_factor: 2.0, + }; + + ComputeConfig { + retry, + tls: Arc::new(compute_client_config_with_certs(std::iter::empty())), + timeout: Duration::from_secs(2), + } +} + #[tokio::test] async fn connect_to_compute_success() { let _ = env_logger::try_init(); @@ -583,12 +590,8 @@ async fn connect_to_compute_success() { let ctx = RequestContext::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); - let config = RetryConfig { - base_delay: Duration::from_secs(1), - max_retries: 5, - backoff_factor: 2.0, - }; - connect_to_compute(&ctx, &mechanism, &user_info, config, config) + let config = config(); + connect_to_compute(&ctx, &mechanism, &user_info, config.retry, &config) .await .unwrap(); mechanism.verify(); @@ -601,12 +604,8 @@ async fn connect_to_compute_retry() { let ctx = RequestContext::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); - let config = RetryConfig { - base_delay: Duration::from_secs(1), - max_retries: 5, - backoff_factor: 2.0, - }; - connect_to_compute(&ctx, &mechanism, &user_info, config, config) + let config = config(); + connect_to_compute(&ctx, &mechanism, &user_info, config.retry, &config) .await .unwrap(); mechanism.verify(); @@ -620,12 +619,8 @@ async fn connect_to_compute_non_retry_1() { let ctx = RequestContext::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Fail]); let user_info = helper_create_connect_info(&mechanism); - let config = RetryConfig { - base_delay: Duration::from_secs(1), - max_retries: 5, - backoff_factor: 2.0, - }; - connect_to_compute(&ctx, &mechanism, &user_info, config, config) + let config = config(); + connect_to_compute(&ctx, &mechanism, &user_info, config.retry, &config) .await .unwrap_err(); mechanism.verify(); @@ -639,12 +634,8 @@ async fn connect_to_compute_non_retry_2() { let ctx = RequestContext::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Fail, Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); - let config = RetryConfig { - base_delay: Duration::from_secs(1), - max_retries: 5, - backoff_factor: 2.0, - }; - connect_to_compute(&ctx, &mechanism, &user_info, config, config) + let config = config(); + connect_to_compute(&ctx, &mechanism, &user_info, config.retry, &config) .await .unwrap(); mechanism.verify(); @@ -665,17 +656,13 @@ async fn connect_to_compute_non_retry_3() { max_retries: 1, backoff_factor: 2.0, }; - let connect_to_compute_retry_config = RetryConfig { - base_delay: Duration::from_secs(1), - max_retries: 5, - backoff_factor: 2.0, - }; + let config = config(); connect_to_compute( &ctx, &mechanism, &user_info, wake_compute_retry_config, - connect_to_compute_retry_config, + &config, ) .await 
.unwrap_err(); @@ -690,12 +677,8 @@ async fn wake_retry() { let ctx = RequestContext::test(); let mechanism = TestConnectMechanism::new(vec![WakeRetry, Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); - let config = RetryConfig { - base_delay: Duration::from_secs(1), - max_retries: 5, - backoff_factor: 2.0, - }; - connect_to_compute(&ctx, &mechanism, &user_info, config, config) + let config = config(); + connect_to_compute(&ctx, &mechanism, &user_info, config.retry, &config) .await .unwrap(); mechanism.verify(); @@ -709,12 +692,8 @@ async fn wake_non_retry() { let ctx = RequestContext::test(); let mechanism = TestConnectMechanism::new(vec![WakeRetry, WakeFail]); let user_info = helper_create_connect_info(&mechanism); - let config = RetryConfig { - base_delay: Duration::from_secs(1), - max_retries: 5, - backoff_factor: 2.0, - }; - connect_to_compute(&ctx, &mechanism, &user_info, config, config) + let config = config(); + connect_to_compute(&ctx, &mechanism, &user_info, config.retry, &config) .await .unwrap_err(); mechanism.verify(); diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index d18dfd2465..80b93b6c4f 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -12,6 +12,7 @@ use uuid::Uuid; use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider; use crate::cache::project_info::ProjectInfoCache; use crate::cancellation::{CancelMap, CancellationHandler}; +use crate::config::ProxyConfig; use crate::intern::{ProjectIdInt, RoleNameInt}; use crate::metrics::{Metrics, RedisErrors, RedisEventsCount}; @@ -249,6 +250,7 @@ async fn handle_messages( /// Handle console's invalidation messages. #[tracing::instrument(name = "redis_notifications", skip_all)] pub async fn task_main( + config: &'static ProxyConfig, redis: ConnectionWithCredentialsProvider, cache: Arc, cancel_map: CancelMap, @@ -258,6 +260,7 @@ where C: ProjectInfoCache + Send + Sync + 'static, { let cancellation_handler = Arc::new(CancellationHandler::<()>::new( + &config.connect_to_compute, cancel_map, crate::metrics::CancellationSource::FromRedis, )); diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs index 6a13f645a5..77853db3db 100644 --- a/proxy/src/scram/exchange.rs +++ b/proxy/src/scram/exchange.rs @@ -13,7 +13,6 @@ use super::secret::ServerSecret; use super::signature::SignatureBuilder; use super::threadpool::ThreadPool; use super::ScramKey; -use crate::config; use crate::intern::EndpointIdInt; use crate::sasl::{self, ChannelBinding, Error as SaslError}; @@ -59,14 +58,14 @@ enum ExchangeState { pub(crate) struct Exchange<'a> { state: ExchangeState, secret: &'a ServerSecret, - tls_server_end_point: config::TlsServerEndPoint, + tls_server_end_point: crate::tls::TlsServerEndPoint, } impl<'a> Exchange<'a> { pub(crate) fn new( secret: &'a ServerSecret, nonce: fn() -> [u8; SCRAM_RAW_NONCE_LEN], - tls_server_end_point: config::TlsServerEndPoint, + tls_server_end_point: crate::tls::TlsServerEndPoint, ) -> Self { Self { state: ExchangeState::Initial(SaslInitial { nonce }), @@ -120,7 +119,7 @@ impl SaslInitial { fn transition( &self, secret: &ServerSecret, - tls_server_end_point: &config::TlsServerEndPoint, + tls_server_end_point: &crate::tls::TlsServerEndPoint, input: &str, ) -> sasl::Result> { let client_first_message = ClientFirstMessage::parse(input) @@ -155,7 +154,7 @@ impl SaslSentInner { fn transition( &self, secret: &ServerSecret, - tls_server_end_point: &config::TlsServerEndPoint, + 
tls_server_end_point: &crate::tls::TlsServerEndPoint, input: &str, ) -> sasl::Result> { let Self { @@ -168,8 +167,8 @@ impl SaslSentInner { .ok_or(SaslError::BadClientMessage("invalid client-final-message"))?; let channel_binding = cbind_flag.encode(|_| match tls_server_end_point { - config::TlsServerEndPoint::Sha256(x) => Ok(x), - config::TlsServerEndPoint::Undefined => Err(SaslError::MissingBinding), + crate::tls::TlsServerEndPoint::Sha256(x) => Ok(x), + crate::tls::TlsServerEndPoint::Undefined => Err(SaslError::MissingBinding), })?; // This might've been caused by a MITM attack diff --git a/proxy/src/scram/mod.rs b/proxy/src/scram/mod.rs index b49a9f32ee..cfa571cbe1 100644 --- a/proxy/src/scram/mod.rs +++ b/proxy/src/scram/mod.rs @@ -77,11 +77,8 @@ mod tests { const NONCE: [u8; 18] = [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, ]; - let mut exchange = Exchange::new( - &secret, - || NONCE, - crate::config::TlsServerEndPoint::Undefined, - ); + let mut exchange = + Exchange::new(&secret, || NONCE, crate::tls::TlsServerEndPoint::Undefined); let client_first = "n,,n=user,r=rOprNGfwEbeRWgbNEkqO"; let client_final = "c=biws,r=rOprNGfwEbeRWgbNEkqOAQIDBAUGBwgJCgsMDQ4PEBES,p=rw1r5Kph5ThxmaUBC2GAQ6MfXbPnNkFiTIvdb/Rear0="; diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 449d50b6e7..b398c3ddd0 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -22,7 +22,7 @@ use crate::compute; use crate::compute_ctl::{ ComputeCtlError, ExtensionInstallRequest, Privilege, SetRoleGrantsRequest, }; -use crate::config::ProxyConfig; +use crate::config::{ComputeConfig, ProxyConfig}; use crate::context::RequestContext; use crate::control_plane::client::ApiLockError; use crate::control_plane::errors::{GetAuthInfoError, WakeComputeError}; @@ -196,7 +196,7 @@ impl PoolingBackend { }, &backend, self.config.wake_compute_retry_config, - self.config.connect_to_compute_retry_config, + &self.config.connect_to_compute, ) .await } @@ -237,7 +237,7 @@ impl PoolingBackend { }, &backend, self.config.wake_compute_retry_config, - self.config.connect_to_compute_retry_config, + &self.config.connect_to_compute, ) .await } @@ -502,7 +502,7 @@ impl ConnectMechanism for TokioMechanism { &self, ctx: &RequestContext, node_info: &CachedNodeInfo, - timeout: Duration, + compute_config: &ComputeConfig, ) -> Result { let host = node_info.config.get_host(); let permit = self.locks.get_permit(&host).await?; @@ -511,7 +511,7 @@ impl ConnectMechanism for TokioMechanism { let config = config .user(&self.conn_info.user_info.user) .dbname(&self.conn_info.dbname) - .connect_timeout(timeout); + .connect_timeout(compute_config.timeout); let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); let res = config.connect(postgres_client::NoTls).await; @@ -552,7 +552,7 @@ impl ConnectMechanism for HyperMechanism { &self, ctx: &RequestContext, node_info: &CachedNodeInfo, - timeout: Duration, + config: &ComputeConfig, ) -> Result { let host = node_info.config.get_host(); let permit = self.locks.get_permit(&host).await?; @@ -560,7 +560,7 @@ impl ConnectMechanism for HyperMechanism { let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); let port = node_info.config.get_port(); - let res = connect_http2(&host, port, timeout).await; + let res = connect_http2(&host, port, config.timeout).await; drop(pause); let (client, connection) = permit.release_result(res)?; diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index 
812fedaf04..47326c1181 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -168,7 +168,7 @@ pub(crate) async fn serve_websocket( Ok(Some(p)) => { ctx.set_success(); ctx.log_connect(); - match p.proxy_pass().await { + match p.proxy_pass(&config.connect_to_compute).await { Ok(()) => Ok(()), Err(ErrorSource::Client(err)) => Err(err).context("client"), Err(ErrorSource::Compute(err)) => Err(err).context("compute"), diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs index 11f426819d..ace27a7284 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -11,9 +11,9 @@ use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; use tokio_rustls::server::TlsStream; use tracing::debug; -use crate::config::TlsServerEndPoint; use crate::error::{ErrorKind, ReportableError, UserFacingError}; use crate::metrics::Metrics; +use crate::tls::TlsServerEndPoint; /// Stream wrapper which implements libpq's protocol. /// diff --git a/proxy/src/tls/client_config.rs b/proxy/src/tls/client_config.rs new file mode 100644 index 0000000000..a2d695aae1 --- /dev/null +++ b/proxy/src/tls/client_config.rs @@ -0,0 +1,42 @@ +use std::sync::Arc; + +use anyhow::bail; +use rustls::crypto::ring; + +pub(crate) fn load_certs() -> anyhow::Result> { + let der_certs = rustls_native_certs::load_native_certs(); + + if !der_certs.errors.is_empty() { + bail!("could not parse certificates: {:?}", der_certs.errors); + } + + let mut store = rustls::RootCertStore::empty(); + store.add_parsable_certificates(der_certs.certs); + Ok(Arc::new(store)) +} + +/// Loads the root certificates and constructs a client config suitable for connecting to the neon compute. +/// This function is blocking. +pub fn compute_client_config_with_root_certs() -> anyhow::Result { + Ok( + rustls::ClientConfig::builder_with_provider(Arc::new(ring::default_provider())) + .with_safe_default_protocol_versions() + .expect("ring should support the default protocol versions") + .with_root_certificates(load_certs()?) + .with_no_client_auth(), + ) +} + +#[cfg(test)] +pub fn compute_client_config_with_certs( + certs: impl IntoIterator>, +) -> rustls::ClientConfig { + let mut store = rustls::RootCertStore::empty(); + store.add_parsable_certificates(certs); + + rustls::ClientConfig::builder_with_provider(Arc::new(ring::default_provider())) + .with_safe_default_protocol_versions() + .expect("ring should support the default protocol versions") + .with_root_certificates(store) + .with_no_client_auth() +} diff --git a/proxy/src/tls/mod.rs b/proxy/src/tls/mod.rs new file mode 100644 index 0000000000..d6ce6bd9fc --- /dev/null +++ b/proxy/src/tls/mod.rs @@ -0,0 +1,72 @@ +pub mod client_config; +pub mod postgres_rustls; +pub mod server_config; + +use anyhow::Context; +use rustls::pki_types::CertificateDer; +use sha2::{Digest, Sha256}; +use tracing::{error, info}; +use x509_parser::oid_registry; + +/// +pub const PG_ALPN_PROTOCOL: &[u8] = b"postgresql"; + +/// Channel binding parameter +/// +/// +/// Description: The hash of the TLS server's certificate as it +/// appears, octet for octet, in the server's Certificate message. Note +/// that the Certificate message contains a certificate_list, in which +/// the first element is the server's certificate. 
+/// +/// The hash function is to be selected as follows: +/// +/// * if the certificate's signatureAlgorithm uses a single hash +/// function, and that hash function is either MD5 or SHA-1, then use SHA-256; +/// +/// * if the certificate's signatureAlgorithm uses a single hash +/// function and that hash function neither MD5 nor SHA-1, then use +/// the hash function associated with the certificate's +/// signatureAlgorithm; +/// +/// * if the certificate's signatureAlgorithm uses no hash functions or +/// uses multiple hash functions, then this channel binding type's +/// channel bindings are undefined at this time (updates to is channel +/// binding type may occur to address this issue if it ever arises). +#[derive(Debug, Clone, Copy)] +pub enum TlsServerEndPoint { + Sha256([u8; 32]), + Undefined, +} + +impl TlsServerEndPoint { + pub fn new(cert: &CertificateDer<'_>) -> anyhow::Result { + let sha256_oids = [ + // I'm explicitly not adding MD5 or SHA1 here... They're bad. + oid_registry::OID_SIG_ECDSA_WITH_SHA256, + oid_registry::OID_PKCS1_SHA256WITHRSA, + ]; + + let pem = x509_parser::parse_x509_certificate(cert) + .context("Failed to parse PEM object from cerficiate")? + .1; + + info!(subject = %pem.subject, "parsing TLS certificate"); + + let reg = oid_registry::OidRegistry::default().with_all_crypto(); + let oid = pem.signature_algorithm.oid(); + let alg = reg.get(oid); + if sha256_oids.contains(oid) { + let tls_server_end_point: [u8; 32] = Sha256::new().chain_update(cert).finalize().into(); + info!(subject = %pem.subject, signature_algorithm = alg.map(|a| a.description()), tls_server_end_point = %base64::encode(tls_server_end_point), "determined channel binding"); + Ok(Self::Sha256(tls_server_end_point)) + } else { + error!(subject = %pem.subject, signature_algorithm = alg.map(|a| a.description()), "unknown channel binding"); + Ok(Self::Undefined) + } + } + + pub fn supported(&self) -> bool { + !matches!(self, TlsServerEndPoint::Undefined) + } +} diff --git a/proxy/src/postgres_rustls/mod.rs b/proxy/src/tls/postgres_rustls.rs similarity index 96% rename from proxy/src/postgres_rustls/mod.rs rename to proxy/src/tls/postgres_rustls.rs index 5ef20991c3..0ad279b635 100644 --- a/proxy/src/postgres_rustls/mod.rs +++ b/proxy/src/tls/postgres_rustls.rs @@ -18,7 +18,7 @@ mod private { use tokio_rustls::client::TlsStream; use tokio_rustls::TlsConnector; - use crate::config::TlsServerEndPoint; + use crate::tls::TlsServerEndPoint; pub struct TlsConnectFuture { inner: tokio_rustls::Connect, @@ -126,16 +126,14 @@ mod private { /// That way you can connect to PostgreSQL using `rustls` as the TLS stack. #[derive(Clone)] pub struct MakeRustlsConnect { - config: Arc, + pub config: Arc, } impl MakeRustlsConnect { /// Creates a new `MakeRustlsConnect` from the provided `ClientConfig`. 
#[must_use] - pub fn new(config: ClientConfig) -> Self { - Self { - config: Arc::new(config), - } + pub fn new(config: Arc) -> Self { + Self { config } } } diff --git a/proxy/src/tls/server_config.rs b/proxy/src/tls/server_config.rs new file mode 100644 index 0000000000..2cc1657eea --- /dev/null +++ b/proxy/src/tls/server_config.rs @@ -0,0 +1,218 @@ +use std::collections::{HashMap, HashSet}; +use std::sync::Arc; + +use anyhow::{bail, Context}; +use itertools::Itertools; +use rustls::crypto::ring::{self, sign}; +use rustls::pki_types::{CertificateDer, PrivateKeyDer}; + +use super::{TlsServerEndPoint, PG_ALPN_PROTOCOL}; + +pub struct TlsConfig { + pub config: Arc, + pub common_names: HashSet, + pub cert_resolver: Arc, +} + +impl TlsConfig { + pub fn to_server_config(&self) -> Arc { + self.config.clone() + } +} + +/// Configure TLS for the main endpoint. +pub fn configure_tls( + key_path: &str, + cert_path: &str, + certs_dir: Option<&String>, + allow_tls_keylogfile: bool, +) -> anyhow::Result { + let mut cert_resolver = CertResolver::new(); + + // add default certificate + cert_resolver.add_cert_path(key_path, cert_path, true)?; + + // add extra certificates + if let Some(certs_dir) = certs_dir { + for entry in std::fs::read_dir(certs_dir)? { + let entry = entry?; + let path = entry.path(); + if path.is_dir() { + // file names aligned with default cert-manager names + let key_path = path.join("tls.key"); + let cert_path = path.join("tls.crt"); + if key_path.exists() && cert_path.exists() { + cert_resolver.add_cert_path( + &key_path.to_string_lossy(), + &cert_path.to_string_lossy(), + false, + )?; + } + } + } + } + + let common_names = cert_resolver.get_common_names(); + + let cert_resolver = Arc::new(cert_resolver); + + // allow TLS 1.2 to be compatible with older client libraries + let mut config = + rustls::ServerConfig::builder_with_provider(Arc::new(ring::default_provider())) + .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12]) + .context("ring should support TLS1.2 and TLS1.3")? + .with_no_client_auth() + .with_cert_resolver(cert_resolver.clone()); + + config.alpn_protocols = vec![PG_ALPN_PROTOCOL.to_vec()]; + + if allow_tls_keylogfile { + // KeyLogFile will check for the SSLKEYLOGFILE environment variable. + config.key_log = Arc::new(rustls::KeyLogFile::new()); + } + + Ok(TlsConfig { + config: Arc::new(config), + common_names, + cert_resolver, + }) +} + +#[derive(Default, Debug)] +pub struct CertResolver { + certs: HashMap, TlsServerEndPoint)>, + default: Option<(Arc, TlsServerEndPoint)>, +} + +impl CertResolver { + pub fn new() -> Self { + Self::default() + } + + fn add_cert_path( + &mut self, + key_path: &str, + cert_path: &str, + is_default: bool, + ) -> anyhow::Result<()> { + let priv_key = { + let key_bytes = std::fs::read(key_path) + .with_context(|| format!("Failed to read TLS keys at '{key_path}'"))?; + rustls_pemfile::private_key(&mut &key_bytes[..]) + .with_context(|| format!("Failed to parse TLS keys at '{key_path}'"))? + .with_context(|| format!("Failed to parse TLS keys at '{key_path}'"))? + }; + + let cert_chain_bytes = std::fs::read(cert_path) + .context(format!("Failed to read TLS cert file at '{cert_path}.'"))?; + + let cert_chain = { + rustls_pemfile::certs(&mut &cert_chain_bytes[..]) + .try_collect() + .with_context(|| { + format!("Failed to read TLS certificate chain from bytes from file at '{cert_path}'.") + })? 
+ }; + + self.add_cert(priv_key, cert_chain, is_default) + } + + pub fn add_cert( + &mut self, + priv_key: PrivateKeyDer<'static>, + cert_chain: Vec>, + is_default: bool, + ) -> anyhow::Result<()> { + let key = sign::any_supported_type(&priv_key).context("invalid private key")?; + + let first_cert = &cert_chain[0]; + let tls_server_end_point = TlsServerEndPoint::new(first_cert)?; + let pem = x509_parser::parse_x509_certificate(first_cert) + .context("Failed to parse PEM object from cerficiate")? + .1; + + let common_name = pem.subject().to_string(); + + // We need to get the canonical name for this certificate so we can match them against any domain names + // seen within the proxy codebase. + // + // In scram-proxy we use wildcard certificates only, with the database endpoint as the wildcard subdomain, taken from SNI. + // We need to remove the wildcard prefix for the purposes of certificate selection. + // + // auth-broker does not use SNI and instead uses the Neon-Connection-String header. + // Auth broker has the subdomain `apiauth` we need to remove for the purposes of validating the Neon-Connection-String. + // + // Console Redirect proxy does not use any wildcard domains and does not need any certificate selection or conn string + // validation, so let's we can continue with any common-name + let common_name = if let Some(s) = common_name.strip_prefix("CN=*.") { + s.to_string() + } else if let Some(s) = common_name.strip_prefix("CN=apiauth.") { + s.to_string() + } else if let Some(s) = common_name.strip_prefix("CN=") { + s.to_string() + } else { + bail!("Failed to parse common name from certificate") + }; + + let cert = Arc::new(rustls::sign::CertifiedKey::new(cert_chain, key)); + + if is_default { + self.default = Some((cert.clone(), tls_server_end_point)); + } + + self.certs.insert(common_name, (cert, tls_server_end_point)); + + Ok(()) + } + + pub fn get_common_names(&self) -> HashSet { + self.certs.keys().map(|s| s.to_string()).collect() + } +} + +impl rustls::server::ResolvesServerCert for CertResolver { + fn resolve( + &self, + client_hello: rustls::server::ClientHello<'_>, + ) -> Option> { + self.resolve(client_hello.server_name()).map(|x| x.0) + } +} + +impl CertResolver { + pub fn resolve( + &self, + server_name: Option<&str>, + ) -> Option<(Arc, TlsServerEndPoint)> { + // loop here and cut off more and more subdomains until we find + // a match to get a proper wildcard support. OTOH, we now do not + // use nested domains, so keep this simple for now. + // + // With the current coding foo.com will match *.foo.com and that + // repeats behavior of the old code. + if let Some(mut sni_name) = server_name { + loop { + if let Some(cert) = self.certs.get(sni_name) { + return Some(cert.clone()); + } + if let Some((_, rest)) = sni_name.split_once('.') { + sni_name = rest; + } else { + return None; + } + } + } else { + // No SNI, use the default certificate, otherwise we can't get to + // options parameter which can be used to set endpoint name too. + // That means that non-SNI flow will not work for CNAME domains in + // verify-full mode. + // + // If that will be a problem we can: + // + // a) Instead of multi-cert approach use single cert with extra + // domains listed in Subject Alternative Name (SAN). + // b) Deploy separate proxy instances for extra domains. 
+ self.default.clone() + } + } +} From b3cd883f93e63d9dab6f3ac8e6e8e3f697c14c29 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 2 Jan 2025 14:28:15 +0300 Subject: [PATCH 090/583] Unlock LFC mutex when LFC cache is disabled (#10235) ## Problem See https://github.com/neondatabase/neon/issues/10233 `lfc_containsv` returns with holding lock when LFC was disabled. This bug was introduced in commit 78938d1b59 ## Summary of changes Release lock before return. Co-authored-by: Konstantin Knizhnik --- pgxn/neon/file_cache.c | 1 + 1 file changed, 1 insertion(+) diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index f49415be68..ad5667cbab 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -541,6 +541,7 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, } else { + LWLockRelease(lfc_lock); return found; } From 26600f2973b9d9495889a347cdd21f2c3b7c99e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?JC=20Gr=C3=BCnhage?= Date: Thu, 2 Jan 2025 12:33:42 +0100 Subject: [PATCH 091/583] Skip running clippy without default features (#10098) ## Problem Running clippy with `cargo hack --feature-powerset` in CI isn't particularly fast. This PR follows-up on https://github.com/neondatabase/neon/pull/8912 to improve the speed of our clippy runs. Parallelism as suggested in https://github.com/neondatabase/neon/issues/9901 was tested, but didn't show consistent enough improvements to be worth it. It actually increased the amount of work done, as there's less cache hits when clippy runs are spread out over multiple target directories. Additionally, parallelism makes it so caching needs to be thought about more actively and copying around target directories to enable parallelism eats the rest of the performance gains from parallel execution. After some discussion, the decision was to instead cut down on the number of jobs that are running further. The easiest way to do this is to not run clippy *without* default features. The list of default features is empty for all crates, and I haven't found anything using `cfg(feature = "default")` either, so this is likely not going to change anything except speeding the runs up. ## Summary of changes Reduce the amount of feature combinations tried by `cargo hack` (as suggested in https://github.com/neondatabase/neon/pull/8912#pullrequestreview-2286482368) by never disabling default features. ## Alternatives - We can split things out into different jobs which reduces the time until everything is finished by running more things in parallel. This does however decreases the amount of cache hits and increases the amount of time spent on overhead tasks like repo cloning and restoring caches by doing those multiple times instead of once. - We could replace `cargo hack [...] clippy` with `cargo clippy [...]; cargo clippy --features testing`. I'm not 100% sure how this compares to the change here in the PR, but it does seem to run a bit faster. That likely means it's doing less work, but without understanding what exactly we loose by that I'd rather not do that for now. I'd appreciate input on this though. 
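For illustration only — the sketch below shows the intended effect in plain `cargo` terms, not the exact invocations `cargo hack` expands to, and it assumes a crate whose only extra feature is `testing` and whose `default` feature list is empty (as described above):

```shell
# Before: the feature powerset also covered runs with default features disabled,
# i.e. roughly
#   cargo clippy --no-default-features $CLIPPY_COMMON_ARGS
#   cargo clippy --no-default-features --features testing $CLIPPY_COMMON_ARGS
# on top of the default-enabled combinations.
#
# After: pinning `--features default` (with --ignore-unknown-features so member
# crates that define no `default` feature are not an error) keeps default
# features enabled in every run, leaving roughly
#   cargo clippy $CLIPPY_COMMON_ARGS
#   cargo clippy --features testing $CLIPPY_COMMON_ARGS
cargo hack --features default --ignore-unknown-features --feature-powerset clippy $CLIPPY_COMMON_ARGS
```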
--- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 55c4bf08b9..12b1ac98ac 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -212,7 +212,7 @@ jobs: fi echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV - name: Run cargo clippy (debug) - run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS + run: cargo hack --features default --ignore-unknown-features --feature-powerset clippy $CLIPPY_COMMON_ARGS - name: Check documentation generation run: cargo doc --workspace --no-deps --document-private-items From ee22d4c9ef3d8dc59360e7675b0c36b5e5fd7be8 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Thu, 2 Jan 2025 14:32:24 +0100 Subject: [PATCH 092/583] proxy: Set TCP_NODELAY for compute connections (#10240) neondatabase/cloud#19184 --- proxy/src/compute.rs | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index d60dfd0f80..89de6692ad 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -193,11 +193,15 @@ impl ConnCfg { let connect_once = |host, port| { debug!("trying to connect to compute node at {host}:{port}"); - connect_with_timeout(host, port).and_then(|socket| async { - let socket_addr = socket.peer_addr()?; + connect_with_timeout(host, port).and_then(|stream| async { + let socket_addr = stream.peer_addr()?; + let socket = socket2::SockRef::from(&stream); + // Disable Nagle's algorithm to not introduce latency between + // client and compute. + socket.set_nodelay(true)?; // This prevents load balancer from severing the connection. - socket2::SockRef::from(&socket).set_keepalive(true)?; - Ok((socket_addr, socket)) + socket.set_keepalive(true)?; + Ok((socket_addr, stream)) }) }; From 8c7dcd259846646858b230a05fd3923361d8a717 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 2 Jan 2025 17:14:18 +0300 Subject: [PATCH 093/583] Set heartbeat interval for chaos test (#10222) ## Problem See https://neondb.slack.com/archives/C033RQ5SPDH/p1734707873215729 test_timeline_archival_chaos becomes more flaky with increased heartbeat interval Resolves #10250. 
## Summary of changes Override heatbeat interval for `test_timelirn_archival_chaos.py` --------- Co-authored-by: Konstantin Knizhnik --- test_runner/regress/test_timeline_archive.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index 87579f9e92..9b3a48add9 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -398,6 +398,7 @@ def test_timeline_archival_chaos(neon_env_builder: NeonEnvBuilder): # Offloading is off by default at time of writing: remove this line when it's on by default neon_env_builder.pageserver_config_override = "timeline_offloading = true" + neon_env_builder.storage_controller_config = {"heartbeat_interval": "100msec"} neon_env_builder.enable_pageserver_remote_storage(s3_storage()) # We will exercise migrations, so need multiple pageservers From 1622fd8bda096d1eefb81bf4970ee54542be4595 Mon Sep 17 00:00:00 2001 From: Raphael 'kena' Poss Date: Thu, 2 Jan 2025 17:02:48 +0100 Subject: [PATCH 094/583] proxy: recognize but ignore the 3 new redis message types (#10197) ## Problem https://neondb.slack.com/archives/C085MBDUSS2/p1734604792755369 ## Summary of changes Recognize and ignore the 3 new broadcast messages: - `/block_public_or_vpc_access_updated` - `/allowed_vpc_endpoints_updated_for_org` - `/allowed_vpc_endpoints_updated_for_projects` --- proxy/src/redis/notifications.rs | 56 +++++++++++++++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index 80b93b6c4f..671305a300 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -40,6 +40,27 @@ pub(crate) enum Notification { AllowedIpsUpdate { allowed_ips_update: AllowedIpsUpdate, }, + #[serde( + rename = "/block_public_or_vpc_access_updated", + deserialize_with = "deserialize_json_string" + )] + BlockPublicOrVpcAccessUpdated { + block_public_or_vpc_access_updated: BlockPublicOrVpcAccessUpdated, + }, + #[serde( + rename = "/allowed_vpc_endpoints_updated_for_org", + deserialize_with = "deserialize_json_string" + )] + AllowedVpcEndpointsUpdatedForOrg { + allowed_vpc_endpoints_updated_for_org: AllowedVpcEndpointsUpdatedForOrg, + }, + #[serde( + rename = "/allowed_vpc_endpoints_updated_for_projects", + deserialize_with = "deserialize_json_string" + )] + AllowedVpcEndpointsUpdatedForProjects { + allowed_vpc_endpoints_updated_for_projects: AllowedVpcEndpointsUpdatedForProjects, + }, #[serde( rename = "/password_updated", deserialize_with = "deserialize_json_string" @@ -52,6 +73,24 @@ pub(crate) enum Notification { pub(crate) struct AllowedIpsUpdate { project_id: ProjectIdInt, } + +#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] +pub(crate) struct BlockPublicOrVpcAccessUpdated { + project_id: ProjectIdInt, +} + +#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] +pub(crate) struct AllowedVpcEndpointsUpdatedForOrg { + // TODO: change type once the implementation is more fully fledged. + // See e.g. https://github.com/neondatabase/neon/pull/10073. + account_id: ProjectIdInt, +} + +#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] +pub(crate) struct AllowedVpcEndpointsUpdatedForProjects { + project_ids: Vec, +} + #[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] pub(crate) struct PasswordUpdate { project_id: ProjectIdInt, @@ -165,7 +204,11 @@ impl MessageHandler { } } } - Notification::AllowedIpsUpdate { .. 
} | Notification::PasswordUpdate { .. } => { + Notification::AllowedIpsUpdate { .. } + | Notification::PasswordUpdate { .. } + | Notification::BlockPublicOrVpcAccessUpdated { .. } + | Notification::AllowedVpcEndpointsUpdatedForOrg { .. } + | Notification::AllowedVpcEndpointsUpdatedForProjects { .. } => { invalidate_cache(self.cache.clone(), msg.clone()); if matches!(msg, Notification::AllowedIpsUpdate { .. }) { Metrics::get() @@ -178,6 +221,8 @@ impl MessageHandler { .redis_events_count .inc(RedisEventsCount::PasswordUpdate); } + // TODO: add additional metrics for the other event types. + // It might happen that the invalid entry is on the way to be cached. // To make sure that the entry is invalidated, let's repeat the invalidation in INVALIDATION_LAG seconds. // TODO: include the version (or the timestamp) in the message and invalidate only if the entry is cached before the message. @@ -204,6 +249,15 @@ fn invalidate_cache(cache: Arc, msg: Notification) { password_update.role_name, ), Notification::Cancel(_) => unreachable!("cancel message should be handled separately"), + Notification::BlockPublicOrVpcAccessUpdated { .. } => { + // https://github.com/neondatabase/neon/pull/10073 + } + Notification::AllowedVpcEndpointsUpdatedForOrg { .. } => { + // https://github.com/neondatabase/neon/pull/10073 + } + Notification::AllowedVpcEndpointsUpdatedForProjects { .. } => { + // https://github.com/neondatabase/neon/pull/10073 + } } } From 56e6ebfe172ce0c7fb0faf9a3e64e8e2b3902b37 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Thu, 2 Jan 2025 16:05:14 +0000 Subject: [PATCH 095/583] chore: building compute_tools and local_proxy together (#10257) ## Problem Building local_proxy and compute_tools features the same dependency tree, but as they are currently built in separate clean layers all that progress is wasted. For our arm builds that's an extra 10 minutes. ## Summary of changes Combines the compute_tools and local_proxy build layers. --- compute/compute-node.Dockerfile | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 5e7b4e8287..06aaf9e7f4 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1285,7 +1285,7 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \ ######################################################################################### # -# Compile and run the Neon-specific `compute_ctl` and `fast_import` binaries +# Compile the Neon-specific `compute_ctl`, `fast_import`, and `local_proxy` binaries # ######################################################################################### FROM $REPOSITORY/$IMAGE:$TAG AS compute-tools @@ -1295,7 +1295,7 @@ ENV BUILD_TAG=$BUILD_TAG USER nonroot # Copy entire project to get Cargo.* files with proper dependencies for the whole project COPY --chown=nonroot . . 
-RUN cd compute_tools && mold -run cargo build --locked --profile release-line-debug-size-lto +RUN mold -run cargo build --locked --profile release-line-debug-size-lto --bin compute_ctl --bin fast_import --bin local_proxy ######################################################################################### # @@ -1338,20 +1338,6 @@ RUN set -e \ && make -j $(nproc) dist_man_MANS= \ && make install dist_man_MANS= -######################################################################################### -# -# Compile the Neon-specific `local_proxy` binary -# -######################################################################################### -FROM $REPOSITORY/$IMAGE:$TAG AS local_proxy -ARG BUILD_TAG -ENV BUILD_TAG=$BUILD_TAG - -USER nonroot -# Copy entire project to get Cargo.* files with proper dependencies for the whole project -COPY --chown=nonroot . . -RUN mold -run cargo build --locked --profile release-line-debug-size-lto --bin local_proxy - ######################################################################################### # # Layers "postgres-exporter" and "sql-exporter" @@ -1491,7 +1477,7 @@ COPY --from=pgbouncer /usr/local/pgbouncer/bin/pgbouncer /usr/local/bin/ COPY --chmod=0666 --chown=postgres compute/etc/pgbouncer.ini /etc/pgbouncer.ini # local_proxy and its config -COPY --from=local_proxy --chown=postgres /home/nonroot/target/release-line-debug-size-lto/local_proxy /usr/local/bin/local_proxy +COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/local_proxy /usr/local/bin/local_proxy RUN mkdir -p /etc/local_proxy && chown postgres:postgres /etc/local_proxy # Metrics exporter binaries and configuration files From 363ea97f691cbf9026309c08239300a0d351018d Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Thu, 2 Jan 2025 12:37:50 -0600 Subject: [PATCH 096/583] Add more substantial tests for compute migrations (#9811) The previous tests really didn't do much. This set should be quite a bit more encompassing. 
Signed-off-by: Tristan Partin --- Cargo.lock | 1 + compute_tools/Cargo.toml | 3 +- compute_tools/src/bin/compute_ctl.rs | 5 ++ compute_tools/src/compute.rs | 15 +++- compute_tools/src/http/api.rs | 15 ++++ compute_tools/src/migration.rs | 41 ++++++++- .../tests/0001-neon_superuser_bypass_rls.sql | 9 ++ .../src/migrations/tests/0002-alter_roles.sql | 25 ++++++ ..._create_subscription_to_neon_superuser.sql | 10 +++ ...004-grant_pg_monitor_to_neon_superuser.sql | 19 ++++ ...-grant_all_on_tables_to_neon_superuser.sql | 2 + ...ant_all_on_sequences_to_neon_superuser.sql | 2 + ...es_to_neon_superuser_with_grant_option.sql | 2 + ...es_to_neon_superuser_with_grant_option.sql | 2 + ...plication_for_previously_allowed_roles.sql | 2 + ...ynchronization_funcs_to_neon_superuser.sql | 13 +++ ...cation_origin_status_to_neon_superuser.sql | 13 +++ libs/utils/src/failpoint_support.rs | 4 +- test_runner/conftest.py | 1 + test_runner/fixtures/compute_migrations.py | 34 +++++++ test_runner/fixtures/endpoint/http.py | 14 +++ test_runner/fixtures/neon_cli.py | 5 +- test_runner/fixtures/neon_fixtures.py | 12 ++- test_runner/fixtures/paths.py | 2 +- .../regress/test_compute_migrations.py | 90 +++++++++++++++++++ test_runner/regress/test_migrations.py | 33 ------- 26 files changed, 327 insertions(+), 47 deletions(-) create mode 100644 compute_tools/src/migrations/tests/0001-neon_superuser_bypass_rls.sql create mode 100644 compute_tools/src/migrations/tests/0002-alter_roles.sql create mode 100644 compute_tools/src/migrations/tests/0003-grant_pg_create_subscription_to_neon_superuser.sql create mode 100644 compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_neon_superuser.sql create mode 100644 compute_tools/src/migrations/tests/0005-grant_all_on_tables_to_neon_superuser.sql create mode 100644 compute_tools/src/migrations/tests/0006-grant_all_on_sequences_to_neon_superuser.sql create mode 100644 compute_tools/src/migrations/tests/0007-grant_all_on_tables_to_neon_superuser_with_grant_option.sql create mode 100644 compute_tools/src/migrations/tests/0008-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql create mode 100644 compute_tools/src/migrations/tests/0009-revoke_replication_for_previously_allowed_roles.sql create mode 100644 compute_tools/src/migrations/tests/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql create mode 100644 compute_tools/src/migrations/tests/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql create mode 100644 test_runner/fixtures/compute_migrations.py create mode 100644 test_runner/regress/test_compute_migrations.py delete mode 100644 test_runner/regress/test_migrations.py diff --git a/Cargo.lock b/Cargo.lock index d9ac167042..420def152d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1274,6 +1274,7 @@ dependencies = [ "chrono", "clap", "compute_api", + "fail", "flate2", "futures", "hyper 0.14.30", diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index c0c390caef..9525b27818 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -7,7 +7,7 @@ license.workspace = true [features] default = [] # Enables test specific features. 
-testing = [] +testing = ["fail/failpoints"] [dependencies] base64.workspace = true @@ -19,6 +19,7 @@ camino.workspace = true chrono.workspace = true cfg-if.workspace = true clap.workspace = true +fail.workspace = true flate2.workspace = true futures.workspace = true hyper0 = { workspace = true, features = ["full"] } diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index bb248734a8..95ade9a87d 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -67,12 +67,15 @@ use compute_tools::params::*; use compute_tools::spec::*; use compute_tools::swap::resize_swap; use rlimit::{setrlimit, Resource}; +use utils::failpoint_support; // this is an arbitrary build tag. Fine as a default / for testing purposes // in-case of not-set environment var const BUILD_TAG_DEFAULT: &str = "latest"; fn main() -> Result<()> { + let scenario = failpoint_support::init(); + let (build_tag, clap_args) = init()?; // enable core dumping for all child processes @@ -100,6 +103,8 @@ fn main() -> Result<()> { maybe_delay_exit(delay_exit); + scenario.teardown(); + deinit_and_exit(wait_pg_result); } diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index d72a04f2f9..78f6033429 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -1181,8 +1181,19 @@ impl ComputeNode { let mut conf = postgres::config::Config::from(conf); conf.application_name("compute_ctl:migrations"); - let mut client = conf.connect(NoTls)?; - handle_migrations(&mut client).context("apply_config handle_migrations") + match conf.connect(NoTls) { + Ok(mut client) => { + if let Err(e) = handle_migrations(&mut client) { + error!("Failed to run migrations: {}", e); + } + } + Err(e) => { + error!( + "Failed to connect to the compute for running migrations: {}", + e + ); + } + }; }); Ok::<(), anyhow::Error>(()) diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index 7fa6426d8f..a4b1a63e6d 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -24,8 +24,11 @@ use metrics::proto::MetricFamily; use metrics::Encoder; use metrics::TextEncoder; use tokio::task; +use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, warn}; use tracing_utils::http::OtelName; +use utils::failpoint_support::failpoints_handler; +use utils::http::error::ApiError; use utils::http::request::must_get_query_param; fn status_response_from_state(state: &ComputeState) -> ComputeStatusResponse { @@ -310,6 +313,18 @@ async fn routes(req: Request, compute: &Arc) -> Response { + match failpoints_handler(req, CancellationToken::new()).await { + Ok(r) => r, + Err(ApiError::BadRequest(e)) => { + render_json_error(&e.to_string(), StatusCode::BAD_REQUEST) + } + Err(_) => { + render_json_error("Internal server error", StatusCode::INTERNAL_SERVER_ERROR) + } + } + } + // download extension files from remote extension storage on demand (&Method::POST, route) if route.starts_with("/extension_server/") => { info!("serving {:?} POST request", route); diff --git a/compute_tools/src/migration.rs b/compute_tools/src/migration.rs index 22ab145eda..07d738abe9 100644 --- a/compute_tools/src/migration.rs +++ b/compute_tools/src/migration.rs @@ -1,13 +1,16 @@ use anyhow::{Context, Result}; +use fail::fail_point; use postgres::Client; use tracing::info; +/// Runs a series of migrations on a target database pub(crate) struct MigrationRunner<'m> { client: &'m mut Client, migrations: &'m [&'m str], } impl<'m> MigrationRunner<'m> 
{ + /// Create a new migration runner pub fn new(client: &'m mut Client, migrations: &'m [&'m str]) -> Self { // The neon_migration.migration_id::id column is a bigint, which is equivalent to an i64 assert!(migrations.len() + 1 < i64::MAX as usize); @@ -15,6 +18,7 @@ impl<'m> MigrationRunner<'m> { Self { client, migrations } } + /// Get the current value neon_migration.migration_id fn get_migration_id(&mut self) -> Result { let query = "SELECT id FROM neon_migration.migration_id"; let row = self @@ -25,9 +29,34 @@ impl<'m> MigrationRunner<'m> { Ok(row.get::<&str, i64>("id")) } + /// Update the neon_migration.migration_id value + /// + /// This function has a fail point called compute-migration, which can be + /// used if you would like to fail the application of a series of migrations + /// at some point. fn update_migration_id(&mut self, migration_id: i64) -> Result<()> { let setval = format!("UPDATE neon_migration.migration_id SET id={}", migration_id); + // We use this fail point in order to check that failing in the + // middle of applying a series of migrations fails in an expected + // manner + if cfg!(feature = "testing") { + let fail = (|| { + fail_point!("compute-migration", |fail_migration_id| { + migration_id == fail_migration_id.unwrap().parse::().unwrap() + }); + + false + })(); + + if fail { + return Err(anyhow::anyhow!(format!( + "migration {} was configured to fail because of a failpoint", + migration_id + ))); + } + } + self.client .simple_query(&setval) .context("run_migrations update id")?; @@ -35,7 +64,8 @@ impl<'m> MigrationRunner<'m> { Ok(()) } - fn prepare_migrations(&mut self) -> Result<()> { + /// Prepare the migrations the target database for handling migrations + fn prepare_database(&mut self) -> Result<()> { let query = "CREATE SCHEMA IF NOT EXISTS neon_migration"; self.client.simple_query(query)?; @@ -54,8 +84,9 @@ impl<'m> MigrationRunner<'m> { Ok(()) } + /// Run the configrured set of migrations pub fn run_migrations(mut self) -> Result<()> { - self.prepare_migrations()?; + self.prepare_database()?; let mut current_migration = self.get_migration_id()? as usize; while current_migration < self.migrations.len() { @@ -69,6 +100,11 @@ impl<'m> MigrationRunner<'m> { if migration.starts_with("-- SKIP") { info!("Skipping migration id={}", migration_id!(current_migration)); + + // Even though we are skipping the migration, updating the + // migration ID should help keep logic easy to understand when + // trying to understand the state of a cluster. 
+ self.update_migration_id(migration_id!(current_migration))?; } else { info!( "Running migration id={}:\n{}\n", @@ -87,7 +123,6 @@ impl<'m> MigrationRunner<'m> { ) })?; - // Migration IDs start at 1 self.update_migration_id(migration_id!(current_migration))?; self.client diff --git a/compute_tools/src/migrations/tests/0001-neon_superuser_bypass_rls.sql b/compute_tools/src/migrations/tests/0001-neon_superuser_bypass_rls.sql new file mode 100644 index 0000000000..0c81cef1c4 --- /dev/null +++ b/compute_tools/src/migrations/tests/0001-neon_superuser_bypass_rls.sql @@ -0,0 +1,9 @@ +DO $$ +DECLARE + bypassrls boolean; +BEGIN + SELECT rolbypassrls INTO bypassrls FROM pg_roles WHERE rolname = 'neon_superuser'; + IF NOT bypassrls THEN + RAISE EXCEPTION 'neon_superuser cannot bypass RLS'; + END IF; +END $$; diff --git a/compute_tools/src/migrations/tests/0002-alter_roles.sql b/compute_tools/src/migrations/tests/0002-alter_roles.sql new file mode 100644 index 0000000000..433f7b34f7 --- /dev/null +++ b/compute_tools/src/migrations/tests/0002-alter_roles.sql @@ -0,0 +1,25 @@ +DO $$ +DECLARE + role record; +BEGIN + FOR role IN + SELECT rolname AS name, rolinherit AS inherit + FROM pg_roles + WHERE pg_has_role(rolname, 'neon_superuser', 'member') + LOOP + IF NOT role.inherit THEN + RAISE EXCEPTION '% cannot inherit', quote_ident(role.name); + END IF; + END LOOP; + + FOR role IN + SELECT rolname AS name, rolbypassrls AS bypassrls + FROM pg_roles + WHERE NOT pg_has_role(rolname, 'neon_superuser', 'member') + AND NOT starts_with(rolname, 'pg_') + LOOP + IF role.bypassrls THEN + RAISE EXCEPTION '% can bypass RLS', quote_ident(role.name); + END IF; + END LOOP; +END $$; diff --git a/compute_tools/src/migrations/tests/0003-grant_pg_create_subscription_to_neon_superuser.sql b/compute_tools/src/migrations/tests/0003-grant_pg_create_subscription_to_neon_superuser.sql new file mode 100644 index 0000000000..b164d61295 --- /dev/null +++ b/compute_tools/src/migrations/tests/0003-grant_pg_create_subscription_to_neon_superuser.sql @@ -0,0 +1,10 @@ +DO $$ +BEGIN + IF (SELECT current_setting('server_version_num')::numeric < 160000) THEN + RETURN; + END IF; + + IF NOT (SELECT pg_has_role('neon_superuser', 'pg_create_subscription', 'member')) THEN + RAISE EXCEPTION 'neon_superuser cannot execute pg_create_subscription'; + END IF; +END $$; diff --git a/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_neon_superuser.sql b/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_neon_superuser.sql new file mode 100644 index 0000000000..acb8dd417d --- /dev/null +++ b/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_neon_superuser.sql @@ -0,0 +1,19 @@ +DO $$ +DECLARE + monitor record; +BEGIN + SELECT pg_has_role('neon_superuser', 'pg_monitor', 'member') AS member, + admin_option AS admin + INTO monitor + FROM pg_auth_members + WHERE roleid = 'pg_monitor'::regrole + AND member = 'pg_monitor'::regrole; + + IF NOT monitor.member THEN + RAISE EXCEPTION 'neon_superuser is not a member of pg_monitor'; + END IF; + + IF NOT monitor.admin THEN + RAISE EXCEPTION 'neon_superuser cannot grant pg_monitor'; + END IF; +END $$; diff --git a/compute_tools/src/migrations/tests/0005-grant_all_on_tables_to_neon_superuser.sql b/compute_tools/src/migrations/tests/0005-grant_all_on_tables_to_neon_superuser.sql new file mode 100644 index 0000000000..f99101bd65 --- /dev/null +++ b/compute_tools/src/migrations/tests/0005-grant_all_on_tables_to_neon_superuser.sql @@ -0,0 +1,2 @@ +-- This test was never written becuase at the time 
migration tests were added +-- the accompanying migration was already skipped. diff --git a/compute_tools/src/migrations/tests/0006-grant_all_on_sequences_to_neon_superuser.sql b/compute_tools/src/migrations/tests/0006-grant_all_on_sequences_to_neon_superuser.sql new file mode 100644 index 0000000000..f99101bd65 --- /dev/null +++ b/compute_tools/src/migrations/tests/0006-grant_all_on_sequences_to_neon_superuser.sql @@ -0,0 +1,2 @@ +-- This test was never written becuase at the time migration tests were added +-- the accompanying migration was already skipped. diff --git a/compute_tools/src/migrations/tests/0007-grant_all_on_tables_to_neon_superuser_with_grant_option.sql b/compute_tools/src/migrations/tests/0007-grant_all_on_tables_to_neon_superuser_with_grant_option.sql new file mode 100644 index 0000000000..f99101bd65 --- /dev/null +++ b/compute_tools/src/migrations/tests/0007-grant_all_on_tables_to_neon_superuser_with_grant_option.sql @@ -0,0 +1,2 @@ +-- This test was never written becuase at the time migration tests were added +-- the accompanying migration was already skipped. diff --git a/compute_tools/src/migrations/tests/0008-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql b/compute_tools/src/migrations/tests/0008-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql new file mode 100644 index 0000000000..f99101bd65 --- /dev/null +++ b/compute_tools/src/migrations/tests/0008-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql @@ -0,0 +1,2 @@ +-- This test was never written becuase at the time migration tests were added +-- the accompanying migration was already skipped. diff --git a/compute_tools/src/migrations/tests/0009-revoke_replication_for_previously_allowed_roles.sql b/compute_tools/src/migrations/tests/0009-revoke_replication_for_previously_allowed_roles.sql new file mode 100644 index 0000000000..f99101bd65 --- /dev/null +++ b/compute_tools/src/migrations/tests/0009-revoke_replication_for_previously_allowed_roles.sql @@ -0,0 +1,2 @@ +-- This test was never written becuase at the time migration tests were added +-- the accompanying migration was already skipped. 
diff --git a/compute_tools/src/migrations/tests/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql b/compute_tools/src/migrations/tests/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql new file mode 100644 index 0000000000..af7f50e95d --- /dev/null +++ b/compute_tools/src/migrations/tests/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql @@ -0,0 +1,13 @@ +DO $$ +DECLARE + can_execute boolean; +BEGIN + SELECT bool_and(has_function_privilege('neon_superuser', oid, 'execute')) + INTO can_execute + FROM pg_proc + WHERE proname IN ('pg_export_snapshot', 'pg_log_standby_snapshot') + AND pronamespace = 'pg_catalog'::regnamespace; + IF NOT can_execute THEN + RAISE EXCEPTION 'neon_superuser cannot execute both pg_export_snapshot and pg_log_standby_snapshot'; + END IF; +END $$; diff --git a/compute_tools/src/migrations/tests/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql b/compute_tools/src/migrations/tests/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql new file mode 100644 index 0000000000..e55dcdc3b6 --- /dev/null +++ b/compute_tools/src/migrations/tests/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql @@ -0,0 +1,13 @@ +DO $$ +DECLARE + can_execute boolean; +BEGIN + SELECT has_function_privilege('neon_superuser', oid, 'execute') + INTO can_execute + FROM pg_proc + WHERE proname = 'pg_show_replication_origin_status' + AND pronamespace = 'pg_catalog'::regnamespace; + IF NOT can_execute THEN + RAISE EXCEPTION 'neon_superuser cannot execute pg_show_replication_origin_status'; + END IF; +END $$; diff --git a/libs/utils/src/failpoint_support.rs b/libs/utils/src/failpoint_support.rs index 870684b399..701ba2d42c 100644 --- a/libs/utils/src/failpoint_support.rs +++ b/libs/utils/src/failpoint_support.rs @@ -9,7 +9,7 @@ use serde::{Deserialize, Serialize}; use tokio_util::sync::CancellationToken; use tracing::*; -/// Declare a failpoint that can use the `pause` failpoint action. +/// Declare a failpoint that can use to `pause` failpoint action. /// We don't want to block the executor thread, hence, spawn_blocking + await. #[macro_export] macro_rules! 
pausable_failpoint { @@ -181,7 +181,7 @@ pub async fn failpoints_handler( ) -> Result, ApiError> { if !fail::has_failpoints() { return Err(ApiError::BadRequest(anyhow::anyhow!( - "Cannot manage failpoints because storage was compiled without failpoints support" + "Cannot manage failpoints because neon was compiled without failpoints support" ))); } diff --git a/test_runner/conftest.py b/test_runner/conftest.py index 887bfef478..9e32469d69 100644 --- a/test_runner/conftest.py +++ b/test_runner/conftest.py @@ -8,6 +8,7 @@ pytest_plugins = ( "fixtures.compute_reconfigure", "fixtures.storage_controller_proxy", "fixtures.paths", + "fixtures.compute_migrations", "fixtures.neon_fixtures", "fixtures.benchmark_fixture", "fixtures.pg_stats", diff --git a/test_runner/fixtures/compute_migrations.py b/test_runner/fixtures/compute_migrations.py new file mode 100644 index 0000000000..ea99785af0 --- /dev/null +++ b/test_runner/fixtures/compute_migrations.py @@ -0,0 +1,34 @@ +from __future__ import annotations + +import os +from typing import TYPE_CHECKING + +import pytest + +from fixtures.paths import BASE_DIR + +if TYPE_CHECKING: + from collections.abc import Iterator + from pathlib import Path + +COMPUTE_MIGRATIONS_DIR = BASE_DIR / "compute_tools" / "src" / "migrations" +COMPUTE_MIGRATIONS_TEST_DIR = COMPUTE_MIGRATIONS_DIR / "tests" + +COMPUTE_MIGRATIONS = sorted(next(os.walk(COMPUTE_MIGRATIONS_DIR))[2]) +NUM_COMPUTE_MIGRATIONS = len(COMPUTE_MIGRATIONS) + + +@pytest.fixture(scope="session") +def compute_migrations_dir() -> Iterator[Path]: + """ + Retrieve the path to the compute migrations directory. + """ + yield COMPUTE_MIGRATIONS_DIR + + +@pytest.fixture(scope="session") +def compute_migrations_test_dir() -> Iterator[Path]: + """ + Retrieve the path to the compute migrations test directory. 
+ """ + yield COMPUTE_MIGRATIONS_TEST_DIR diff --git a/test_runner/fixtures/endpoint/http.py b/test_runner/fixtures/endpoint/http.py index 1cd9158c68..aa0d95fe80 100644 --- a/test_runner/fixtures/endpoint/http.py +++ b/test_runner/fixtures/endpoint/http.py @@ -55,3 +55,17 @@ class EndpointHttpClient(requests.Session): res = self.get(f"http://localhost:{self.port}/metrics") res.raise_for_status() return res.text + + def configure_failpoints(self, *args: tuple[str, str]) -> None: + body: list[dict[str, str]] = [] + + for fp in args: + body.append( + { + "name": fp[0], + "action": fp[1], + } + ) + + res = self.post(f"http://localhost:{self.port}/failpoints", json=body) + res.raise_for_status() diff --git a/test_runner/fixtures/neon_cli.py b/test_runner/fixtures/neon_cli.py index a85a191455..adbd6414a7 100644 --- a/test_runner/fixtures/neon_cli.py +++ b/test_runner/fixtures/neon_cli.py @@ -522,14 +522,15 @@ class NeonLocalCli(AbstractNeonCli): safekeepers: list[int] | None = None, remote_ext_config: str | None = None, pageserver_id: int | None = None, - allow_multiple=False, + allow_multiple: bool = False, basebackup_request_tries: int | None = None, + env: dict[str, str] | None = None, ) -> subprocess.CompletedProcess[str]: args = [ "endpoint", "start", ] - extra_env_vars = {} + extra_env_vars = env or {} if basebackup_request_tries is not None: extra_env_vars["NEON_COMPUTE_TESTING_BASEBACKUP_TRIES"] = str(basebackup_request_tries) if remote_ext_config is not None: diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 9f78ad120b..a0c642163d 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -54,6 +54,7 @@ from fixtures.common_types import ( TimelineArchivalState, TimelineId, ) +from fixtures.compute_migrations import NUM_COMPUTE_MIGRATIONS from fixtures.endpoint.http import EndpointHttpClient from fixtures.h2server import H2Server from fixtures.log_helper import log @@ -3855,6 +3856,7 @@ class Endpoint(PgProtocol, LogUtils): safekeepers: list[int] | None = None, allow_multiple: bool = False, basebackup_request_tries: int | None = None, + env: dict[str, str] | None = None, ) -> Self: """ Start the Postgres instance. @@ -3875,6 +3877,7 @@ class Endpoint(PgProtocol, LogUtils): pageserver_id=pageserver_id, allow_multiple=allow_multiple, basebackup_request_tries=basebackup_request_tries, + env=env, ) self._running.release(1) self.log_config_value("shared_buffers") @@ -3988,14 +3991,17 @@ class Endpoint(PgProtocol, LogUtils): log.info("Updating compute spec to: %s", json.dumps(data_dict, indent=4)) json.dump(data_dict, file, indent=4) - # Please note: Migrations only run if pg_skip_catalog_updates is false - def wait_for_migrations(self, num_migrations: int = 11): + def wait_for_migrations(self, wait_for: int = NUM_COMPUTE_MIGRATIONS) -> None: + """ + Wait for all compute migrations to be ran. Remember that migrations only + run if "pg_skip_catalog_updates" is set in the compute spec to false. 
+ """ with self.cursor() as cur: def check_migrations_done(): cur.execute("SELECT id FROM neon_migration.migration_id") migration_id: int = cur.fetchall()[0][0] - assert migration_id >= num_migrations + assert migration_id >= wait_for wait_until(check_migrations_done) diff --git a/test_runner/fixtures/paths.py b/test_runner/fixtures/paths.py index 80777d65e9..fc4fb3629b 100644 --- a/test_runner/fixtures/paths.py +++ b/test_runner/fixtures/paths.py @@ -21,8 +21,8 @@ if TYPE_CHECKING: BASE_DIR = Path(__file__).parents[2] -COMPUTE_CONFIG_DIR = BASE_DIR / "compute" / "etc" DEFAULT_OUTPUT_DIR: str = "test_output" +COMPUTE_CONFIG_DIR = BASE_DIR / "compute" / "etc" def get_test_dir(request: FixtureRequest, top_output_dir: Path, prefix: str | None = None) -> Path: diff --git a/test_runner/regress/test_compute_migrations.py b/test_runner/regress/test_compute_migrations.py new file mode 100644 index 0000000000..803702a6f8 --- /dev/null +++ b/test_runner/regress/test_compute_migrations.py @@ -0,0 +1,90 @@ +from __future__ import annotations + +from pathlib import Path +from typing import TYPE_CHECKING, cast + +import pytest +from fixtures.compute_migrations import COMPUTE_MIGRATIONS, NUM_COMPUTE_MIGRATIONS + +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnv + + +def test_compute_migrations_retry(neon_simple_env: NeonEnv, compute_migrations_dir: Path): + """ + Test that compute_ctl can recover from migration failures next time it + starts, and that the persisted migration ID is correct in such cases. + """ + env = neon_simple_env + + endpoint = env.endpoints.create("main") + endpoint.respec(skip_pg_catalog_updates=False) + + for i in range(1, NUM_COMPUTE_MIGRATIONS + 1): + endpoint.start(env={"FAILPOINTS": f"compute-migration=return({i})"}) + + # Make sure that the migrations ran + endpoint.wait_for_migrations(wait_for=i - 1) + + # Confirm that we correctly recorded that in the + # neon_migration.migration_id table + with endpoint.cursor() as cur: + cur.execute("SELECT id FROM neon_migration.migration_id") + migration_id = cast("int", cur.fetchall()[0][0]) + assert migration_id == i - 1 + + endpoint.stop() + + endpoint.start() + + # Now wait for the rest of the migrations + endpoint.wait_for_migrations() + + with endpoint.cursor() as cur: + cur.execute("SELECT id FROM neon_migration.migration_id") + migration_id = cast("int", cur.fetchall()[0][0]) + assert migration_id == NUM_COMPUTE_MIGRATIONS + + for i, m in enumerate(COMPUTE_MIGRATIONS, start=1): + migration_query = (compute_migrations_dir / m).read_text(encoding="utf-8") + if not migration_query.startswith("-- SKIP"): + pattern = rf"Skipping migration id={i}" + else: + pattern = rf"Running migration id={i}" + + endpoint.log_contains(pattern) + + +@pytest.mark.parametrize( + "migration", + (pytest.param((i, m), id=str(i)) for i, m in enumerate(COMPUTE_MIGRATIONS, start=1)), +) +def test_compute_migrations_e2e( + neon_simple_env: NeonEnv, + compute_migrations_dir: Path, + compute_migrations_test_dir: Path, + migration: tuple[int, str], +): + """ + Test that the migrations perform as advertised. 
+ """ + env = neon_simple_env + + migration_id = migration[0] + migration_filename = migration[1] + + migration_query = (compute_migrations_dir / migration_filename).read_text(encoding="utf-8") + if migration_query.startswith("-- SKIP"): + pytest.skip("The migration is marked as SKIP") + + endpoint = env.endpoints.create("main") + endpoint.respec(skip_pg_catalog_updates=False) + + # Stop applying migrations after the one we want to test, so that we can + # test the state of the cluster at the given migration ID + endpoint.start(env={"FAILPOINTS": f"compute-migration=return({migration_id + 1})"}) + + endpoint.wait_for_migrations(wait_for=migration_id) + + check_query = (compute_migrations_test_dir / migration_filename).read_text(encoding="utf-8") + endpoint.safe_psql(check_query) diff --git a/test_runner/regress/test_migrations.py b/test_runner/regress/test_migrations.py deleted file mode 100644 index 7211619a99..0000000000 --- a/test_runner/regress/test_migrations.py +++ /dev/null @@ -1,33 +0,0 @@ -from __future__ import annotations - -import time -from typing import TYPE_CHECKING - -if TYPE_CHECKING: - from fixtures.neon_fixtures import NeonEnv - - -def test_migrations(neon_simple_env: NeonEnv): - env = neon_simple_env - - endpoint = env.endpoints.create("main") - endpoint.respec(skip_pg_catalog_updates=False) - endpoint.start() - - num_migrations = 11 - endpoint.wait_for_migrations(num_migrations=num_migrations) - - with endpoint.cursor() as cur: - cur.execute("SELECT id FROM neon_migration.migration_id") - migration_id = cur.fetchall() - assert migration_id[0][0] == num_migrations - - endpoint.stop() - endpoint.start() - # We don't have a good way of knowing that the migrations code path finished executing - # in compute_ctl in the case that no migrations are being run - time.sleep(1) - with endpoint.cursor() as cur: - cur.execute("SELECT id FROM neon_migration.migration_id") - migration_id = cur.fetchall() - assert migration_id[0][0] == num_migrations From cd10c719f9c7ec4424a6c30a6419256589c60d2e Mon Sep 17 00:00:00 2001 From: Em Sharnoff Date: Thu, 2 Jan 2025 11:45:59 -0800 Subject: [PATCH 097/583] compute: Add spec support for disabling LFC resizing (#10132) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ref neondatabase/cloud#21731 ## Problem When we manually override the LFC size for particular computes, autoscaling will typically undo that because vm-monitor will resize LFC itself. So, we'd like a way to make vm-monitor not set LFC size — this actually already exists, if we just don't give vm-monitor a postgres connection string. ## Summary of changes Add a new field to the compute spec, `disable_lfc_resizing`. When set to `true`, we pass in `None` for its postgres connection string. That matches the configuration tested in `neondatabase/autoscaling` CI. --- compute_tools/src/bin/compute_ctl.rs | 19 +++++++++++++++---- control_plane/src/endpoint.rs | 1 + libs/compute_api/src/spec.rs | 9 +++++++++ 3 files changed, 25 insertions(+), 4 deletions(-) diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 95ade9a87d..26ae25ec20 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -424,9 +424,13 @@ fn start_postgres( "running compute with features: {:?}", state.pspec.as_ref().unwrap().spec.features ); - // before we release the mutex, fetch the swap size (if any) for later. 
- let swap_size_bytes = state.pspec.as_ref().unwrap().spec.swap_size_bytes; - let disk_quota_bytes = state.pspec.as_ref().unwrap().spec.disk_quota_bytes; + // before we release the mutex, fetch some parameters for later. + let &ComputeSpec { + swap_size_bytes, + disk_quota_bytes, + disable_lfc_resizing, + .. + } = &state.pspec.as_ref().unwrap().spec; drop(state); // Launch remaining service threads @@ -531,11 +535,18 @@ fn start_postgres( // This token is used internally by the monitor to clean up all threads let token = CancellationToken::new(); + // don't pass postgres connection string to vm-monitor if we don't want it to resize LFC + let pgconnstr = if disable_lfc_resizing.unwrap_or(false) { + None + } else { + file_cache_connstr.cloned() + }; + let vm_monitor = rt.as_ref().map(|rt| { rt.spawn(vm_monitor::start( Box::leak(Box::new(vm_monitor::Args { cgroup: cgroup.cloned(), - pgconnstr: file_cache_connstr.cloned(), + pgconnstr, addr: vm_monitor_addr.clone(), })), token.clone(), diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 5ebf842813..5e47ec4811 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -585,6 +585,7 @@ impl Endpoint { features: self.features.clone(), swap_size_bytes: None, disk_quota_bytes: None, + disable_lfc_resizing: None, cluster: Cluster { cluster_id: None, // project ID: not used name: None, // project name: not used diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 6d9c353cda..54d6a1d38f 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -67,6 +67,15 @@ pub struct ComputeSpec { #[serde(default)] pub disk_quota_bytes: Option, + /// Disables the vm-monitor behavior that resizes LFC on upscale/downscale, instead relying on + /// the initial size of LFC. + /// + /// This is intended for use when the LFC size is being overridden from the default but + /// autoscaling is still enabled, and we don't want the vm-monitor to interfere with the custom + /// LFC sizing. + #[serde(default)] + pub disable_lfc_resizing: Option, + /// Expected cluster state at the end of transition process. pub cluster: Cluster, pub delta_operations: Option>, From eefad27538757a760532ccf4ae2da0088dafb47a Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Thu, 2 Jan 2025 16:12:56 -0600 Subject: [PATCH 098/583] Inline various migration queries (#10231) There was no value in saving them off to temporary variables. Signed-off-by: Tristan Partin Signed-off-by: Tristan Partin --- compute_tools/src/migration.rs | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/compute_tools/src/migration.rs b/compute_tools/src/migration.rs index 07d738abe9..1f3de65806 100644 --- a/compute_tools/src/migration.rs +++ b/compute_tools/src/migration.rs @@ -35,8 +35,6 @@ impl<'m> MigrationRunner<'m> { /// used if you would like to fail the application of a series of migrations /// at some point. 
fn update_migration_id(&mut self, migration_id: i64) -> Result<()> { - let setval = format!("UPDATE neon_migration.migration_id SET id={}", migration_id); - // We use this fail point in order to check that failing in the // middle of applying a series of migrations fails in an expected // manner @@ -58,7 +56,10 @@ impl<'m> MigrationRunner<'m> { } self.client - .simple_query(&setval) + .query( + "UPDATE neon_migration.migration_id SET id = $1", + &[&migration_id], + ) .context("run_migrations update id")?; Ok(()) @@ -66,20 +67,16 @@ impl<'m> MigrationRunner<'m> { /// Prepare the migrations the target database for handling migrations fn prepare_database(&mut self) -> Result<()> { - let query = "CREATE SCHEMA IF NOT EXISTS neon_migration"; - self.client.simple_query(query)?; - - let query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)"; - self.client.simple_query(query)?; - - let query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING"; - self.client.simple_query(query)?; - - let query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin"; - self.client.simple_query(query)?; - - let query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC"; - self.client.simple_query(query)?; + self.client + .simple_query("CREATE SCHEMA IF NOT EXISTS neon_migration")?; + self.client.simple_query("CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)")?; + self.client.simple_query( + "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING", + )?; + self.client + .simple_query("ALTER SCHEMA neon_migration OWNER TO cloud_admin")?; + self.client + .simple_query("REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC")?; Ok(()) } From 7a598b9842f47707d9d93dda621094494221c5a6 Mon Sep 17 00:00:00 2001 From: Ivan Efremov Date: Fri, 3 Jan 2025 12:04:58 +0200 Subject: [PATCH 099/583] [proxy/docs]imprv: Add local testing section to proxy README (#10230) Add commands to run proxy locally with the mocked control plane --- proxy/README.md | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/proxy/README.md b/proxy/README.md index 8d850737be..4b98342d72 100644 --- a/proxy/README.md +++ b/proxy/README.md @@ -102,23 +102,39 @@ User can pass several optional headers that will affect resulting json. 2. `Neon-Array-Mode: true`. Return postgres rows as arrays instead of objects. That is more compact representation and also helps in some edge cases where it is hard to use rows represented as objects (e.g. when several fields have the same name). +## Test proxy locally -## Using SNI-based routing on localhost - -Now proxy determines project name from the subdomain, request to the `round-rice-566201.somedomain.tld` will be routed to the project named `round-rice-566201`. Unfortunately, `/etc/hosts` does not support domain wildcards, so I usually use `*.localtest.me` which resolves to `127.0.0.1`. Now we can create self-signed certificate and play with proxy: +Proxy determines project name from the subdomain, request to the `round-rice-566201.somedomain.tld` will be routed to the project named `round-rice-566201`. Unfortunately, `/etc/hosts` does not support domain wildcards, so we can use *.localtest.me` which resolves to `127.0.0.1`. 
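If you want to convince yourself of that wildcard behaviour, any hostname under `localtest.me` should come back as `127.0.0.1`. A tiny standalone check (a sketch only; the endpoint name is the example one from above and the port matches the psql example further down):

```rust
use std::net::ToSocketAddrs;

fn main() -> std::io::Result<()> {
    // Every subdomain of localtest.me resolves to 127.0.0.1, which is what makes
    // SNI-based routing testable locally without touching /etc/hosts.
    for addr in "round-rice-566201.localtest.me:4432".to_socket_addrs()? {
        println!("{addr}");
    }
    Ok(())
}
```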
+Let's create self-signed certificate by running: ```sh openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key -subj "/CN=*.localtest.me" ``` -start proxy - +Then we need to build proxy with 'testing' feature and run, e.g.: ```sh -./target/debug/proxy -c server.crt -k server.key +RUST_LOG=proxy cargo run -p proxy --bin proxy --features testing -- --auth-backend postgres --auth-endpoint 'postgresql://proxy:password@endpoint.localtest.me:5432/postgres' --is-private-access-proxy true -c server.crt -k server.key ``` -and connect to it +We will also need to have a postgres instance. Assuming that we have setted up docker we can set it up as follows: +```sh +docker run \ + --detach \ + --name proxy-postgres \ + --env POSTGRES_PASSWORD=proxy-postgres \ + --publish 5432:5432 \ + postgres:17-bookworm +``` + +Next step is setting up auth table and schema as well as creating role (without the JWT table): +```sh +docker exec -it proxy-postgres psql -U postgres -c "CREATE SCHEMA IF NOT EXISTS neon_control_plane" +docker exec -it proxy-postgres psql -U postgres -c "CREATE TABLE neon_control_plane.endpoints (endpoint_id VARCHAR(255) PRIMARY KEY, allowed_ips VARCHAR(255))" +docker exec -it proxy-postgres psql -U postgres -c "CREATE ROLE proxy WITH SUPERUSER LOGIN PASSWORD 'password';" +``` + +Now from client you can start a new session: ```sh -PGSSLROOTCERT=./server.crt psql 'postgres://my-cluster-42.localtest.me:1234?sslmode=verify-full' -``` +PGSSLROOTCERT=./server.crt psql "postgresql://proxy:password@endpoint.localtest.me:4432/postgres?sslmode=verify-full" +``` \ No newline at end of file From 2d4f267983858c197de795074046a2b1376a8616 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 3 Jan 2025 10:20:18 +0000 Subject: [PATCH 100/583] cargo: update diesel, pq-sys (#10256) ## Problem Versions of `diesel` and `pq-sys` were somewhat stale. I was checking on libpq->openssl versions while investigating a segfault via https://github.com/neondatabase/cloud/issues/21010. I don't think these rust bindings are likely to be the source of issues, but we might as well freshen them as a precaution. 
## Summary of changes - Update diesel to 2.2.6 - Update pq-sys to 0.6.3 --- Cargo.lock | 8 ++++---- storage_controller/Cargo.toml | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 420def152d..9e0e343996 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1733,9 +1733,9 @@ checksum = "ab03c107fafeb3ee9f5925686dbb7a73bc76e3932abb0d2b365cb64b169cf04c" [[package]] name = "diesel" -version = "2.2.3" +version = "2.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "65e13bab2796f412722112327f3e575601a3e9cdcbe426f0d30dbf43f3f5dc71" +checksum = "ccf1bedf64cdb9643204a36dd15b19a6ce8e7aa7f7b105868e9f1fad5ffa7d12" dependencies = [ "bitflags 2.4.1", "byteorder", @@ -4494,9 +4494,9 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" [[package]] name = "pq-sys" -version = "0.4.8" +version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "31c0052426df997c0cbd30789eb44ca097e3541717a7b8fa36b1c464ee7edebd" +checksum = "f6cc05d7ea95200187117196eee9edd0644424911821aeb28a18ce60ea0b8793" dependencies = [ "vcpkg", ] diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index 2f5d266567..5f3319512d 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -43,13 +43,13 @@ scopeguard.workspace = true strum.workspace = true strum_macros.workspace = true -diesel = { version = "2.1.4", features = [ +diesel = { version = "2.2.6", features = [ "serde_json", "postgres", "r2d2", "chrono", ] } -diesel_migrations = { version = "2.1.0" } +diesel_migrations = { version = "2.2.0" } r2d2 = { version = "0.8.10" } utils = { path = "../libs/utils/" } From ba9722a2fd913d30b639ec506c42761cdca44440 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 3 Jan 2025 10:55:07 +0000 Subject: [PATCH 101/583] tests: add upload wait in test_scrubber_physical_gc_ancestors (#10260) ## Problem We see periodic failures in `test_scrubber_physical_gc_ancestors`, where the logs show that the pageserver is creating image layers that should cause child shards to no longer reference their parents' layers, but then the scrubber runs and doesn't find any unreferenced layers.[ https://neon-github-public-dev.s3.amazonaws.com/reports/pr-10256/12582034135/index.html#/testresult/78ea06dea6ba8dd3 From inspecting the code & test, it seems like this could be as simple as the test failing to wait for uploads before running the scrubber. It had a 2 second delay built in to satisfy the scrubbers time threshold checks, which on a lightly loaded machine would also have been easily enough for uploads to complete, but our test machines are more heavily loaded all the time. ## Summary of changes - Wait for uploads to complete after generating images layers in test_scrubber_physical_gc_ancestors, so that the scrubber should reliably see the post-compaction metadata. 
--- test_runner/regress/test_storage_scrubber.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index 198e4f0460..220c428531 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -266,7 +266,9 @@ def test_scrubber_physical_gc_ancestors(neon_env_builder: NeonEnvBuilder, shard_ for shard in shards: ps = env.get_tenant_pageserver(shard) assert ps is not None - ps.http_client().timeline_compact(shard, timeline_id, force_image_layer_creation=True) + ps.http_client().timeline_compact( + shard, timeline_id, force_image_layer_creation=True, wait_until_uploaded=True + ) ps.http_client().timeline_gc(shard, timeline_id, 0) # We will use a min_age_secs=1 threshold for deletion, let it pass From c08759f367063fde2558f979b83f6f9209741c7a Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 3 Jan 2025 10:55:15 +0000 Subject: [PATCH 102/583] storcon: verbose logs in rare case of shards not attached yet (#10262) ## Problem When we do a timeline CRUD operation, we check that the shards we need to mutate are currently attached to a pageserver, by reading `generation` and `generation_pageserver` from the database. If any don't appear to be attached, we respond with a a 503 and "One or more shards in tenant is not yet attached". This is happening more often than expected, and it's not obvious with current logging what's going on: specifically which shard has a problem, and exactly what we're seeing in these persistent generation columns. (Aside: it's possible that we broke something with the change in #10011 which clears generation_pageserver when we detach a shard, although if so the mechanism isn't trivial: what should happen is that if we stamp on generation_pageserver if a reconciler is running, then it shouldn't matter because we're about to ## Summary of changes - When we are in Attached mode but find that generation_pageserver/generation are unset, output details while looping over shards. --- storage_controller/src/service.rs | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index c0c5bc371a..222cb9fdd4 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -3572,6 +3572,11 @@ impl Service { .iter() .any(|i| i.generation.is_none() || i.generation_pageserver.is_none()) { + let shard_generations = generations + .into_iter() + .map(|i| (i.tenant_shard_id, (i.generation, i.generation_pageserver))) + .collect::>(); + // One or more shards has not been attached to a pageserver. Check if this is because it's configured // to be detached (409: caller should give up), or because it's meant to be attached but isn't yet (503: caller should retry) let locked = self.inner.read().unwrap(); @@ -3582,6 +3587,28 @@ impl Service { PlacementPolicy::Attached(_) => { // This shard is meant to be attached: the caller is not wrong to try and // use this function, but we can't service the request right now. + let Some(generation) = shard_generations.get(shard_id) else { + // This can only happen if there is a split brain controller modifying the database. This should + // never happen when testing, and if it happens in production we can only log the issue. + debug_assert!(false); + tracing::error!("Shard {shard_id} not found in generation state! 
Is another rogue controller running?"); + continue; + }; + let (generation, generation_pageserver) = generation; + if let Some(generation) = generation { + if generation_pageserver.is_none() { + // This is legitimate only in a very narrow window where the shard was only just configured into + // Attached mode after being created in Secondary or Detached mode, and it has had its generation + // set but not yet had a Reconciler run (reconciler is the only thing that sets generation_pageserver). + tracing::warn!("Shard {shard_id} generation is set ({generation:?}) but generation_pageserver is None, reconciler not run yet?"); + } + } else { + // This should never happen: a shard with no generation is only permitted when it was created in some state + // other than PlacementPolicy::Attached (and generation is always written to DB before setting Attached in memory) + debug_assert!(false); + tracing::error!("Shard {shard_id} generation is None, but it is in PlacementPolicy::Attached mode!"); + continue; + } } PlacementPolicy::Secondary | PlacementPolicy::Detached => { return Err(ApiError::Conflict(format!( From 1303cd5d05062c95660ec00f8846b4258fc62b4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 3 Jan 2025 13:36:01 +0100 Subject: [PATCH 103/583] Fix defusing race between Tenant::shutdown and offload_timeline (#10150) There is a race condition between `Tenant::shutdown`'s `defuse_for_drop` loop and `offload_timeline`, where timeline offloading can insert into a tenant that is in the process of shutting down, in fact so far progressed that the `defuse_for_drop` has already been called. This prevents warn log lines of the form: ``` offloaded timeline was dropped without having cleaned it up at the ancestor ``` The solution piggybacks on the `offloaded_timelines` lock: both the defuse loop and the offloaded timeline insertion need to acquire the lock, and we know that the defuse loop only runs after the tenant has set its `TenantState` to `Stopping`. So if we hold the `offloaded_timelines` lock, and know that the `TenantState` is not `Stopping`, then we know that the defuse loop has not ran yet, and holding the lock ensures that it doesn't start running while we are inserting the offloaded timeline. Fixes #10070 --- pageserver/src/tenant/timeline/offload.rs | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pageserver/src/tenant/timeline/offload.rs b/pageserver/src/tenant/timeline/offload.rs index 3bfbfb5061..15628a9645 100644 --- a/pageserver/src/tenant/timeline/offload.rs +++ b/pageserver/src/tenant/timeline/offload.rs @@ -1,5 +1,7 @@ use std::sync::Arc; +use pageserver_api::models::TenantState; + use super::delete::{delete_local_timeline_directory, DeleteTimelineFlow, DeletionGuard}; use super::Timeline; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; @@ -70,6 +72,15 @@ pub(crate) async fn offload_timeline( { let mut offloaded_timelines = tenant.timelines_offloaded.lock().unwrap(); + if matches!( + tenant.current_state(), + TenantState::Stopping { .. } | TenantState::Broken { .. } + ) { + // Cancel the operation if the tenant is shutting down. 
Do this while the + // timelines_offloaded lock is held to prevent a race with Tenant::shutdown + // for defusing the lock + return Err(OffloadError::Cancelled); + } offloaded_timelines.insert( timeline.timeline_id, Arc::new( From e9d30edc7f7f5fab66f7bdffcca9e8c5bb9b8d27 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 3 Jan 2025 13:13:22 +0000 Subject: [PATCH 104/583] pageserver: fix a 500 during timeline creation + shutdown (#10259) ## Problem The test_create_churn_during_restart test fails if timeline creation calls return 500 errors (because the API shouldn't do it), and it's sometimes failing, for example: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-10256/12582034135/index.html#/testresult/3ce2e7045465012e ## Summary of changes - Avoid handling UploadQueueShutDownOrStopped case as an Other (i.e. 500) --- pageserver/src/tenant.rs | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 2e4c47c6e4..90017b25f2 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -2604,9 +2604,15 @@ impl Tenant { WaitCompletionError::NotInitialized( e, // If the queue is already stopped, it's a shutdown error. ) if e.is_stopping() => CreateTimelineError::ShuttingDown, - e => CreateTimelineError::Other(e.into()), - }) - .context("wait for timeline initial uploads to complete")?; + WaitCompletionError::NotInitialized(_) => { + // This is a bug: we should never try to wait for uploads before initializing the timeline + debug_assert!(false); + CreateTimelineError::Other(anyhow::anyhow!("timeline not initialized")) + } + WaitCompletionError::UploadQueueShutDownOrStopped => { + CreateTimelineError::ShuttingDown + } + })?; // The creating task is responsible for activating the timeline. // We do this after `wait_completion()` so that we don't spin up tasks that start From b33299dc37d9269fe55bd3256b7a4a72c129b81c Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Fri, 3 Jan 2025 16:21:31 +0100 Subject: [PATCH 105/583] pageserver,safekeeper: disable heap profiling (#10268) ## Problem Since enabling continuous profiling in staging, we've seen frequent seg faults. This is suspected to be because jemalloc and pprof-rs take a stack trace at the same time, and the handlers aren't signal safe. jemalloc does this probabilistically on every allocation, regardless of whether someone is taking a heap profile, which means that any CPU profile has a chance to cause a seg fault. Touches #10225. ## Summary of changes For now, just disable heap profiles -- CPU profiles are more important, and we need to be able to take them without risking a crash. --- pageserver/src/bin/pageserver.rs | 10 ++++++---- safekeeper/src/bin/safekeeper.rs | 10 ++++++---- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 567a69da3b..b92ff4ebf9 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -53,10 +53,12 @@ project_build_tag!(BUILD_TAG); #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; -/// Configure jemalloc to sample allocations for profiles every 1 MB (1 << 20). -#[allow(non_upper_case_globals)] -#[export_name = "malloc_conf"] -pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0"; +// Configure jemalloc to sample allocations for profiles every 1 MB (1 << 20). +// TODO: disabled because concurrent CPU profiles cause seg faults. 
See: +// https://github.com/neondatabase/neon/issues/10225. +//#[allow(non_upper_case_globals)] +//#[export_name = "malloc_conf"] +//pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0"; const PID_FILE_NAME: &str = "pageserver.pid"; diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 13f6e34575..e0ba38d638 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -51,10 +51,12 @@ use utils::{ #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; -/// Configure jemalloc to sample allocations for profiles every 1 MB (1 << 20). -#[allow(non_upper_case_globals)] -#[export_name = "malloc_conf"] -pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0"; +// Configure jemalloc to sample allocations for profiles every 1 MB (1 << 20). +// TODO: disabled because concurrent CPU profiles cause seg faults. See: +// https://github.com/neondatabase/neon/issues/10225. +//#[allow(non_upper_case_globals)] +//#[export_name = "malloc_conf"] +//pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0"; const PID_FILE_NAME: &str = "safekeeper.pid"; const ID_FILE_NAME: &str = "safekeeper.id"; From 1393cc668bce904cf5300f8829addce86437e755 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Fri, 3 Jan 2025 16:38:51 +0100 Subject: [PATCH 106/583] Revert "pageserver: revert flush backpressure (#8550) (#10135)" (#10270) This reverts commit f3ecd5d76ad8b858b2bfaaabba5018046aca46ac. It is [suspected](https://neondb.slack.com/archives/C033RQ5SPDH/p1735907405716759) to have caused significant read amplification in the [ingest benchmark](https://neonprod.grafana.net/d/de3mupf4g68e8e/perf-test3a-ingest-benchmark?orgId=1&from=now-30d&to=now&timezone=utc&var-new_project_endpoint_id=ep-solitary-sun-w22bmut6&var-large_tenant_endpoint_id=ep-holy-bread-w203krzs) (specifically during index creation). We will revisit an intermediate improvement here to unblock [upload parallelism](https://github.com/neondatabase/neon/issues/10096) before properly addressing [compaction backpressure](https://github.com/neondatabase/neon/issues/8390). 
--- pageserver/src/metrics.rs | 25 ++++++++++- pageserver/src/tenant/timeline.rs | 38 +++++++++++++---- test_runner/fixtures/metrics.py | 1 + test_runner/regress/test_branching.py | 13 ++++-- test_runner/regress/test_remote_storage.py | 48 ++++++++++++++++++++++ 5 files changed, 112 insertions(+), 13 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index bdbabf3f75..b4e20cb8b9 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -3,7 +3,7 @@ use metrics::{ register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec, register_int_counter, register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge, register_uint_gauge_vec, - Counter, CounterVec, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair, + Counter, CounterVec, Gauge, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair, IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec, }; use once_cell::sync::Lazy; @@ -445,6 +445,15 @@ pub(crate) static WAIT_LSN_TIME: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +static FLUSH_WAIT_UPLOAD_TIME: Lazy = Lazy::new(|| { + register_gauge_vec!( + "pageserver_flush_wait_upload_seconds", + "Time spent waiting for preceding uploads during layer flush", + &["tenant_id", "shard_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + static LAST_RECORD_LSN: Lazy = Lazy::new(|| { register_int_gauge_vec!( "pageserver_last_record_lsn", @@ -2577,6 +2586,7 @@ pub(crate) struct TimelineMetrics { shard_id: String, timeline_id: String, pub flush_time_histo: StorageTimeMetrics, + pub flush_wait_upload_time_gauge: Gauge, pub compact_time_histo: StorageTimeMetrics, pub create_images_time_histo: StorageTimeMetrics, pub logical_size_histo: StorageTimeMetrics, @@ -2622,6 +2632,9 @@ impl TimelineMetrics { &shard_id, &timeline_id, ); + let flush_wait_upload_time_gauge = FLUSH_WAIT_UPLOAD_TIME + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) + .unwrap(); let compact_time_histo = StorageTimeMetrics::new( StorageTimeOperation::Compact, &tenant_id, @@ -2767,6 +2780,7 @@ impl TimelineMetrics { shard_id, timeline_id, flush_time_histo, + flush_wait_upload_time_gauge, compact_time_histo, create_images_time_histo, logical_size_histo, @@ -2816,6 +2830,14 @@ impl TimelineMetrics { self.resident_physical_size_gauge.get() } + pub(crate) fn flush_wait_upload_time_gauge_add(&self, duration: f64) { + self.flush_wait_upload_time_gauge.add(duration); + crate::metrics::FLUSH_WAIT_UPLOAD_TIME + .get_metric_with_label_values(&[&self.tenant_id, &self.shard_id, &self.timeline_id]) + .unwrap() + .add(duration); + } + pub(crate) fn shutdown(&self) { let was_shutdown = self .shutdown @@ -2833,6 +2855,7 @@ impl TimelineMetrics { let shard_id = &self.shard_id; let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = DISK_CONSISTENT_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]); + let _ = FLUSH_WAIT_UPLOAD_TIME.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = STANDBY_HORIZON.remove_label_values(&[tenant_id, shard_id, timeline_id]); { RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get()); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index e71cb4db80..b36c2f487f 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -144,15 +144,19 @@ use 
self::layer_manager::LayerManager; use self::logical_size::LogicalSize; use self::walreceiver::{WalReceiver, WalReceiverConf}; -use super::config::TenantConf; -use super::remote_timeline_client::index::IndexPart; -use super::remote_timeline_client::RemoteTimelineClient; -use super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline}; -use super::storage_layer::{LayerFringe, LayerVisibilityHint, ReadableLayer}; -use super::upload_queue::NotInitialized; -use super::GcError; use super::{ - debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf, MaybeOffloaded, + config::TenantConf, storage_layer::LayerVisibilityHint, upload_queue::NotInitialized, + MaybeOffloaded, +}; +use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf}; +use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe}; +use super::{ + remote_timeline_client::RemoteTimelineClient, remote_timeline_client::WaitCompletionError, + storage_layer::ReadableLayer, +}; +use super::{ + secondary::heatmap::{HeatMapLayer, HeatMapTimeline}, + GcError, }; #[cfg(test)] @@ -3836,6 +3840,24 @@ impl Timeline { // release lock on 'layers' }; + // Backpressure mechanism: wait with continuation of the flush loop until we have uploaded all layer files. + // This makes us refuse ingest until the new layers have been persisted to the remote + let start = Instant::now(); + self.remote_client + .wait_completion() + .await + .map_err(|e| match e { + WaitCompletionError::UploadQueueShutDownOrStopped + | WaitCompletionError::NotInitialized( + NotInitialized::ShuttingDown | NotInitialized::Stopped, + ) => FlushLayerError::Cancelled, + WaitCompletionError::NotInitialized(NotInitialized::Uninitialized) => { + FlushLayerError::Other(anyhow!(e).into()) + } + })?; + let duration = start.elapsed().as_secs_f64(); + self.metrics.flush_wait_upload_time_gauge_add(duration); + // FIXME: between create_delta_layer and the scheduling of the upload in `update_metadata_file`, // a compaction can delete the file and then it won't be available for uploads any more. // We still schedule the upload, resulting in an error, but ideally we'd somehow avoid this diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index eb3d06b949..c5295360c3 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -170,6 +170,7 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] 
= ( "pageserver_evictions_with_low_residence_duration_total", "pageserver_aux_file_estimated_size", "pageserver_valid_lsn_lease_count", + "pageserver_flush_wait_upload_seconds", counter("pageserver_tenant_throttling_count_accounted_start"), counter("pageserver_tenant_throttling_count_accounted_finish"), counter("pageserver_tenant_throttling_wait_usecs_sum"), diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py index a4056404f0..34e4e994cb 100644 --- a/test_runner/regress/test_branching.py +++ b/test_runner/regress/test_branching.py @@ -19,7 +19,6 @@ from fixtures.pageserver.utils import wait_until_tenant_active from fixtures.utils import query_scalar from performance.test_perf_pgbench import get_scales_matrix from requests import RequestException -from requests.exceptions import RetryError # Test branch creation @@ -177,8 +176,11 @@ def test_cannot_create_endpoint_on_non_uploaded_timeline(neon_env_builder: NeonE env.neon_cli.mappings_map_branch(initial_branch, env.initial_tenant, env.initial_timeline) - with pytest.raises(RuntimeError, match="is not active, state: Loading"): - env.endpoints.create_start(initial_branch, tenant_id=env.initial_tenant) + with pytest.raises(RuntimeError, match="ERROR: Not found: Timeline"): + env.endpoints.create_start( + initial_branch, tenant_id=env.initial_tenant, basebackup_request_tries=2 + ) + ps_http.configure_failpoints(("before-upload-index-pausable", "off")) finally: env.pageserver.stop(immediate=True) @@ -219,7 +221,10 @@ def test_cannot_branch_from_non_uploaded_branch(neon_env_builder: NeonEnvBuilder branch_id = TimelineId.generate() - with pytest.raises(RetryError, match="too many 503 error responses"): + with pytest.raises( + PageserverApiException, + match="Cannot branch off the timeline that's not present in pageserver", + ): ps_http.timeline_create( env.pg_version, env.initial_tenant, diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 52b6b254aa..76a42ef4a2 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -784,6 +784,54 @@ def test_empty_branch_remote_storage_upload_on_restart(neon_env_builder: NeonEnv create_thread.join() +def test_paused_upload_stalls_checkpoint( + neon_env_builder: NeonEnvBuilder, +): + """ + This test checks that checkpoints block on uploads to remote storage. + """ + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) + + env = neon_env_builder.init_start( + initial_tenant_conf={ + # Set a small compaction threshold + "compaction_threshold": "3", + # Disable GC + "gc_period": "0s", + # disable PITR + "pitr_interval": "0s", + } + ) + + env.pageserver.allowed_errors.append( + f".*PUT.* path=/v1/tenant/{env.initial_tenant}/timeline.* request was dropped before completing" + ) + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + client = env.pageserver.http_client() + layers_at_creation = client.layer_map_info(tenant_id, timeline_id) + deltas_at_creation = len(layers_at_creation.delta_layers()) + assert ( + deltas_at_creation == 1 + ), "are you fixing #5863? make sure we end up with 2 deltas at the end of endpoint lifecycle" + + # Make new layer uploads get stuck. + # Note that timeline creation waits for the initial layers to reach remote storage. + # So at this point, the `layers_at_creation` are in remote storage. 
+ client.configure_failpoints(("before-upload-layer-pausable", "pause")) + + with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: + # Build two tables with some data inside + endpoint.safe_psql("CREATE TABLE foo AS SELECT x FROM generate_series(1, 10000) g(x)") + wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) + + with pytest.raises(ReadTimeout): + client.timeline_checkpoint(tenant_id, timeline_id, timeout=5) + client.configure_failpoints(("before-upload-layer-pausable", "off")) + + def wait_upload_queue_empty( client: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId ): From a77e87a48a628abcac77afeb1f64e5a491275f1c Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Fri, 3 Jan 2025 17:03:19 +0100 Subject: [PATCH 107/583] pageserver: assert that uploads don't modify indexed layers (#10228) ## Problem It's not legal to modify layers that are referenced by the current layer index. Assert this in the upload queue, as preparation for upload queue reordering. Touches #10096. ## Summary of changes Add a debug assertion that the upload queue does not modify layers referenced by the current index. I could be convinced that this should be a plain assertion, but will be conservative for now. --- .../src/tenant/remote_timeline_client.rs | 39 +++++++++++++++++++ .../tenant/remote_timeline_client/index.rs | 21 +++++++--- 2 files changed, 54 insertions(+), 6 deletions(-) diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index fee11bc742..b27ac3e933 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -1943,6 +1943,30 @@ impl RemoteTimelineClient { return; } + // Assert that we don't modify a layer that's referenced by the current index. + if cfg!(debug_assertions) { + let modified = match &task.op { + UploadOp::UploadLayer(layer, layer_metadata, _) => { + vec![(layer.layer_desc().layer_name(), layer_metadata)] + } + UploadOp::Delete(delete) => { + delete.layers.iter().map(|(n, m)| (n.clone(), m)).collect() + } + // These don't modify layers. + UploadOp::UploadMetadata { .. } => Vec::new(), + UploadOp::Barrier(_) => Vec::new(), + UploadOp::Shutdown => Vec::new(), + }; + if let Ok(queue) = self.upload_queue.lock().unwrap().initialized_mut() { + for (ref name, metadata) in modified { + debug_assert!( + !queue.clean.0.references(name, metadata), + "layer {name} modified while referenced by index", + ); + } + } + } + let upload_result: anyhow::Result<()> = match &task.op { UploadOp::UploadLayer(ref layer, ref layer_metadata, mode) => { if let Some(OpType::FlushDeletion) = mode { @@ -2509,6 +2533,21 @@ pub fn remote_layer_path( RemotePath::from_string(&path).expect("Failed to construct path") } +/// Returns true if a and b have the same layer path within a tenant/timeline. This is essentially +/// remote_layer_path(a) == remote_layer_path(b) without the string allocations. +/// +/// TODO: there should be a variant of LayerName for the physical path that contains information +/// about the shard and generation, such that this could be replaced by a simple comparison. +pub fn is_same_remote_layer_path( + aname: &LayerName, + ameta: &LayerFileMetadata, + bname: &LayerName, + bmeta: &LayerFileMetadata, +) -> bool { + // NB: don't assert remote_layer_path(a) == remote_layer_path(b); too expensive even for debug. 
+ aname == bname && ameta.shard == bmeta.shard && ameta.generation == bmeta.generation +} + pub fn remote_initdb_archive_path(tenant_id: &TenantId, timeline_id: &TimelineId) -> RemotePath { RemotePath::from_string(&format!( "tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{INITDB_PATH}" diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 506990fb2f..51f093cb87 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -8,14 +8,14 @@ use std::collections::HashMap; use chrono::NaiveDateTime; use pageserver_api::models::AuxFilePolicy; use serde::{Deserialize, Serialize}; -use utils::id::TimelineId; +use super::is_same_remote_layer_path; use crate::tenant::metadata::TimelineMetadata; use crate::tenant::storage_layer::LayerName; use crate::tenant::timeline::import_pgdata; use crate::tenant::Generation; use pageserver_api::shard::ShardIndex; - +use utils::id::TimelineId; use utils::lsn::Lsn; /// In-memory representation of an `index_part.json` file @@ -45,10 +45,8 @@ pub struct IndexPart { #[serde(skip_serializing_if = "Option::is_none")] pub import_pgdata: Option, - /// Per layer file name metadata, which can be present for a present or missing layer file. - /// - /// Older versions of `IndexPart` will not have this property or have only a part of metadata - /// that latest version stores. + /// Layer filenames and metadata. For an index persisted in remote storage, all layers must + /// exist in remote storage. pub layer_metadata: HashMap, /// Because of the trouble of eyeballing the legacy "metadata" field, we copied the @@ -143,6 +141,17 @@ impl IndexPart { pub(crate) fn example() -> Self { Self::empty(TimelineMetadata::example()) } + + /// Returns true if the index contains a reference to the given layer (i.e. file path). + /// + /// TODO: there should be a variant of LayerName for the physical remote path that contains + /// information about the shard and generation, to avoid passing in metadata. + pub fn references(&self, name: &LayerName, metadata: &LayerFileMetadata) -> bool { + let Some(index_metadata) = self.layer_metadata.get(name) else { + return false; + }; + is_same_remote_layer_path(name, metadata, name, index_metadata) + } } /// Metadata gathered for each of the layer files. From 4b2f56862dc5738407893df451348e78696abd65 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 3 Jan 2025 16:16:04 +0000 Subject: [PATCH 108/583] docker: include vanilla debian postgres client (#10269) ## Problem We are chasing down segfaults in the storage controller https://github.com/neondatabase/cloud/issues/21010 This is for use by the storage controller, which links dynamically with `libpq`. We currently use the neon-built libpq, but this may be unsafe for use from multi-threaded programs like the controller, as it uses a statically linked openssl Precursor to https://github.com/neondatabase/neon/pull/10258 ## Summary of changes - Include `postgresql-15` in container builds. The reason for using version 15 is simply because that is what's available in Debian 12 without adding any extra repositories, and we don't have any special need for latest version in our libpq usage. 
--- Dockerfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Dockerfile b/Dockerfile index e888efbae2..df9bcb3002 100644 --- a/Dockerfile +++ b/Dockerfile @@ -69,6 +69,8 @@ RUN set -e \ libreadline-dev \ libseccomp-dev \ ca-certificates \ + # System postgres for use with client libraries (e.g. in storage controller) + postgresql-15 \ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \ && useradd -d /data neon \ && chown -R neon:neon /data From b368e62cfc374bd48ca656b476c5c081c4018546 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 4 Jan 2025 15:40:50 +0000 Subject: [PATCH 109/583] build(deps): bump jinja2 from 3.1.4 to 3.1.5 in the pip group (#10236) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- poetry.lock | 20 +++++++++++++++----- pyproject.toml | 2 +- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/poetry.lock b/poetry.lock index 59ae5cf1ca..072bf9a5e9 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.5 and should not be changed by hand. [[package]] name = "aiohappyeyeballs" @@ -1322,13 +1322,13 @@ files = [ [[package]] name = "jinja2" -version = "3.1.4" +version = "3.1.5" description = "A very fast and expressive template engine." optional = false python-versions = ">=3.7" files = [ - {file = "jinja2-3.1.4-py3-none-any.whl", hash = "sha256:bc5dd2abb727a5319567b7a813e6a2e7318c39f4f487cfe6c89c6f9c7d25197d"}, - {file = "jinja2-3.1.4.tar.gz", hash = "sha256:4a3aee7acbbe7303aede8e9648d13b8bf88a429282aa6122a993f0ac800cb369"}, + {file = "jinja2-3.1.5-py3-none-any.whl", hash = "sha256:aba0f4dc9ed8013c424088f68a5c226f7d6097ed89b246d7749c2ec4175c6adb"}, + {file = "jinja2-3.1.5.tar.gz", hash = "sha256:8fefff8dc3034e27bb80d67c671eb8a9bc424c0ef4c0826edbff304cceff43bb"}, ] [package.dependencies] @@ -3309,6 +3309,16 @@ files = [ {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"}, {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"}, {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"}, + {file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"}, + {file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"}, + {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"}, + {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"}, + {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"}, + {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"}, + {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = 
"sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"}, + {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"}, + {file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"}, + {file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"}, @@ -3524,4 +3534,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "426c385df93f578ba3537c40a269535e27fbcca1978b3cf266096ecbc298c6a9" +content-hash = "9032c11f264f2f6d8a50230e5021c606d460aafdf370da0524784c3f0f1f31b1" diff --git a/pyproject.toml b/pyproject.toml index 01d15ee6bb..ba4ab0b1f7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,7 +13,7 @@ requests = "^2.32.3" pytest-xdist = "^3.3.1" asyncpg = "^0.29.0" aiopg = "^1.4.0" -Jinja2 = "^3.1.4" +Jinja2 = "^3.1.5" types-requests = "^2.31.0.0" types-psycopg2 = "^2.9.21.20241019" boto3 = "^1.34.11" From 406cca643b9529979522b33abf3a0457681fc987 Mon Sep 17 00:00:00 2001 From: Busra Kugler Date: Mon, 6 Jan 2025 11:44:23 +0100 Subject: [PATCH 110/583] Update neon_fixtures.py - remove logs (#10219) We need to remove this line to prevent aws keys exposing in the public s3 buckets --- test_runner/fixtures/neon_fixtures.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index a0c642163d..8fd9eec8ce 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -4606,7 +4606,8 @@ class StorageScrubber: ] args = base_args + args - log.info(f"Invoking scrubber command {args} with env: {env}") + log.info(f"Invoking scrubber command {args}") + (output_path, stdout, status_code) = subprocess_capture( self.log_dir, args, From fda52a0005ea4ca8e4e1d6a16de75724a7e619fe Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 6 Jan 2025 13:05:35 +0000 Subject: [PATCH 111/583] feat(proxy): dont trigger error alerts for unknown topics (#10266) ## Problem Before the holidays, and just before our code freeze, a change to cplane was made that started publishing the topics from #10197. This triggered our alerts and put us in a sticky situation as it was not an error, and we didn't want to silence the alert for the entire holidays, and we didn't want to release proxy 2 days in a row if it was not essential. We fixed it eventually by rewriting the alert based on logs, but this is not a good solution. ## Summary of changes Introduces an intermediate parsing step to check the topic name first, to allow us to ignore parsing errors for any topics we do not know about. 
--- proxy/src/redis/notifications.rs | 48 +++++++++++++++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index 671305a300..4383d6be2c 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -30,8 +30,14 @@ async fn try_connect(client: &ConnectionWithCredentialsProvider) -> anyhow::Resu Ok(conn) } +#[derive(Debug, Deserialize)] +struct NotificationHeader<'a> { + topic: &'a str, +} + #[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] #[serde(tag = "topic", content = "data")] +// Message to contributors: Make sure to align these topic names with the list below. pub(crate) enum Notification { #[serde( rename = "/allowed_ips_updated", @@ -69,6 +75,22 @@ pub(crate) enum Notification { #[serde(rename = "/cancel_session")] Cancel(CancelSession), } + +/// Returns true if the topic name given is a known topic that we can deserialize and action on. +/// Returns false otherwise. +fn known_topic(s: &str) -> bool { + // Message to contributors: Make sure to align these topic names with the enum above. + matches!( + s, + "/allowed_ips_updated" + | "/block_public_or_vpc_access_updated" + | "/allowed_vpc_endpoints_updated_for_org" + | "/allowed_vpc_endpoints_updated_for_projects" + | "/password_updated" + | "/cancel_session" + ) +} + #[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] pub(crate) struct AllowedIpsUpdate { project_id: ProjectIdInt, @@ -96,6 +118,7 @@ pub(crate) struct PasswordUpdate { project_id: ProjectIdInt, role_name: RoleNameInt, } + #[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] pub(crate) struct CancelSession { pub(crate) region_id: Option, @@ -141,18 +164,23 @@ impl MessageHandler { region_id, } } + pub(crate) async fn increment_active_listeners(&self) { self.cache.increment_active_listeners().await; } + pub(crate) async fn decrement_active_listeners(&self) { self.cache.decrement_active_listeners().await; } + #[tracing::instrument(skip(self, msg), fields(session_id = tracing::field::Empty))] async fn handle_message(&self, msg: redis::Msg) -> anyhow::Result<()> { let payload: String = msg.get_payload()?; tracing::debug!(?payload, "received a message payload"); - let msg: Notification = match serde_json::from_str(&payload) { + // For better error handling, we first parse the payload to extract the topic. + // If there's a topic we don't support, we can handle that error more gracefully. + let header: NotificationHeader = match serde_json::from_str(&payload) { Ok(msg) => msg, Err(e) => { Metrics::get().proxy.redis_errors_total.inc(RedisErrors { @@ -162,6 +190,24 @@ impl MessageHandler { return Ok(()); } }; + + if !known_topic(header.topic) { + // don't update the metric for redis errors if it's just a topic we don't know about. 
+ tracing::warn!(topic = header.topic, "unknown topic"); + return Ok(()); + } + + let msg: Notification = match serde_json::from_str(&payload) { + Ok(msg) => msg, + Err(e) => { + Metrics::get().proxy.redis_errors_total.inc(RedisErrors { + channel: msg.get_channel_name(), + }); + tracing::error!(topic = header.topic, "broken message: {e}"); + return Ok(()); + } + }; + tracing::debug!(?msg, "received a message"); match msg { Notification::Cancel(cancel_session) => { From 95f1920231465e2b898b71a9959acec9ddd63896 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 6 Jan 2025 18:27:08 +0100 Subject: [PATCH 112/583] cargo: build with frame pointers (#10226) ## Problem Frame pointers are typically disabled by default (depending on CPU architecture), to improve performance. This frees up a CPU register, and avoids a couple of instructions per function call. However, it makes stack unwinding much more inefficient, since it has to use DWARF debug information instead, and gives worse results with e.g. `perf` and eBPF profiles. The `backtrace` implementation of `libunwind` is also suspected to cause seg faults. The performance benefit of frame pointer omission doesn't appear to matter that much on modern 64-bit CPU architectures (which have plenty of registers and optimized instruction execution), and benchmarks did not show measurable overhead. The Rust standard library and jemalloc already enable frame pointers by default. For more information, see https://www.brendangregg.com/blog/2024-03-17/the-return-of-the-frame-pointers.html. Resolves #10224. Touches #10225. ## Summary of changes Enable frame pointers in all builds, and use frame pointers for pprof-rs stack sampling. --- .cargo/config.toml | 8 ++++++++ Cargo.toml | 4 +++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/.cargo/config.toml b/.cargo/config.toml index 5e452974ad..20a2a929b9 100644 --- a/.cargo/config.toml +++ b/.cargo/config.toml @@ -3,6 +3,14 @@ # by the RUSTDOCFLAGS env var in CI. rustdocflags = ["-Arustdoc::private_intra_doc_links"] +# Enable frame pointers. This may have a minor performance overhead, but makes it easier and more +# efficient to obtain stack traces (and thus CPU/heap profiles). With continuous profiling, this is +# likely a net win, and allows higher profiling resolution. See also: +# +# * +# * +rustflags = ["-Cforce-frame-pointers=yes"] + [alias] build_testing = ["build", "--features", "testing"] neon = ["run", "--bin", "neon_local"] diff --git a/Cargo.toml b/Cargo.toml index 885f02ba81..197808d5ae 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -135,7 +135,7 @@ parquet = { version = "53", default-features = false, features = ["zstd"] } parquet_derive = "53" pbkdf2 = { version = "0.12.1", features = ["simple", "std"] } pin-project-lite = "0.2" -pprof = { version = "0.14", features = ["criterion", "flamegraph", "protobuf", "protobuf-codec"] } +pprof = { version = "0.14", features = ["criterion", "flamegraph", "frame-pointer", "protobuf", "protobuf-codec"] } procfs = "0.16" prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency prost = "0.13" @@ -266,6 +266,8 @@ tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", br [profile.release] # This is useful for profiling and, to some extent, debug. # Besides, debug info should not affect the performance. +# +# NB: we also enable frame pointers for improved profiling, see .cargo/config.toml. 
debug = true # disable debug symbols for all packages except this one to decrease binaries size From 4a6556e269018844a8c3413bd7414331cd968fce Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Mon, 6 Jan 2025 14:29:18 -0500 Subject: [PATCH 113/583] fix(pageserver): ensure GC computes time cutoff using the same start time (#10193) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem close https://github.com/neondatabase/neon/issues/10192 ## Summary of changes * `find_gc_time_cutoff` takes `now` parameter so that all branches compute the cutoff based on the same start time, avoiding races. * gc-compaction uses a single `get_gc_compaction_watermark` function to get the safe LSN to compact. --------- Signed-off-by: Alex Chi Z Co-authored-by: Arpad Müller --- pageserver/src/tenant.rs | 6 +++++- pageserver/src/tenant/timeline.rs | 5 +++-- pageserver/src/tenant/timeline/compaction.rs | 22 ++++++++++++++++++-- 3 files changed, 28 insertions(+), 5 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 90017b25f2..e3dab2fc1d 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -4488,13 +4488,17 @@ impl Tenant { let mut gc_cutoffs: HashMap = HashMap::with_capacity(timelines.len()); + // Ensures all timelines use the same start time when computing the time cutoff. + let now_ts_for_pitr_calc = SystemTime::now(); for timeline in timelines.iter() { let cutoff = timeline .get_last_record_lsn() .checked_sub(horizon) .unwrap_or(Lsn(0)); - let cutoffs = timeline.find_gc_cutoffs(cutoff, pitr, cancel, ctx).await?; + let cutoffs = timeline + .find_gc_cutoffs(now_ts_for_pitr_calc, cutoff, pitr, cancel, ctx) + .await?; let old = gc_cutoffs.insert(timeline.timeline_id, cutoffs); assert!(old.is_none()); } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index b36c2f487f..c1b71262e0 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4859,6 +4859,7 @@ impl Timeline { async fn find_gc_time_cutoff( &self, + now: SystemTime, pitr: Duration, cancel: &CancellationToken, ctx: &RequestContext, @@ -4866,7 +4867,6 @@ impl Timeline { debug_assert_current_span_has_tenant_and_timeline_id(); if self.shard_identity.is_shard_zero() { // Shard Zero has SLRU data and can calculate the PITR time -> LSN mapping itself - let now = SystemTime::now(); let time_range = if pitr == Duration::ZERO { humantime::parse_duration(DEFAULT_PITR_INTERVAL).expect("constant is invalid") } else { @@ -4952,6 +4952,7 @@ impl Timeline { #[instrument(skip_all, fields(timeline_id=%self.timeline_id))] pub(super) async fn find_gc_cutoffs( &self, + now: SystemTime, space_cutoff: Lsn, pitr: Duration, cancel: &CancellationToken, @@ -4979,7 +4980,7 @@ impl Timeline { // - if PITR interval is set, then this is our cutoff. // - if PITR interval is not set, then we do a lookup // based on DEFAULT_PITR_INTERVAL, so that size-based retention does not result in keeping history around permanently on idle databases. 
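The essence of the hunk that follows is that the caller now captures the reference time once and threads it through, rather than each timeline taking its own `SystemTime::now()`. A standalone sketch of that pattern (the `pitr_cutoff` helper and timeline names are made up, not the pageserver's API):

```rust
use std::time::{Duration, SystemTime, UNIX_EPOCH};

// Made-up stand-in for a per-timeline PITR cutoff computation.
fn pitr_cutoff(now: SystemTime, pitr: Duration) -> SystemTime {
    now.checked_sub(pitr).unwrap_or(UNIX_EPOCH)
}

fn main() {
    let pitr = Duration::from_secs(7 * 24 * 60 * 60);
    // Capture the reference time once, before the loop, so every timeline in the
    // tenant agrees on the same cutoff even if iterating takes measurable time.
    let now = SystemTime::now();
    for timeline in ["tl-a", "tl-b", "tl-c"] {
        println!("{timeline}: pitr cutoff = {:?}", pitr_cutoff(now, pitr));
    }
}
```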
- let time_cutoff = self.find_gc_time_cutoff(pitr, cancel, ctx).await?; + let time_cutoff = self.find_gc_time_cutoff(now, pitr, cancel, ctx).await?; Ok(match (pitr, time_cutoff) { (Duration::ZERO, Some(time_cutoff)) => { diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 94c65631b2..55cde8603e 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -1799,6 +1799,24 @@ impl Timeline { Ok(()) } + /// Get a watermark for gc-compaction, that is the lowest LSN that we can use as the `gc_horizon` for + /// the compaction algorithm. It is min(space_cutoff, time_cutoff, latest_gc_cutoff, standby_horizon). + /// Leases and retain_lsns are considered in the gc-compaction job itself so we don't need to account for them + /// here. + pub(crate) fn get_gc_compaction_watermark(self: &Arc) -> Lsn { + let gc_cutoff_lsn = { + let gc_info = self.gc_info.read().unwrap(); + gc_info.min_cutoff() + }; + + // TODO: standby horizon should use leases so we don't really need to consider it here. + // let watermark = watermark.min(self.standby_horizon.load()); + + // TODO: ensure the child branches will not use anything below the watermark, or consider + // them when computing the watermark. + gc_cutoff_lsn.min(*self.get_latest_gc_cutoff_lsn()) + } + /// Split a gc-compaction job into multiple compaction jobs. The split is based on the key range and the estimated size of the compaction job. /// The function returns a list of compaction jobs that can be executed separately. If the upper bound of the compact LSN /// range is not specified, we will use the latest gc_cutoff as the upper bound, so that all jobs in the jobset acts @@ -1811,7 +1829,7 @@ impl Timeline { let compact_below_lsn = if job.compact_lsn_range.end != Lsn::MAX { job.compact_lsn_range.end } else { - *self.get_latest_gc_cutoff_lsn() // use the real gc cutoff + self.get_gc_compaction_watermark() }; // Split compaction job to about 4GB each @@ -2006,7 +2024,7 @@ impl Timeline { // Therefore, it can only clean up data that cannot be cleaned up with legacy gc, instead of // cleaning everything that theoritically it could. In the future, it should use `self.gc_info` // to get the truth data. - let real_gc_cutoff = *self.get_latest_gc_cutoff_lsn(); + let real_gc_cutoff = self.get_gc_compaction_watermark(); // The compaction algorithm will keep all keys above the gc_cutoff while keeping only necessary keys below the gc_cutoff for // each of the retain_lsn. Therefore, if the user-provided `compact_lsn_range.end` is larger than the real gc cutoff, we will use // the real cutoff. From b342a02b1c591642e2d52be606ccc42857af112d Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 6 Jan 2025 21:17:43 +0100 Subject: [PATCH 114/583] Dockerfile: build with `force-frame-pointers=yes` (#10286) See https://github.com/neondatabase/neon/pull/10226#issuecomment-2573725182. --- .cargo/config.toml | 6 ++++-- Dockerfile | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/.cargo/config.toml b/.cargo/config.toml index 20a2a929b9..c71d491303 100644 --- a/.cargo/config.toml +++ b/.cargo/config.toml @@ -4,11 +4,13 @@ rustdocflags = ["-Arustdoc::private_intra_doc_links"] # Enable frame pointers. This may have a minor performance overhead, but makes it easier and more -# efficient to obtain stack traces (and thus CPU/heap profiles). With continuous profiling, this is -# likely a net win, and allows higher profiling resolution. 
See also: +# efficient to obtain stack traces (and thus CPU/heap profiles). It may also avoid seg faults that +# we've seen with libunwind-based profiling. See also: # # * # * +# +# NB: the RUSTFLAGS envvar will replace this. Make sure to update e.g. Dockerfile as well. rustflags = ["-Cforce-frame-pointers=yes"] [alias] diff --git a/Dockerfile b/Dockerfile index df9bcb3002..2c157b3b2a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -45,7 +45,7 @@ COPY --chown=nonroot . . ARG ADDITIONAL_RUSTFLAGS RUN set -e \ - && PQ_LIB_DIR=$(pwd)/pg_install/v${STABLE_PG_VERSION}/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment ${ADDITIONAL_RUSTFLAGS}" cargo build \ + && PQ_LIB_DIR=$(pwd)/pg_install/v${STABLE_PG_VERSION}/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo build \ --bin pg_sni_router \ --bin pageserver \ --bin pagectl \ From ad7f14d526a4e7e5195ca5d8651672aae0c96d93 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 6 Jan 2025 20:25:31 +0000 Subject: [PATCH 115/583] test_runner: update packages for Python 3.13 (#10285) ## Problem It's impossible to run regression tests with Python 3.13 as some dependencies don't support it (some of them are outdated, and `jsonnet` doesn't support it at all yet) ## Summary of changes - Update dependencies for Python 3.13 - Install `jsonnet` only on Python < 3.13 and skip relevant tests on Python 3.13 Closes #10237 --- poetry.lock | 771 +++++++++++--------- pyproject.toml | 11 +- test_runner/regress/test_compute_metrics.py | 9 +- 3 files changed, 459 insertions(+), 332 deletions(-) diff --git a/poetry.lock b/poetry.lock index 072bf9a5e9..5f15223dca 100644 --- a/poetry.lock +++ b/poetry.lock @@ -239,60 +239,66 @@ files = [ [[package]] name = "asyncpg" -version = "0.29.0" +version = "0.30.0" description = "An asyncio PostgreSQL driver" optional = false python-versions = ">=3.8.0" files = [ - {file = "asyncpg-0.29.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:72fd0ef9f00aeed37179c62282a3d14262dbbafb74ec0ba16e1b1864d8a12169"}, - {file = "asyncpg-0.29.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:52e8f8f9ff6e21f9b39ca9f8e3e33a5fcdceaf5667a8c5c32bee158e313be385"}, - {file = "asyncpg-0.29.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a9e6823a7012be8b68301342ba33b4740e5a166f6bbda0aee32bc01638491a22"}, - {file = "asyncpg-0.29.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:746e80d83ad5d5464cfbf94315eb6744222ab00aa4e522b704322fb182b83610"}, - {file = "asyncpg-0.29.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:ff8e8109cd6a46ff852a5e6bab8b0a047d7ea42fcb7ca5ae6eaae97d8eacf397"}, - {file = "asyncpg-0.29.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:97eb024685b1d7e72b1972863de527c11ff87960837919dac6e34754768098eb"}, - {file = "asyncpg-0.29.0-cp310-cp310-win32.whl", hash = "sha256:5bbb7f2cafd8d1fa3e65431833de2642f4b2124be61a449fa064e1a08d27e449"}, - {file = "asyncpg-0.29.0-cp310-cp310-win_amd64.whl", hash = "sha256:76c3ac6530904838a4b650b2880f8e7af938ee049e769ec2fba7cd66469d7772"}, - {file = "asyncpg-0.29.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d4900ee08e85af01adb207519bb4e14b1cae8fd21e0ccf80fac6aa60b6da37b4"}, - {file = "asyncpg-0.29.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a65c1dcd820d5aea7c7d82a3fdcb70e096f8f70d1a8bf93eb458e49bfad036ac"}, - {file = 
"asyncpg-0.29.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5b52e46f165585fd6af4863f268566668407c76b2c72d366bb8b522fa66f1870"}, - {file = "asyncpg-0.29.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dc600ee8ef3dd38b8d67421359779f8ccec30b463e7aec7ed481c8346decf99f"}, - {file = "asyncpg-0.29.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:039a261af4f38f949095e1e780bae84a25ffe3e370175193174eb08d3cecab23"}, - {file = "asyncpg-0.29.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:6feaf2d8f9138d190e5ec4390c1715c3e87b37715cd69b2c3dfca616134efd2b"}, - {file = "asyncpg-0.29.0-cp311-cp311-win32.whl", hash = "sha256:1e186427c88225ef730555f5fdda6c1812daa884064bfe6bc462fd3a71c4b675"}, - {file = "asyncpg-0.29.0-cp311-cp311-win_amd64.whl", hash = "sha256:cfe73ffae35f518cfd6e4e5f5abb2618ceb5ef02a2365ce64f132601000587d3"}, - {file = "asyncpg-0.29.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:6011b0dc29886ab424dc042bf9eeb507670a3b40aece3439944006aafe023178"}, - {file = "asyncpg-0.29.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b544ffc66b039d5ec5a7454667f855f7fec08e0dfaf5a5490dfafbb7abbd2cfb"}, - {file = "asyncpg-0.29.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d84156d5fb530b06c493f9e7635aa18f518fa1d1395ef240d211cb563c4e2364"}, - {file = "asyncpg-0.29.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:54858bc25b49d1114178d65a88e48ad50cb2b6f3e475caa0f0c092d5f527c106"}, - {file = "asyncpg-0.29.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:bde17a1861cf10d5afce80a36fca736a86769ab3579532c03e45f83ba8a09c59"}, - {file = "asyncpg-0.29.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:37a2ec1b9ff88d8773d3eb6d3784dc7e3fee7756a5317b67f923172a4748a175"}, - {file = "asyncpg-0.29.0-cp312-cp312-win32.whl", hash = "sha256:bb1292d9fad43112a85e98ecdc2e051602bce97c199920586be83254d9dafc02"}, - {file = "asyncpg-0.29.0-cp312-cp312-win_amd64.whl", hash = "sha256:2245be8ec5047a605e0b454c894e54bf2ec787ac04b1cb7e0d3c67aa1e32f0fe"}, - {file = "asyncpg-0.29.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0009a300cae37b8c525e5b449233d59cd9868fd35431abc470a3e364d2b85cb9"}, - {file = "asyncpg-0.29.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:5cad1324dbb33f3ca0cd2074d5114354ed3be2b94d48ddfd88af75ebda7c43cc"}, - {file = "asyncpg-0.29.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:012d01df61e009015944ac7543d6ee30c2dc1eb2f6b10b62a3f598beb6531548"}, - {file = "asyncpg-0.29.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:000c996c53c04770798053e1730d34e30cb645ad95a63265aec82da9093d88e7"}, - {file = "asyncpg-0.29.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:e0bfe9c4d3429706cf70d3249089de14d6a01192d617e9093a8e941fea8ee775"}, - {file = "asyncpg-0.29.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:642a36eb41b6313ffa328e8a5c5c2b5bea6ee138546c9c3cf1bffaad8ee36dd9"}, - {file = "asyncpg-0.29.0-cp38-cp38-win32.whl", hash = "sha256:a921372bbd0aa3a5822dd0409da61b4cd50df89ae85150149f8c119f23e8c408"}, - {file = "asyncpg-0.29.0-cp38-cp38-win_amd64.whl", hash = "sha256:103aad2b92d1506700cbf51cd8bb5441e7e72e87a7b3a2ca4e32c840f051a6a3"}, - {file = "asyncpg-0.29.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5340dd515d7e52f4c11ada32171d87c05570479dc01dc66d03ee3e150fb695da"}, - {file = "asyncpg-0.29.0-cp39-cp39-macosx_11_0_arm64.whl", hash = 
"sha256:e17b52c6cf83e170d3d865571ba574577ab8e533e7361a2b8ce6157d02c665d3"}, - {file = "asyncpg-0.29.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f100d23f273555f4b19b74a96840aa27b85e99ba4b1f18d4ebff0734e78dc090"}, - {file = "asyncpg-0.29.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:48e7c58b516057126b363cec8ca02b804644fd012ef8e6c7e23386b7d5e6ce83"}, - {file = "asyncpg-0.29.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:f9ea3f24eb4c49a615573724d88a48bd1b7821c890c2effe04f05382ed9e8810"}, - {file = "asyncpg-0.29.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8d36c7f14a22ec9e928f15f92a48207546ffe68bc412f3be718eedccdf10dc5c"}, - {file = "asyncpg-0.29.0-cp39-cp39-win32.whl", hash = "sha256:797ab8123ebaed304a1fad4d7576d5376c3a006a4100380fb9d517f0b59c1ab2"}, - {file = "asyncpg-0.29.0-cp39-cp39-win_amd64.whl", hash = "sha256:cce08a178858b426ae1aa8409b5cc171def45d4293626e7aa6510696d46decd8"}, - {file = "asyncpg-0.29.0.tar.gz", hash = "sha256:d1c49e1f44fffafd9a55e1a9b101590859d881d639ea2922516f5d9c512d354e"}, + {file = "asyncpg-0.30.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:bfb4dd5ae0699bad2b233672c8fc5ccbd9ad24b89afded02341786887e37927e"}, + {file = "asyncpg-0.30.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:dc1f62c792752a49f88b7e6f774c26077091b44caceb1983509edc18a2222ec0"}, + {file = "asyncpg-0.30.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3152fef2e265c9c24eec4ee3d22b4f4d2703d30614b0b6753e9ed4115c8a146f"}, + {file = "asyncpg-0.30.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c7255812ac85099a0e1ffb81b10dc477b9973345793776b128a23e60148dd1af"}, + {file = "asyncpg-0.30.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:578445f09f45d1ad7abddbff2a3c7f7c291738fdae0abffbeb737d3fc3ab8b75"}, + {file = "asyncpg-0.30.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:c42f6bb65a277ce4d93f3fba46b91a265631c8df7250592dd4f11f8b0152150f"}, + {file = "asyncpg-0.30.0-cp310-cp310-win32.whl", hash = "sha256:aa403147d3e07a267ada2ae34dfc9324e67ccc4cdca35261c8c22792ba2b10cf"}, + {file = "asyncpg-0.30.0-cp310-cp310-win_amd64.whl", hash = "sha256:fb622c94db4e13137c4c7f98834185049cc50ee01d8f657ef898b6407c7b9c50"}, + {file = "asyncpg-0.30.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5e0511ad3dec5f6b4f7a9e063591d407eee66b88c14e2ea636f187da1dcfff6a"}, + {file = "asyncpg-0.30.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:915aeb9f79316b43c3207363af12d0e6fd10776641a7de8a01212afd95bdf0ed"}, + {file = "asyncpg-0.30.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1c198a00cce9506fcd0bf219a799f38ac7a237745e1d27f0e1f66d3707c84a5a"}, + {file = "asyncpg-0.30.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3326e6d7381799e9735ca2ec9fd7be4d5fef5dcbc3cb555d8a463d8460607956"}, + {file = "asyncpg-0.30.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:51da377487e249e35bd0859661f6ee2b81db11ad1f4fc036194bc9cb2ead5056"}, + {file = "asyncpg-0.30.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:bc6d84136f9c4d24d358f3b02be4b6ba358abd09f80737d1ac7c444f36108454"}, + {file = "asyncpg-0.30.0-cp311-cp311-win32.whl", hash = "sha256:574156480df14f64c2d76450a3f3aaaf26105869cad3865041156b38459e935d"}, + {file = "asyncpg-0.30.0-cp311-cp311-win_amd64.whl", hash = "sha256:3356637f0bd830407b5597317b3cb3571387ae52ddc3bca6233682be88bbbc1f"}, + {file = "asyncpg-0.30.0-cp312-cp312-macosx_10_13_x86_64.whl", 
hash = "sha256:c902a60b52e506d38d7e80e0dd5399f657220f24635fee368117b8b5fce1142e"}, + {file = "asyncpg-0.30.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:aca1548e43bbb9f0f627a04666fedaca23db0a31a84136ad1f868cb15deb6e3a"}, + {file = "asyncpg-0.30.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6c2a2ef565400234a633da0eafdce27e843836256d40705d83ab7ec42074efb3"}, + {file = "asyncpg-0.30.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1292b84ee06ac8a2ad8e51c7475aa309245874b61333d97411aab835c4a2f737"}, + {file = "asyncpg-0.30.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:0f5712350388d0cd0615caec629ad53c81e506b1abaaf8d14c93f54b35e3595a"}, + {file = "asyncpg-0.30.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:db9891e2d76e6f425746c5d2da01921e9a16b5a71a1c905b13f30e12a257c4af"}, + {file = "asyncpg-0.30.0-cp312-cp312-win32.whl", hash = "sha256:68d71a1be3d83d0570049cd1654a9bdfe506e794ecc98ad0873304a9f35e411e"}, + {file = "asyncpg-0.30.0-cp312-cp312-win_amd64.whl", hash = "sha256:9a0292c6af5c500523949155ec17b7fe01a00ace33b68a476d6b5059f9630305"}, + {file = "asyncpg-0.30.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:05b185ebb8083c8568ea8a40e896d5f7af4b8554b64d7719c0eaa1eb5a5c3a70"}, + {file = "asyncpg-0.30.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:c47806b1a8cbb0a0db896f4cd34d89942effe353a5035c62734ab13b9f938da3"}, + {file = "asyncpg-0.30.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9b6fde867a74e8c76c71e2f64f80c64c0f3163e687f1763cfaf21633ec24ec33"}, + {file = "asyncpg-0.30.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:46973045b567972128a27d40001124fbc821c87a6cade040cfcd4fa8a30bcdc4"}, + {file = "asyncpg-0.30.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:9110df111cabc2ed81aad2f35394a00cadf4f2e0635603db6ebbd0fc896f46a4"}, + {file = "asyncpg-0.30.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:04ff0785ae7eed6cc138e73fc67b8e51d54ee7a3ce9b63666ce55a0bf095f7ba"}, + {file = "asyncpg-0.30.0-cp313-cp313-win32.whl", hash = "sha256:ae374585f51c2b444510cdf3595b97ece4f233fde739aa14b50e0d64e8a7a590"}, + {file = "asyncpg-0.30.0-cp313-cp313-win_amd64.whl", hash = "sha256:f59b430b8e27557c3fb9869222559f7417ced18688375825f8f12302c34e915e"}, + {file = "asyncpg-0.30.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:29ff1fc8b5bf724273782ff8b4f57b0f8220a1b2324184846b39d1ab4122031d"}, + {file = "asyncpg-0.30.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:64e899bce0600871b55368b8483e5e3e7f1860c9482e7f12e0a771e747988168"}, + {file = "asyncpg-0.30.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5b290f4726a887f75dcd1b3006f484252db37602313f806e9ffc4e5996cfe5cb"}, + {file = "asyncpg-0.30.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f86b0e2cd3f1249d6fe6fd6cfe0cd4538ba994e2d8249c0491925629b9104d0f"}, + {file = "asyncpg-0.30.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:393af4e3214c8fa4c7b86da6364384c0d1b3298d45803375572f415b6f673f38"}, + {file = "asyncpg-0.30.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:fd4406d09208d5b4a14db9a9dbb311b6d7aeeab57bded7ed2f8ea41aeef39b34"}, + {file = "asyncpg-0.30.0-cp38-cp38-win32.whl", hash = "sha256:0b448f0150e1c3b96cb0438a0d0aa4871f1472e58de14a3ec320dbb2798fb0d4"}, + {file = "asyncpg-0.30.0-cp38-cp38-win_amd64.whl", hash = "sha256:f23b836dd90bea21104f69547923a02b167d999ce053f3d502081acea2fba15b"}, + {file = 
"asyncpg-0.30.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6f4e83f067b35ab5e6371f8a4c93296e0439857b4569850b178a01385e82e9ad"}, + {file = "asyncpg-0.30.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:5df69d55add4efcd25ea2a3b02025b669a285b767bfbf06e356d68dbce4234ff"}, + {file = "asyncpg-0.30.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a3479a0d9a852c7c84e822c073622baca862d1217b10a02dd57ee4a7a081f708"}, + {file = "asyncpg-0.30.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:26683d3b9a62836fad771a18ecf4659a30f348a561279d6227dab96182f46144"}, + {file = "asyncpg-0.30.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:1b982daf2441a0ed314bd10817f1606f1c28b1136abd9e4f11335358c2c631cb"}, + {file = "asyncpg-0.30.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:1c06a3a50d014b303e5f6fc1e5f95eb28d2cee89cf58384b700da621e5d5e547"}, + {file = "asyncpg-0.30.0-cp39-cp39-win32.whl", hash = "sha256:1b11a555a198b08f5c4baa8f8231c74a366d190755aa4f99aacec5970afe929a"}, + {file = "asyncpg-0.30.0-cp39-cp39-win_amd64.whl", hash = "sha256:8b684a3c858a83cd876f05958823b68e8d14ec01bb0c0d14a6704c5bf9711773"}, + {file = "asyncpg-0.30.0.tar.gz", hash = "sha256:c551e9928ab6707602f44811817f82ba3c446e018bfe1d3abecc8ba5f3eac851"}, ] -[package.dependencies] -async-timeout = {version = ">=4.0.3", markers = "python_version < \"3.12.0\""} - [package.extras] -docs = ["Sphinx (>=5.3.0,<5.4.0)", "sphinx-rtd-theme (>=1.2.2)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)"] -test = ["flake8 (>=6.1,<7.0)", "uvloop (>=0.15.3)"] +docs = ["Sphinx (>=8.1.3,<8.2.0)", "sphinx-rtd-theme (>=1.2.2)"] +gssauth = ["gssapi", "sspilib"] +test = ["distro (>=1.9.0,<1.10.0)", "flake8 (>=6.1,<7.0)", "flake8-pyi (>=24.1.0,<24.2.0)", "gssapi", "k5test", "mypy (>=1.8.0,<1.9.0)", "sspilib", "uvloop (>=0.15.3)"] [[package]] name = "attrs" @@ -766,75 +772,78 @@ files = [ [[package]] name = "cffi" -version = "1.15.1" +version = "1.17.1" description = "Foreign Function Interface for Python calling C code." 
optional = false -python-versions = "*" +python-versions = ">=3.8" files = [ - {file = "cffi-1.15.1-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:a66d3508133af6e8548451b25058d5812812ec3798c886bf38ed24a98216fab2"}, - {file = "cffi-1.15.1-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:470c103ae716238bbe698d67ad020e1db9d9dba34fa5a899b5e21577e6d52ed2"}, - {file = "cffi-1.15.1-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:9ad5db27f9cabae298d151c85cf2bad1d359a1b9c686a275df03385758e2f914"}, - {file = "cffi-1.15.1-cp27-cp27m-win32.whl", hash = "sha256:b3bbeb01c2b273cca1e1e0c5df57f12dce9a4dd331b4fa1635b8bec26350bde3"}, - {file = "cffi-1.15.1-cp27-cp27m-win_amd64.whl", hash = "sha256:e00b098126fd45523dd056d2efba6c5a63b71ffe9f2bbe1a4fe1716e1d0c331e"}, - {file = "cffi-1.15.1-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:d61f4695e6c866a23a21acab0509af1cdfd2c013cf256bbf5b6b5e2695827162"}, - {file = "cffi-1.15.1-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:ed9cb427ba5504c1dc15ede7d516b84757c3e3d7868ccc85121d9310d27eed0b"}, - {file = "cffi-1.15.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:39d39875251ca8f612b6f33e6b1195af86d1b3e60086068be9cc053aa4376e21"}, - {file = "cffi-1.15.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:285d29981935eb726a4399badae8f0ffdff4f5050eaa6d0cfc3f64b857b77185"}, - {file = "cffi-1.15.1-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3eb6971dcff08619f8d91607cfc726518b6fa2a9eba42856be181c6d0d9515fd"}, - {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:21157295583fe8943475029ed5abdcf71eb3911894724e360acff1d61c1d54bc"}, - {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5635bd9cb9731e6d4a1132a498dd34f764034a8ce60cef4f5319c0541159392f"}, - {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2012c72d854c2d03e45d06ae57f40d78e5770d252f195b93f581acf3ba44496e"}, - {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd86c085fae2efd48ac91dd7ccffcfc0571387fe1193d33b6394db7ef31fe2a4"}, - {file = "cffi-1.15.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:fa6693661a4c91757f4412306191b6dc88c1703f780c8234035eac011922bc01"}, - {file = "cffi-1.15.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:59c0b02d0a6c384d453fece7566d1c7e6b7bae4fc5874ef2ef46d56776d61c9e"}, - {file = "cffi-1.15.1-cp310-cp310-win32.whl", hash = "sha256:cba9d6b9a7d64d4bd46167096fc9d2f835e25d7e4c121fb2ddfc6528fb0413b2"}, - {file = "cffi-1.15.1-cp310-cp310-win_amd64.whl", hash = "sha256:ce4bcc037df4fc5e3d184794f27bdaab018943698f4ca31630bc7f84a7b69c6d"}, - {file = "cffi-1.15.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3d08afd128ddaa624a48cf2b859afef385b720bb4b43df214f85616922e6a5ac"}, - {file = "cffi-1.15.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3799aecf2e17cf585d977b780ce79ff0dc9b78d799fc694221ce814c2c19db83"}, - {file = "cffi-1.15.1-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a591fe9e525846e4d154205572a029f653ada1a78b93697f3b5a8f1f2bc055b9"}, - {file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3548db281cd7d2561c9ad9984681c95f7b0e38881201e157833a2342c30d5e8c"}, - {file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:91fc98adde3d7881af9b59ed0294046f3806221863722ba7d8d120c575314325"}, - {file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:94411f22c3985acaec6f83c6df553f2dbe17b698cc7f8ae751ff2237d96b9e3c"}, - {file = "cffi-1.15.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:03425bdae262c76aad70202debd780501fabeaca237cdfddc008987c0e0f59ef"}, - {file = "cffi-1.15.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:cc4d65aeeaa04136a12677d3dd0b1c0c94dc43abac5860ab33cceb42b801c1e8"}, - {file = "cffi-1.15.1-cp311-cp311-win32.whl", hash = "sha256:a0f100c8912c114ff53e1202d0078b425bee3649ae34d7b070e9697f93c5d52d"}, - {file = "cffi-1.15.1-cp311-cp311-win_amd64.whl", hash = "sha256:04ed324bda3cda42b9b695d51bb7d54b680b9719cfab04227cdd1e04e5de3104"}, - {file = "cffi-1.15.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50a74364d85fd319352182ef59c5c790484a336f6db772c1a9231f1c3ed0cbd7"}, - {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e263d77ee3dd201c3a142934a086a4450861778baaeeb45db4591ef65550b0a6"}, - {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cec7d9412a9102bdc577382c3929b337320c4c4c4849f2c5cdd14d7368c5562d"}, - {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4289fc34b2f5316fbb762d75362931e351941fa95fa18789191b33fc4cf9504a"}, - {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:173379135477dc8cac4bc58f45db08ab45d228b3363adb7af79436135d028405"}, - {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:6975a3fac6bc83c4a65c9f9fcab9e47019a11d3d2cf7f3c0d03431bf145a941e"}, - {file = "cffi-1.15.1-cp36-cp36m-win32.whl", hash = "sha256:2470043b93ff09bf8fb1d46d1cb756ce6132c54826661a32d4e4d132e1977adf"}, - {file = "cffi-1.15.1-cp36-cp36m-win_amd64.whl", hash = "sha256:30d78fbc8ebf9c92c9b7823ee18eb92f2e6ef79b45ac84db507f52fbe3ec4497"}, - {file = "cffi-1.15.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:198caafb44239b60e252492445da556afafc7d1e3ab7a1fb3f0584ef6d742375"}, - {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5ef34d190326c3b1f822a5b7a45f6c4535e2f47ed06fec77d3d799c450b2651e"}, - {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8102eaf27e1e448db915d08afa8b41d6c7ca7a04b7d73af6514df10a3e74bd82"}, - {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5df2768244d19ab7f60546d0c7c63ce1581f7af8b5de3eb3004b9b6fc8a9f84b"}, - {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a8c4917bd7ad33e8eb21e9a5bbba979b49d9a97acb3a803092cbc1133e20343c"}, - {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e2642fe3142e4cc4af0799748233ad6da94c62a8bec3a6648bf8ee68b1c7426"}, - {file = "cffi-1.15.1-cp37-cp37m-win32.whl", hash = "sha256:e229a521186c75c8ad9490854fd8bbdd9a0c9aa3a524326b55be83b54d4e0ad9"}, - {file = "cffi-1.15.1-cp37-cp37m-win_amd64.whl", hash = "sha256:a0b71b1b8fbf2b96e41c4d990244165e2c9be83d54962a9a1d118fd8657d2045"}, - {file = "cffi-1.15.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:320dab6e7cb2eacdf0e658569d2575c4dad258c0fcc794f46215e1e39f90f2c3"}, - {file = "cffi-1.15.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:1e74c6b51a9ed6589199c787bf5f9875612ca4a8a0785fb2d4a84429badaf22a"}, - {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5c84c68147988265e60416b57fc83425a78058853509c1b0629c180094904a5"}, - {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3b926aa83d1edb5aa5b427b4053dc420ec295a08e40911296b9eb1b6170f6cca"}, - {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:87c450779d0914f2861b8526e035c5e6da0a3199d8f1add1a665e1cbc6fc6d02"}, - {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4f2c9f67e9821cad2e5f480bc8d83b8742896f1242dba247911072d4fa94c192"}, - {file = "cffi-1.15.1-cp38-cp38-win32.whl", hash = "sha256:8b7ee99e510d7b66cdb6c593f21c043c248537a32e0bedf02e01e9553a172314"}, - {file = "cffi-1.15.1-cp38-cp38-win_amd64.whl", hash = "sha256:00a9ed42e88df81ffae7a8ab6d9356b371399b91dbdf0c3cb1e84c03a13aceb5"}, - {file = "cffi-1.15.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:54a2db7b78338edd780e7ef7f9f6c442500fb0d41a5a4ea24fff1c929d5af585"}, - {file = "cffi-1.15.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:fcd131dd944808b5bdb38e6f5b53013c5aa4f334c5cad0c72742f6eba4b73db0"}, - {file = "cffi-1.15.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7473e861101c9e72452f9bf8acb984947aa1661a7704553a9f6e4baa5ba64415"}, - {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6c9a799e985904922a4d207a94eae35c78ebae90e128f0c4e521ce339396be9d"}, - {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3bcde07039e586f91b45c88f8583ea7cf7a0770df3a1649627bf598332cb6984"}, - {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:33ab79603146aace82c2427da5ca6e58f2b3f2fb5da893ceac0c42218a40be35"}, - {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5d598b938678ebf3c67377cdd45e09d431369c3b1a5b331058c338e201f12b27"}, - {file = "cffi-1.15.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:db0fbb9c62743ce59a9ff687eb5f4afbe77e5e8403d6697f7446e5f609976f76"}, - {file = "cffi-1.15.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:98d85c6a2bef81588d9227dde12db8a7f47f639f4a17c9ae08e773aa9c697bf3"}, - {file = "cffi-1.15.1-cp39-cp39-win32.whl", hash = "sha256:40f4774f5a9d4f5e344f31a32b5096977b5d48560c5592e2f3d2c4374bd543ee"}, - {file = "cffi-1.15.1-cp39-cp39-win_amd64.whl", hash = "sha256:70df4e3b545a17496c9b3f41f5115e69a4f2e77e94e1d2a8e1070bc0c38c8a3c"}, - {file = "cffi-1.15.1.tar.gz", hash = "sha256:d400bfb9a37b1351253cb402671cea7e89bdecc294e8016a707f6d1d8ac934f9"}, + {file = "cffi-1.17.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:df8b1c11f177bc2313ec4b2d46baec87a5f3e71fc8b45dab2ee7cae86d9aba14"}, + {file = "cffi-1.17.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8f2cdc858323644ab277e9bb925ad72ae0e67f69e804f4898c070998d50b1a67"}, + {file = "cffi-1.17.1-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:edae79245293e15384b51f88b00613ba9f7198016a5948b5dddf4917d4d26382"}, + {file = "cffi-1.17.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:45398b671ac6d70e67da8e4224a065cec6a93541bb7aebe1b198a61b58c7b702"}, + {file = "cffi-1.17.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:ad9413ccdeda48c5afdae7e4fa2192157e991ff761e7ab8fdd8926f40b160cc3"}, + {file = "cffi-1.17.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5da5719280082ac6bd9aa7becb3938dc9f9cbd57fac7d2871717b1feb0902ab6"}, + {file = "cffi-1.17.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2bb1a08b8008b281856e5971307cc386a8e9c5b625ac297e853d36da6efe9c17"}, + {file = "cffi-1.17.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:045d61c734659cc045141be4bae381a41d89b741f795af1dd018bfb532fd0df8"}, + {file = "cffi-1.17.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:6883e737d7d9e4899a8a695e00ec36bd4e5e4f18fabe0aca0efe0a4b44cdb13e"}, + {file = "cffi-1.17.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:6b8b4a92e1c65048ff98cfe1f735ef8f1ceb72e3d5f0c25fdb12087a23da22be"}, + {file = "cffi-1.17.1-cp310-cp310-win32.whl", hash = "sha256:c9c3d058ebabb74db66e431095118094d06abf53284d9c81f27300d0e0d8bc7c"}, + {file = "cffi-1.17.1-cp310-cp310-win_amd64.whl", hash = "sha256:0f048dcf80db46f0098ccac01132761580d28e28bc0f78ae0d58048063317e15"}, + {file = "cffi-1.17.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a45e3c6913c5b87b3ff120dcdc03f6131fa0065027d0ed7ee6190736a74cd401"}, + {file = "cffi-1.17.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:30c5e0cb5ae493c04c8b42916e52ca38079f1b235c2f8ae5f4527b963c401caf"}, + {file = "cffi-1.17.1-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f75c7ab1f9e4aca5414ed4d8e5c0e303a34f4421f8a0d47a4d019ceff0ab6af4"}, + {file = "cffi-1.17.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1ed2dd2972641495a3ec98445e09766f077aee98a1c896dcb4ad0d303628e41"}, + {file = "cffi-1.17.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:46bf43160c1a35f7ec506d254e5c890f3c03648a4dbac12d624e4490a7046cd1"}, + {file = "cffi-1.17.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a24ed04c8ffd54b0729c07cee15a81d964e6fee0e3d4d342a27b020d22959dc6"}, + {file = "cffi-1.17.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:610faea79c43e44c71e1ec53a554553fa22321b65fae24889706c0a84d4ad86d"}, + {file = "cffi-1.17.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:a9b15d491f3ad5d692e11f6b71f7857e7835eb677955c00cc0aefcd0669adaf6"}, + {file = "cffi-1.17.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:de2ea4b5833625383e464549fec1bc395c1bdeeb5f25c4a3a82b5a8c756ec22f"}, + {file = "cffi-1.17.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:fc48c783f9c87e60831201f2cce7f3b2e4846bf4d8728eabe54d60700b318a0b"}, + {file = "cffi-1.17.1-cp311-cp311-win32.whl", hash = "sha256:85a950a4ac9c359340d5963966e3e0a94a676bd6245a4b55bc43949eee26a655"}, + {file = "cffi-1.17.1-cp311-cp311-win_amd64.whl", hash = "sha256:caaf0640ef5f5517f49bc275eca1406b0ffa6aa184892812030f04c2abf589a0"}, + {file = "cffi-1.17.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:805b4371bf7197c329fcb3ead37e710d1bca9da5d583f5073b799d5c5bd1eee4"}, + {file = "cffi-1.17.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:733e99bc2df47476e3848417c5a4540522f234dfd4ef3ab7fafdf555b082ec0c"}, + {file = "cffi-1.17.1-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1257bdabf294dceb59f5e70c64a3e2f462c30c7ad68092d01bbbfb1c16b1ba36"}, + {file = "cffi-1.17.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:da95af8214998d77a98cc14e3a3bd00aa191526343078b530ceb0bd710fb48a5"}, + {file = "cffi-1.17.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d63afe322132c194cf832bfec0dc69a99fb9bb6bbd550f161a49e9e855cc78ff"}, + {file = "cffi-1.17.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f79fc4fc25f1c8698ff97788206bb3c2598949bfe0fef03d299eb1b5356ada99"}, + {file = "cffi-1.17.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b62ce867176a75d03a665bad002af8e6d54644fad99a3c70905c543130e39d93"}, + {file = "cffi-1.17.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:386c8bf53c502fff58903061338ce4f4950cbdcb23e2902d86c0f722b786bbe3"}, + {file = "cffi-1.17.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4ceb10419a9adf4460ea14cfd6bc43d08701f0835e979bf821052f1805850fe8"}, + {file = "cffi-1.17.1-cp312-cp312-win32.whl", hash = "sha256:a08d7e755f8ed21095a310a693525137cfe756ce62d066e53f502a83dc550f65"}, + {file = "cffi-1.17.1-cp312-cp312-win_amd64.whl", hash = "sha256:51392eae71afec0d0c8fb1a53b204dbb3bcabcb3c9b807eedf3e1e6ccf2de903"}, + {file = "cffi-1.17.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f3a2b4222ce6b60e2e8b337bb9596923045681d71e5a082783484d845390938e"}, + {file = "cffi-1.17.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:0984a4925a435b1da406122d4d7968dd861c1385afe3b45ba82b750f229811e2"}, + {file = "cffi-1.17.1-cp313-cp313-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d01b12eeeb4427d3110de311e1774046ad344f5b1a7403101878976ecd7a10f3"}, + {file = "cffi-1.17.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:706510fe141c86a69c8ddc029c7910003a17353970cff3b904ff0686a5927683"}, + {file = "cffi-1.17.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:de55b766c7aa2e2a3092c51e0483d700341182f08e67c63630d5b6f200bb28e5"}, + {file = "cffi-1.17.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c59d6e989d07460165cc5ad3c61f9fd8f1b4796eacbd81cee78957842b834af4"}, + {file = "cffi-1.17.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd398dbc6773384a17fe0d3e7eeb8d1a21c2200473ee6806bb5e6a8e62bb73dd"}, + {file = "cffi-1.17.1-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:3edc8d958eb099c634dace3c7e16560ae474aa3803a5df240542b305d14e14ed"}, + {file = "cffi-1.17.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:72e72408cad3d5419375fc87d289076ee319835bdfa2caad331e377589aebba9"}, + {file = "cffi-1.17.1-cp313-cp313-win32.whl", hash = "sha256:e03eab0a8677fa80d646b5ddece1cbeaf556c313dcfac435ba11f107ba117b5d"}, + {file = "cffi-1.17.1-cp313-cp313-win_amd64.whl", hash = "sha256:f6a16c31041f09ead72d69f583767292f750d24913dadacf5756b966aacb3f1a"}, + {file = "cffi-1.17.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:636062ea65bd0195bc012fea9321aca499c0504409f413dc88af450b57ffd03b"}, + {file = "cffi-1.17.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c7eac2ef9b63c79431bc4b25f1cd649d7f061a28808cbc6c47b534bd789ef964"}, + {file = "cffi-1.17.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e221cf152cff04059d011ee126477f0d9588303eb57e88923578ace7baad17f9"}, + {file = "cffi-1.17.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:31000ec67d4221a71bd3f67df918b1f88f676f1c3b535a7eb473255fdc0b83fc"}, + {file = 
"cffi-1.17.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6f17be4345073b0a7b8ea599688f692ac3ef23ce28e5df79c04de519dbc4912c"}, + {file = "cffi-1.17.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e2b1fac190ae3ebfe37b979cc1ce69c81f4e4fe5746bb401dca63a9062cdaf1"}, + {file = "cffi-1.17.1-cp38-cp38-win32.whl", hash = "sha256:7596d6620d3fa590f677e9ee430df2958d2d6d6de2feeae5b20e82c00b76fbf8"}, + {file = "cffi-1.17.1-cp38-cp38-win_amd64.whl", hash = "sha256:78122be759c3f8a014ce010908ae03364d00a1f81ab5c7f4a7a5120607ea56e1"}, + {file = "cffi-1.17.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b2ab587605f4ba0bf81dc0cb08a41bd1c0a5906bd59243d56bad7668a6fc6c16"}, + {file = "cffi-1.17.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:28b16024becceed8c6dfbc75629e27788d8a3f9030691a1dbf9821a128b22c36"}, + {file = "cffi-1.17.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1d599671f396c4723d016dbddb72fe8e0397082b0a77a4fab8028923bec050e8"}, + {file = "cffi-1.17.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ca74b8dbe6e8e8263c0ffd60277de77dcee6c837a3d0881d8c1ead7268c9e576"}, + {file = "cffi-1.17.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f7f5baafcc48261359e14bcd6d9bff6d4b28d9103847c9e136694cb0501aef87"}, + {file = "cffi-1.17.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:98e3969bcff97cae1b2def8ba499ea3d6f31ddfdb7635374834cf89a1a08ecf0"}, + {file = "cffi-1.17.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cdf5ce3acdfd1661132f2a9c19cac174758dc2352bfe37d98aa7512c6b7178b3"}, + {file = "cffi-1.17.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:9755e4345d1ec879e3849e62222a18c7174d65a6a92d5b346b1863912168b595"}, + {file = "cffi-1.17.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:f1e22e8c4419538cb197e4dd60acc919d7696e5ef98ee4da4e01d3f8cfa4cc5a"}, + {file = "cffi-1.17.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:c03e868a0b3bc35839ba98e74211ed2b05d2119be4e8a0f224fba9384f1fe02e"}, + {file = "cffi-1.17.1-cp39-cp39-win32.whl", hash = "sha256:e31ae45bc2e29f6b2abd0de1cc3b9d5205aa847cafaecb8af1476a609a2f6eb7"}, + {file = "cffi-1.17.1-cp39-cp39-win_amd64.whl", hash = "sha256:d016c76bdd850f3c626af19b0542c9677ba156e4ee4fccfdd7848803533ef662"}, + {file = "cffi-1.17.1.tar.gz", hash = "sha256:1c39c6016c32bc48dd54561950ebd6836e1670f2ae46128f67cf49e789c52824"}, ] [package.dependencies] @@ -1114,72 +1123,103 @@ Flask = ">=0.9" [[package]] name = "frozenlist" -version = "1.4.0" +version = "1.5.0" description = "A list-like structure which implements collections.abc.MutableSequence" optional = false python-versions = ">=3.8" files = [ - {file = "frozenlist-1.4.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:764226ceef3125e53ea2cb275000e309c0aa5464d43bd72abd661e27fffc26ab"}, - {file = "frozenlist-1.4.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d6484756b12f40003c6128bfcc3fa9f0d49a687e171186c2d85ec82e3758c559"}, - {file = "frozenlist-1.4.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9ac08e601308e41eb533f232dbf6b7e4cea762f9f84f6357136eed926c15d12c"}, - {file = "frozenlist-1.4.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d081f13b095d74b67d550de04df1c756831f3b83dc9881c38985834387487f1b"}, - {file = "frozenlist-1.4.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:71932b597f9895f011f47f17d6428252fc728ba2ae6024e13c3398a087c2cdea"}, - {file = "frozenlist-1.4.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:981b9ab5a0a3178ff413bca62526bb784249421c24ad7381e39d67981be2c326"}, - {file = "frozenlist-1.4.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e41f3de4df3e80de75845d3e743b3f1c4c8613c3997a912dbf0229fc61a8b963"}, - {file = "frozenlist-1.4.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6918d49b1f90821e93069682c06ffde41829c346c66b721e65a5c62b4bab0300"}, - {file = "frozenlist-1.4.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:0e5c8764c7829343d919cc2dfc587a8db01c4f70a4ebbc49abde5d4b158b007b"}, - {file = "frozenlist-1.4.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:8d0edd6b1c7fb94922bf569c9b092ee187a83f03fb1a63076e7774b60f9481a8"}, - {file = "frozenlist-1.4.0-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:e29cda763f752553fa14c68fb2195150bfab22b352572cb36c43c47bedba70eb"}, - {file = "frozenlist-1.4.0-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:0c7c1b47859ee2cac3846fde1c1dc0f15da6cec5a0e5c72d101e0f83dcb67ff9"}, - {file = "frozenlist-1.4.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:901289d524fdd571be1c7be054f48b1f88ce8dddcbdf1ec698b27d4b8b9e5d62"}, - {file = "frozenlist-1.4.0-cp310-cp310-win32.whl", hash = "sha256:1a0848b52815006ea6596c395f87449f693dc419061cc21e970f139d466dc0a0"}, - {file = "frozenlist-1.4.0-cp310-cp310-win_amd64.whl", hash = "sha256:b206646d176a007466358aa21d85cd8600a415c67c9bd15403336c331a10d956"}, - {file = "frozenlist-1.4.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:de343e75f40e972bae1ef6090267f8260c1446a1695e77096db6cfa25e759a95"}, - {file = "frozenlist-1.4.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ad2a9eb6d9839ae241701d0918f54c51365a51407fd80f6b8289e2dfca977cc3"}, - {file = "frozenlist-1.4.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:bd7bd3b3830247580de99c99ea2a01416dfc3c34471ca1298bccabf86d0ff4dc"}, - {file = "frozenlist-1.4.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bdf1847068c362f16b353163391210269e4f0569a3c166bc6a9f74ccbfc7e839"}, - {file = "frozenlist-1.4.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:38461d02d66de17455072c9ba981d35f1d2a73024bee7790ac2f9e361ef1cd0c"}, - {file = "frozenlist-1.4.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d5a32087d720c608f42caed0ef36d2b3ea61a9d09ee59a5142d6070da9041b8f"}, - {file = "frozenlist-1.4.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dd65632acaf0d47608190a71bfe46b209719bf2beb59507db08ccdbe712f969b"}, - {file = "frozenlist-1.4.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:261b9f5d17cac914531331ff1b1d452125bf5daa05faf73b71d935485b0c510b"}, - {file = "frozenlist-1.4.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:b89ac9768b82205936771f8d2eb3ce88503b1556324c9f903e7156669f521472"}, - {file = "frozenlist-1.4.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:008eb8b31b3ea6896da16c38c1b136cb9fec9e249e77f6211d479db79a4eaf01"}, - {file = "frozenlist-1.4.0-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:e74b0506fa5aa5598ac6a975a12aa8928cbb58e1f5ac8360792ef15de1aa848f"}, - {file = 
"frozenlist-1.4.0-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:490132667476f6781b4c9458298b0c1cddf237488abd228b0b3650e5ecba7467"}, - {file = "frozenlist-1.4.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:76d4711f6f6d08551a7e9ef28c722f4a50dd0fc204c56b4bcd95c6cc05ce6fbb"}, - {file = "frozenlist-1.4.0-cp311-cp311-win32.whl", hash = "sha256:a02eb8ab2b8f200179b5f62b59757685ae9987996ae549ccf30f983f40602431"}, - {file = "frozenlist-1.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:515e1abc578dd3b275d6a5114030b1330ba044ffba03f94091842852f806f1c1"}, - {file = "frozenlist-1.4.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:f0ed05f5079c708fe74bf9027e95125334b6978bf07fd5ab923e9e55e5fbb9d3"}, - {file = "frozenlist-1.4.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ca265542ca427bf97aed183c1676e2a9c66942e822b14dc6e5f42e038f92a503"}, - {file = "frozenlist-1.4.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:491e014f5c43656da08958808588cc6c016847b4360e327a62cb308c791bd2d9"}, - {file = "frozenlist-1.4.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:17ae5cd0f333f94f2e03aaf140bb762c64783935cc764ff9c82dff626089bebf"}, - {file = "frozenlist-1.4.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1e78fb68cf9c1a6aa4a9a12e960a5c9dfbdb89b3695197aa7064705662515de2"}, - {file = "frozenlist-1.4.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d5655a942f5f5d2c9ed93d72148226d75369b4f6952680211972a33e59b1dfdc"}, - {file = "frozenlist-1.4.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c11b0746f5d946fecf750428a95f3e9ebe792c1ee3b1e96eeba145dc631a9672"}, - {file = "frozenlist-1.4.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e66d2a64d44d50d2543405fb183a21f76b3b5fd16f130f5c99187c3fb4e64919"}, - {file = "frozenlist-1.4.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:88f7bc0fcca81f985f78dd0fa68d2c75abf8272b1f5c323ea4a01a4d7a614efc"}, - {file = "frozenlist-1.4.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:5833593c25ac59ede40ed4de6d67eb42928cca97f26feea219f21d0ed0959b79"}, - {file = "frozenlist-1.4.0-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:fec520865f42e5c7f050c2a79038897b1c7d1595e907a9e08e3353293ffc948e"}, - {file = "frozenlist-1.4.0-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:b826d97e4276750beca7c8f0f1a4938892697a6bcd8ec8217b3312dad6982781"}, - {file = "frozenlist-1.4.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:ceb6ec0a10c65540421e20ebd29083c50e6d1143278746a4ef6bcf6153171eb8"}, - {file = "frozenlist-1.4.0-cp38-cp38-win32.whl", hash = "sha256:2b8bcf994563466db019fab287ff390fffbfdb4f905fc77bc1c1d604b1c689cc"}, - {file = "frozenlist-1.4.0-cp38-cp38-win_amd64.whl", hash = "sha256:a6c8097e01886188e5be3e6b14e94ab365f384736aa1fca6a0b9e35bd4a30bc7"}, - {file = "frozenlist-1.4.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:6c38721585f285203e4b4132a352eb3daa19121a035f3182e08e437cface44bf"}, - {file = "frozenlist-1.4.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a0c6da9aee33ff0b1a451e867da0c1f47408112b3391dd43133838339e410963"}, - {file = "frozenlist-1.4.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:93ea75c050c5bb3d98016b4ba2497851eadf0ac154d88a67d7a6816206f6fa7f"}, - {file = "frozenlist-1.4.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f61e2dc5ad442c52b4887f1fdc112f97caeff4d9e6ebe78879364ac59f1663e1"}, - 
{file = "frozenlist-1.4.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:aa384489fefeb62321b238e64c07ef48398fe80f9e1e6afeff22e140e0850eef"}, - {file = "frozenlist-1.4.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:10ff5faaa22786315ef57097a279b833ecab1a0bfb07d604c9cbb1c4cdc2ed87"}, - {file = "frozenlist-1.4.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:007df07a6e3eb3e33e9a1fe6a9db7af152bbd8a185f9aaa6ece10a3529e3e1c6"}, - {file = "frozenlist-1.4.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7f4f399d28478d1f604c2ff9119907af9726aed73680e5ed1ca634d377abb087"}, - {file = "frozenlist-1.4.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:c5374b80521d3d3f2ec5572e05adc94601985cc526fb276d0c8574a6d749f1b3"}, - {file = "frozenlist-1.4.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:ce31ae3e19f3c902de379cf1323d90c649425b86de7bbdf82871b8a2a0615f3d"}, - {file = "frozenlist-1.4.0-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:7211ef110a9194b6042449431e08c4d80c0481e5891e58d429df5899690511c2"}, - {file = "frozenlist-1.4.0-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:556de4430ce324c836789fa4560ca62d1591d2538b8ceb0b4f68fb7b2384a27a"}, - {file = "frozenlist-1.4.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:7645a8e814a3ee34a89c4a372011dcd817964ce8cb273c8ed6119d706e9613e3"}, - {file = "frozenlist-1.4.0-cp39-cp39-win32.whl", hash = "sha256:19488c57c12d4e8095a922f328df3f179c820c212940a498623ed39160bc3c2f"}, - {file = "frozenlist-1.4.0-cp39-cp39-win_amd64.whl", hash = "sha256:6221d84d463fb110bdd7619b69cb43878a11d51cbb9394ae3105d082d5199167"}, - {file = "frozenlist-1.4.0.tar.gz", hash = "sha256:09163bdf0b2907454042edb19f887c6d33806adc71fbd54afc14908bfdc22251"}, + {file = "frozenlist-1.5.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:5b6a66c18b5b9dd261ca98dffcb826a525334b2f29e7caa54e182255c5f6a65a"}, + {file = "frozenlist-1.5.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d1b3eb7b05ea246510b43a7e53ed1653e55c2121019a97e60cad7efb881a97bb"}, + {file = "frozenlist-1.5.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:15538c0cbf0e4fa11d1e3a71f823524b0c46299aed6e10ebb4c2089abd8c3bec"}, + {file = "frozenlist-1.5.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e79225373c317ff1e35f210dd5f1344ff31066ba8067c307ab60254cd3a78ad5"}, + {file = "frozenlist-1.5.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9272fa73ca71266702c4c3e2d4a28553ea03418e591e377a03b8e3659d94fa76"}, + {file = "frozenlist-1.5.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:498524025a5b8ba81695761d78c8dd7382ac0b052f34e66939c42df860b8ff17"}, + {file = "frozenlist-1.5.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:92b5278ed9d50fe610185ecd23c55d8b307d75ca18e94c0e7de328089ac5dcba"}, + {file = "frozenlist-1.5.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7f3c8c1dacd037df16e85227bac13cca58c30da836c6f936ba1df0c05d046d8d"}, + {file = "frozenlist-1.5.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:f2ac49a9bedb996086057b75bf93538240538c6d9b38e57c82d51f75a73409d2"}, + {file = "frozenlist-1.5.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:e66cc454f97053b79c2ab09c17fbe3c825ea6b4de20baf1be28919460dd7877f"}, 
+ {file = "frozenlist-1.5.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:5a3ba5f9a0dfed20337d3e966dc359784c9f96503674c2faf015f7fe8e96798c"}, + {file = "frozenlist-1.5.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:6321899477db90bdeb9299ac3627a6a53c7399c8cd58d25da094007402b039ab"}, + {file = "frozenlist-1.5.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:76e4753701248476e6286f2ef492af900ea67d9706a0155335a40ea21bf3b2f5"}, + {file = "frozenlist-1.5.0-cp310-cp310-win32.whl", hash = "sha256:977701c081c0241d0955c9586ffdd9ce44f7a7795df39b9151cd9a6fd0ce4cfb"}, + {file = "frozenlist-1.5.0-cp310-cp310-win_amd64.whl", hash = "sha256:189f03b53e64144f90990d29a27ec4f7997d91ed3d01b51fa39d2dbe77540fd4"}, + {file = "frozenlist-1.5.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:fd74520371c3c4175142d02a976aee0b4cb4a7cc912a60586ffd8d5929979b30"}, + {file = "frozenlist-1.5.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:2f3f7a0fbc219fb4455264cae4d9f01ad41ae6ee8524500f381de64ffaa077d5"}, + {file = "frozenlist-1.5.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f47c9c9028f55a04ac254346e92977bf0f166c483c74b4232bee19a6697e4778"}, + {file = "frozenlist-1.5.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0996c66760924da6e88922756d99b47512a71cfd45215f3570bf1e0b694c206a"}, + {file = "frozenlist-1.5.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a2fe128eb4edeabe11896cb6af88fca5346059f6c8d807e3b910069f39157869"}, + {file = "frozenlist-1.5.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1a8ea951bbb6cacd492e3948b8da8c502a3f814f5d20935aae74b5df2b19cf3d"}, + {file = "frozenlist-1.5.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:de537c11e4aa01d37db0d403b57bd6f0546e71a82347a97c6a9f0dcc532b3a45"}, + {file = "frozenlist-1.5.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9c2623347b933fcb9095841f1cc5d4ff0b278addd743e0e966cb3d460278840d"}, + {file = "frozenlist-1.5.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:cee6798eaf8b1416ef6909b06f7dc04b60755206bddc599f52232606e18179d3"}, + {file = "frozenlist-1.5.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:f5f9da7f5dbc00a604fe74aa02ae7c98bcede8a3b8b9666f9f86fc13993bc71a"}, + {file = "frozenlist-1.5.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:90646abbc7a5d5c7c19461d2e3eeb76eb0b204919e6ece342feb6032c9325ae9"}, + {file = "frozenlist-1.5.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:bdac3c7d9b705d253b2ce370fde941836a5f8b3c5c2b8fd70940a3ea3af7f4f2"}, + {file = "frozenlist-1.5.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:03d33c2ddbc1816237a67f66336616416e2bbb6beb306e5f890f2eb22b959cdf"}, + {file = "frozenlist-1.5.0-cp311-cp311-win32.whl", hash = "sha256:237f6b23ee0f44066219dae14c70ae38a63f0440ce6750f868ee08775073f942"}, + {file = "frozenlist-1.5.0-cp311-cp311-win_amd64.whl", hash = "sha256:0cc974cc93d32c42e7b0f6cf242a6bd941c57c61b618e78b6c0a96cb72788c1d"}, + {file = "frozenlist-1.5.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:31115ba75889723431aa9a4e77d5f398f5cf976eea3bdf61749731f62d4a4a21"}, + {file = "frozenlist-1.5.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7437601c4d89d070eac8323f121fcf25f88674627505334654fd027b091db09d"}, + {file = "frozenlist-1.5.0-cp312-cp312-macosx_11_0_arm64.whl", hash = 
"sha256:7948140d9f8ece1745be806f2bfdf390127cf1a763b925c4a805c603df5e697e"}, + {file = "frozenlist-1.5.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:feeb64bc9bcc6b45c6311c9e9b99406660a9c05ca8a5b30d14a78555088b0b3a"}, + {file = "frozenlist-1.5.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:683173d371daad49cffb8309779e886e59c2f369430ad28fe715f66d08d4ab1a"}, + {file = "frozenlist-1.5.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7d57d8f702221405a9d9b40f9da8ac2e4a1a8b5285aac6100f3393675f0a85ee"}, + {file = "frozenlist-1.5.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:30c72000fbcc35b129cb09956836c7d7abf78ab5416595e4857d1cae8d6251a6"}, + {file = "frozenlist-1.5.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:000a77d6034fbad9b6bb880f7ec073027908f1b40254b5d6f26210d2dab1240e"}, + {file = "frozenlist-1.5.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:5d7f5a50342475962eb18b740f3beecc685a15b52c91f7d975257e13e029eca9"}, + {file = "frozenlist-1.5.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:87f724d055eb4785d9be84e9ebf0f24e392ddfad00b3fe036e43f489fafc9039"}, + {file = "frozenlist-1.5.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:6e9080bb2fb195a046e5177f10d9d82b8a204c0736a97a153c2466127de87784"}, + {file = "frozenlist-1.5.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:9b93d7aaa36c966fa42efcaf716e6b3900438632a626fb09c049f6a2f09fc631"}, + {file = "frozenlist-1.5.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:52ef692a4bc60a6dd57f507429636c2af8b6046db8b31b18dac02cbc8f507f7f"}, + {file = "frozenlist-1.5.0-cp312-cp312-win32.whl", hash = "sha256:29d94c256679247b33a3dc96cce0f93cbc69c23bf75ff715919332fdbb6a32b8"}, + {file = "frozenlist-1.5.0-cp312-cp312-win_amd64.whl", hash = "sha256:8969190d709e7c48ea386db202d708eb94bdb29207a1f269bab1196ce0dcca1f"}, + {file = "frozenlist-1.5.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:7a1a048f9215c90973402e26c01d1cff8a209e1f1b53f72b95c13db61b00f953"}, + {file = "frozenlist-1.5.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:dd47a5181ce5fcb463b5d9e17ecfdb02b678cca31280639255ce9d0e5aa67af0"}, + {file = "frozenlist-1.5.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1431d60b36d15cda188ea222033eec8e0eab488f39a272461f2e6d9e1a8e63c2"}, + {file = "frozenlist-1.5.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6482a5851f5d72767fbd0e507e80737f9c8646ae7fd303def99bfe813f76cf7f"}, + {file = "frozenlist-1.5.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:44c49271a937625619e862baacbd037a7ef86dd1ee215afc298a417ff3270608"}, + {file = "frozenlist-1.5.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:12f78f98c2f1c2429d42e6a485f433722b0061d5c0b0139efa64f396efb5886b"}, + {file = "frozenlist-1.5.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ce3aa154c452d2467487765e3adc730a8c153af77ad84096bc19ce19a2400840"}, + {file = "frozenlist-1.5.0-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9b7dc0c4338e6b8b091e8faf0db3168a37101943e687f373dce00959583f7439"}, + {file = "frozenlist-1.5.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = 
"sha256:45e0896250900b5aa25180f9aec243e84e92ac84bd4a74d9ad4138ef3f5c97de"}, + {file = "frozenlist-1.5.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:561eb1c9579d495fddb6da8959fd2a1fca2c6d060d4113f5844b433fc02f2641"}, + {file = "frozenlist-1.5.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:df6e2f325bfee1f49f81aaac97d2aa757c7646534a06f8f577ce184afe2f0a9e"}, + {file = "frozenlist-1.5.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:140228863501b44b809fb39ec56b5d4071f4d0aa6d216c19cbb08b8c5a7eadb9"}, + {file = "frozenlist-1.5.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:7707a25d6a77f5d27ea7dc7d1fc608aa0a478193823f88511ef5e6b8a48f9d03"}, + {file = "frozenlist-1.5.0-cp313-cp313-win32.whl", hash = "sha256:31a9ac2b38ab9b5a8933b693db4939764ad3f299fcaa931a3e605bc3460e693c"}, + {file = "frozenlist-1.5.0-cp313-cp313-win_amd64.whl", hash = "sha256:11aabdd62b8b9c4b84081a3c246506d1cddd2dd93ff0ad53ede5defec7886b28"}, + {file = "frozenlist-1.5.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:dd94994fc91a6177bfaafd7d9fd951bc8689b0a98168aa26b5f543868548d3ca"}, + {file = "frozenlist-1.5.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:2d0da8bbec082bf6bf18345b180958775363588678f64998c2b7609e34719b10"}, + {file = "frozenlist-1.5.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:73f2e31ea8dd7df61a359b731716018c2be196e5bb3b74ddba107f694fbd7604"}, + {file = "frozenlist-1.5.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:828afae9f17e6de596825cf4228ff28fbdf6065974e5ac1410cecc22f699d2b3"}, + {file = "frozenlist-1.5.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f1577515d35ed5649d52ab4319db757bb881ce3b2b796d7283e6634d99ace307"}, + {file = "frozenlist-1.5.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2150cc6305a2c2ab33299453e2968611dacb970d2283a14955923062c8d00b10"}, + {file = "frozenlist-1.5.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a72b7a6e3cd2725eff67cd64c8f13335ee18fc3c7befc05aed043d24c7b9ccb9"}, + {file = "frozenlist-1.5.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c16d2fa63e0800723139137d667e1056bee1a1cf7965153d2d104b62855e9b99"}, + {file = "frozenlist-1.5.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:17dcc32fc7bda7ce5875435003220a457bcfa34ab7924a49a1c19f55b6ee185c"}, + {file = "frozenlist-1.5.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:97160e245ea33d8609cd2b8fd997c850b56db147a304a262abc2b3be021a9171"}, + {file = "frozenlist-1.5.0-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:f1e6540b7fa044eee0bb5111ada694cf3dc15f2b0347ca125ee9ca984d5e9e6e"}, + {file = "frozenlist-1.5.0-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:91d6c171862df0a6c61479d9724f22efb6109111017c87567cfeb7b5d1449fdf"}, + {file = "frozenlist-1.5.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:c1fac3e2ace2eb1052e9f7c7db480818371134410e1f5c55d65e8f3ac6d1407e"}, + {file = "frozenlist-1.5.0-cp38-cp38-win32.whl", hash = "sha256:b97f7b575ab4a8af9b7bc1d2ef7f29d3afee2226bd03ca3875c16451ad5a7723"}, + {file = "frozenlist-1.5.0-cp38-cp38-win_amd64.whl", hash = "sha256:374ca2dabdccad8e2a76d40b1d037f5bd16824933bf7bcea3e59c891fd4a0923"}, + {file = "frozenlist-1.5.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:9bbcdfaf4af7ce002694a4e10a0159d5a8d20056a12b05b45cea944a4953f972"}, + {file = "frozenlist-1.5.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = 
"sha256:1893f948bf6681733aaccf36c5232c231e3b5166d607c5fa77773611df6dc336"}, + {file = "frozenlist-1.5.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:2b5e23253bb709ef57a8e95e6ae48daa9ac5f265637529e4ce6b003a37b2621f"}, + {file = "frozenlist-1.5.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0f253985bb515ecd89629db13cb58d702035ecd8cfbca7d7a7e29a0e6d39af5f"}, + {file = "frozenlist-1.5.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:04a5c6babd5e8fb7d3c871dc8b321166b80e41b637c31a995ed844a6139942b6"}, + {file = "frozenlist-1.5.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a9fe0f1c29ba24ba6ff6abf688cb0b7cf1efab6b6aa6adc55441773c252f7411"}, + {file = "frozenlist-1.5.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:226d72559fa19babe2ccd920273e767c96a49b9d3d38badd7c91a0fdeda8ea08"}, + {file = "frozenlist-1.5.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:15b731db116ab3aedec558573c1a5eec78822b32292fe4f2f0345b7f697745c2"}, + {file = "frozenlist-1.5.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:366d8f93e3edfe5a918c874702f78faac300209a4d5bf38352b2c1bdc07a766d"}, + {file = "frozenlist-1.5.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:1b96af8c582b94d381a1c1f51ffaedeb77c821c690ea5f01da3d70a487dd0a9b"}, + {file = "frozenlist-1.5.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:c03eff4a41bd4e38415cbed054bbaff4a075b093e2394b6915dca34a40d1e38b"}, + {file = "frozenlist-1.5.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:50cf5e7ee9b98f22bdecbabf3800ae78ddcc26e4a435515fc72d97903e8488e0"}, + {file = "frozenlist-1.5.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:1e76bfbc72353269c44e0bc2cfe171900fbf7f722ad74c9a7b638052afe6a00c"}, + {file = "frozenlist-1.5.0-cp39-cp39-win32.whl", hash = "sha256:666534d15ba8f0fda3f53969117383d5dc021266b3c1a42c9ec4855e4b58b9d3"}, + {file = "frozenlist-1.5.0-cp39-cp39-win_amd64.whl", hash = "sha256:5c28f4b5dbef8a0d8aad0d4de24d1e9e981728628afaf4ea0792f5d0939372f0"}, + {file = "frozenlist-1.5.0-py3-none-any.whl", hash = "sha256:d994863bba198a4a518b467bb971c56e1db3f180a25c6cf7bb1949c267f748c3"}, + {file = "frozenlist-1.5.0.tar.gz", hash = "sha256:81d5af29e61b9c8348e876d442253723928dce6433e0e76cd925cd83f1b4b817"}, ] [[package]] @@ -2295,109 +2335,131 @@ files = [ [[package]] name = "pydantic" -version = "2.7.1" +version = "2.10.4" description = "Data validation using Python type hints" optional = false python-versions = ">=3.8" files = [ - {file = "pydantic-2.7.1-py3-none-any.whl", hash = "sha256:e029badca45266732a9a79898a15ae2e8b14840b1eabbb25844be28f0b33f3d5"}, - {file = "pydantic-2.7.1.tar.gz", hash = "sha256:e9dbb5eada8abe4d9ae5f46b9939aead650cd2b68f249bb3a8139dbe125803cc"}, + {file = "pydantic-2.10.4-py3-none-any.whl", hash = "sha256:597e135ea68be3a37552fb524bc7d0d66dcf93d395acd93a00682f1efcb8ee3d"}, + {file = "pydantic-2.10.4.tar.gz", hash = "sha256:82f12e9723da6de4fe2ba888b5971157b3be7ad914267dea8f05f82b28254f06"}, ] [package.dependencies] -annotated-types = ">=0.4.0" -pydantic-core = "2.18.2" -typing-extensions = ">=4.6.1" +annotated-types = ">=0.6.0" +pydantic-core = "2.27.2" +typing-extensions = ">=4.12.2" [package.extras] email = ["email-validator (>=2.0.0)"] +timezone = ["tzdata"] [[package]] name = "pydantic-core" -version = "2.18.2" +version = "2.27.2" description = "Core functionality for Pydantic validation and serialization" 
optional = false python-versions = ">=3.8" files = [ - {file = "pydantic_core-2.18.2-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:9e08e867b306f525802df7cd16c44ff5ebbe747ff0ca6cf3fde7f36c05a59a81"}, - {file = "pydantic_core-2.18.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f0a21cbaa69900cbe1a2e7cad2aa74ac3cf21b10c3efb0fa0b80305274c0e8a2"}, - {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0680b1f1f11fda801397de52c36ce38ef1c1dc841a0927a94f226dea29c3ae3d"}, - {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:95b9d5e72481d3780ba3442eac863eae92ae43a5f3adb5b4d0a1de89d42bb250"}, - {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4fcf5cd9c4b655ad666ca332b9a081112cd7a58a8b5a6ca7a3104bc950f2038"}, - {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9b5155ff768083cb1d62f3e143b49a8a3432e6789a3abee8acd005c3c7af1c74"}, - {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:553ef617b6836fc7e4df130bb851e32fe357ce36336d897fd6646d6058d980af"}, - {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b89ed9eb7d616ef5714e5590e6cf7f23b02d0d539767d33561e3675d6f9e3857"}, - {file = "pydantic_core-2.18.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:75f7e9488238e920ab6204399ded280dc4c307d034f3924cd7f90a38b1829563"}, - {file = "pydantic_core-2.18.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:ef26c9e94a8c04a1b2924149a9cb081836913818e55681722d7f29af88fe7b38"}, - {file = "pydantic_core-2.18.2-cp310-none-win32.whl", hash = "sha256:182245ff6b0039e82b6bb585ed55a64d7c81c560715d1bad0cbad6dfa07b4027"}, - {file = "pydantic_core-2.18.2-cp310-none-win_amd64.whl", hash = "sha256:e23ec367a948b6d812301afc1b13f8094ab7b2c280af66ef450efc357d2ae543"}, - {file = "pydantic_core-2.18.2-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:219da3f096d50a157f33645a1cf31c0ad1fe829a92181dd1311022f986e5fbe3"}, - {file = "pydantic_core-2.18.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:cc1cfd88a64e012b74e94cd00bbe0f9c6df57049c97f02bb07d39e9c852e19a4"}, - {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:05b7133a6e6aeb8df37d6f413f7705a37ab4031597f64ab56384c94d98fa0e90"}, - {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:224c421235f6102e8737032483f43c1a8cfb1d2f45740c44166219599358c2cd"}, - {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b14d82cdb934e99dda6d9d60dc84a24379820176cc4a0d123f88df319ae9c150"}, - {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2728b01246a3bba6de144f9e3115b532ee44bd6cf39795194fb75491824a1413"}, - {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:470b94480bb5ee929f5acba6995251ada5e059a5ef3e0dfc63cca287283ebfa6"}, - {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:997abc4df705d1295a42f95b4eec4950a37ad8ae46d913caeee117b6b198811c"}, - {file = "pydantic_core-2.18.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:75250dbc5290e3f1a0f4618db35e51a165186f9034eff158f3d490b3fed9f8a0"}, - {file = 
"pydantic_core-2.18.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:4456f2dca97c425231d7315737d45239b2b51a50dc2b6f0c2bb181fce6207664"}, - {file = "pydantic_core-2.18.2-cp311-none-win32.whl", hash = "sha256:269322dcc3d8bdb69f054681edff86276b2ff972447863cf34c8b860f5188e2e"}, - {file = "pydantic_core-2.18.2-cp311-none-win_amd64.whl", hash = "sha256:800d60565aec896f25bc3cfa56d2277d52d5182af08162f7954f938c06dc4ee3"}, - {file = "pydantic_core-2.18.2-cp311-none-win_arm64.whl", hash = "sha256:1404c69d6a676245199767ba4f633cce5f4ad4181f9d0ccb0577e1f66cf4c46d"}, - {file = "pydantic_core-2.18.2-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:fb2bd7be70c0fe4dfd32c951bc813d9fe6ebcbfdd15a07527796c8204bd36242"}, - {file = "pydantic_core-2.18.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6132dd3bd52838acddca05a72aafb6eab6536aa145e923bb50f45e78b7251043"}, - {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7d904828195733c183d20a54230c0df0eb46ec746ea1a666730787353e87182"}, - {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c9bd70772c720142be1020eac55f8143a34ec9f82d75a8e7a07852023e46617f"}, - {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2b8ed04b3582771764538f7ee7001b02e1170223cf9b75dff0bc698fadb00cf3"}, - {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e6dac87ddb34aaec85f873d737e9d06a3555a1cc1a8e0c44b7f8d5daeb89d86f"}, - {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ca4ae5a27ad7a4ee5170aebce1574b375de390bc01284f87b18d43a3984df72"}, - {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:886eec03591b7cf058467a70a87733b35f44707bd86cf64a615584fd72488b7c"}, - {file = "pydantic_core-2.18.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:ca7b0c1f1c983e064caa85f3792dd2fe3526b3505378874afa84baf662e12241"}, - {file = "pydantic_core-2.18.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4b4356d3538c3649337df4074e81b85f0616b79731fe22dd11b99499b2ebbdf3"}, - {file = "pydantic_core-2.18.2-cp312-none-win32.whl", hash = "sha256:8b172601454f2d7701121bbec3425dd71efcb787a027edf49724c9cefc14c038"}, - {file = "pydantic_core-2.18.2-cp312-none-win_amd64.whl", hash = "sha256:b1bd7e47b1558ea872bd16c8502c414f9e90dcf12f1395129d7bb42a09a95438"}, - {file = "pydantic_core-2.18.2-cp312-none-win_arm64.whl", hash = "sha256:98758d627ff397e752bc339272c14c98199c613f922d4a384ddc07526c86a2ec"}, - {file = "pydantic_core-2.18.2-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:9fdad8e35f278b2c3eb77cbdc5c0a49dada440657bf738d6905ce106dc1de439"}, - {file = "pydantic_core-2.18.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:1d90c3265ae107f91a4f279f4d6f6f1d4907ac76c6868b27dc7fb33688cfb347"}, - {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:390193c770399861d8df9670fb0d1874f330c79caaca4642332df7c682bf6b91"}, - {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:82d5d4d78e4448683cb467897fe24e2b74bb7b973a541ea1dcfec1d3cbce39fb"}, - {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4774f3184d2ef3e14e8693194f661dea5a4d6ca4e3dc8e39786d33a94865cefd"}, - {file = 
"pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d4d938ec0adf5167cb335acb25a4ee69a8107e4984f8fbd2e897021d9e4ca21b"}, - {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e0e8b1be28239fc64a88a8189d1df7fad8be8c1ae47fcc33e43d4be15f99cc70"}, - {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:868649da93e5a3d5eacc2b5b3b9235c98ccdbfd443832f31e075f54419e1b96b"}, - {file = "pydantic_core-2.18.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:78363590ef93d5d226ba21a90a03ea89a20738ee5b7da83d771d283fd8a56761"}, - {file = "pydantic_core-2.18.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:852e966fbd035a6468fc0a3496589b45e2208ec7ca95c26470a54daed82a0788"}, - {file = "pydantic_core-2.18.2-cp38-none-win32.whl", hash = "sha256:6a46e22a707e7ad4484ac9ee9f290f9d501df45954184e23fc29408dfad61350"}, - {file = "pydantic_core-2.18.2-cp38-none-win_amd64.whl", hash = "sha256:d91cb5ea8b11607cc757675051f61b3d93f15eca3cefb3e6c704a5d6e8440f4e"}, - {file = "pydantic_core-2.18.2-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:ae0a8a797a5e56c053610fa7be147993fe50960fa43609ff2a9552b0e07013e8"}, - {file = "pydantic_core-2.18.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:042473b6280246b1dbf530559246f6842b56119c2926d1e52b631bdc46075f2a"}, - {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1a388a77e629b9ec814c1b1e6b3b595fe521d2cdc625fcca26fbc2d44c816804"}, - {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e25add29b8f3b233ae90ccef2d902d0ae0432eb0d45370fe315d1a5cf231004b"}, - {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f459a5ce8434614dfd39bbebf1041952ae01da6bed9855008cb33b875cb024c0"}, - {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:eff2de745698eb46eeb51193a9f41d67d834d50e424aef27df2fcdee1b153845"}, - {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a8309f67285bdfe65c372ea3722b7a5642680f3dba538566340a9d36e920b5f0"}, - {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f93a8a2e3938ff656a7c1bc57193b1319960ac015b6e87d76c76bf14fe0244b4"}, - {file = "pydantic_core-2.18.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:22057013c8c1e272eb8d0eebc796701167d8377441ec894a8fed1af64a0bf399"}, - {file = "pydantic_core-2.18.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:cfeecd1ac6cc1fb2692c3d5110781c965aabd4ec5d32799773ca7b1456ac636b"}, - {file = "pydantic_core-2.18.2-cp39-none-win32.whl", hash = "sha256:0d69b4c2f6bb3e130dba60d34c0845ba31b69babdd3f78f7c0c8fae5021a253e"}, - {file = "pydantic_core-2.18.2-cp39-none-win_amd64.whl", hash = "sha256:d9319e499827271b09b4e411905b24a426b8fb69464dfa1696258f53a3334641"}, - {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:a1874c6dd4113308bd0eb568418e6114b252afe44319ead2b4081e9b9521fe75"}, - {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:ccdd111c03bfd3666bd2472b674c6899550e09e9f298954cfc896ab92b5b0e6d"}, - {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e18609ceaa6eed63753037fc06ebb16041d17d28199ae5aba0052c51449650a9"}, - {file = 
"pydantic_core-2.18.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e5c584d357c4e2baf0ff7baf44f4994be121e16a2c88918a5817331fc7599d7"}, - {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:43f0f463cf89ace478de71a318b1b4f05ebc456a9b9300d027b4b57c1a2064fb"}, - {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:e1b395e58b10b73b07b7cf740d728dd4ff9365ac46c18751bf8b3d8cca8f625a"}, - {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:0098300eebb1c837271d3d1a2cd2911e7c11b396eac9661655ee524a7f10587b"}, - {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:36789b70d613fbac0a25bb07ab3d9dba4d2e38af609c020cf4d888d165ee0bf3"}, - {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:3f9a801e7c8f1ef8718da265bba008fa121243dfe37c1cea17840b0944dfd72c"}, - {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:3a6515ebc6e69d85502b4951d89131ca4e036078ea35533bb76327f8424531ce"}, - {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:20aca1e2298c56ececfd8ed159ae4dde2df0781988c97ef77d5c16ff4bd5b400"}, - {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:223ee893d77a310a0391dca6df00f70bbc2f36a71a895cecd9a0e762dc37b349"}, - {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2334ce8c673ee93a1d6a65bd90327588387ba073c17e61bf19b4fd97d688d63c"}, - {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:cbca948f2d14b09d20268cda7b0367723d79063f26c4ffc523af9042cad95592"}, - {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:b3ef08e20ec49e02d5c6717a91bb5af9b20f1805583cb0adfe9ba2c6b505b5ae"}, - {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:c6fdc8627910eed0c01aed6a390a252fe3ea6d472ee70fdde56273f198938374"}, - {file = "pydantic_core-2.18.2.tar.gz", hash = "sha256:2e29d20810dfc3043ee13ac7d9e25105799817683348823f305ab3f349b9386e"}, + {file = "pydantic_core-2.27.2-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:2d367ca20b2f14095a8f4fa1210f5a7b78b8a20009ecced6b12818f455b1e9fa"}, + {file = "pydantic_core-2.27.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:491a2b73db93fab69731eaee494f320faa4e093dbed776be1a829c2eb222c34c"}, + {file = "pydantic_core-2.27.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7969e133a6f183be60e9f6f56bfae753585680f3b7307a8e555a948d443cc05a"}, + {file = "pydantic_core-2.27.2-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3de9961f2a346257caf0aa508a4da705467f53778e9ef6fe744c038119737ef5"}, + {file = "pydantic_core-2.27.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e2bb4d3e5873c37bb3dd58714d4cd0b0e6238cebc4177ac8fe878f8b3aa8e74c"}, + {file = "pydantic_core-2.27.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:280d219beebb0752699480fe8f1dc61ab6615c2046d76b7ab7ee38858de0a4e7"}, + {file = "pydantic_core-2.27.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47956ae78b6422cbd46f772f1746799cbb862de838fd8d1fbd34a82e05b0983a"}, + {file = "pydantic_core-2.27.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = 
"sha256:14d4a5c49d2f009d62a2a7140d3064f686d17a5d1a268bc641954ba181880236"}, + {file = "pydantic_core-2.27.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:337b443af21d488716f8d0b6164de833e788aa6bd7e3a39c005febc1284f4962"}, + {file = "pydantic_core-2.27.2-cp310-cp310-musllinux_1_1_armv7l.whl", hash = "sha256:03d0f86ea3184a12f41a2d23f7ccb79cdb5a18e06993f8a45baa8dfec746f0e9"}, + {file = "pydantic_core-2.27.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:7041c36f5680c6e0f08d922aed302e98b3745d97fe1589db0a3eebf6624523af"}, + {file = "pydantic_core-2.27.2-cp310-cp310-win32.whl", hash = "sha256:50a68f3e3819077be2c98110c1f9dcb3817e93f267ba80a2c05bb4f8799e2ff4"}, + {file = "pydantic_core-2.27.2-cp310-cp310-win_amd64.whl", hash = "sha256:e0fd26b16394ead34a424eecf8a31a1f5137094cabe84a1bcb10fa6ba39d3d31"}, + {file = "pydantic_core-2.27.2-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:8e10c99ef58cfdf2a66fc15d66b16c4a04f62bca39db589ae8cba08bc55331bc"}, + {file = "pydantic_core-2.27.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:26f32e0adf166a84d0cb63be85c562ca8a6fa8de28e5f0d92250c6b7e9e2aff7"}, + {file = "pydantic_core-2.27.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8c19d1ea0673cd13cc2f872f6c9ab42acc4e4f492a7ca9d3795ce2b112dd7e15"}, + {file = "pydantic_core-2.27.2-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5e68c4446fe0810e959cdff46ab0a41ce2f2c86d227d96dc3847af0ba7def306"}, + {file = "pydantic_core-2.27.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d9640b0059ff4f14d1f37321b94061c6db164fbe49b334b31643e0528d100d99"}, + {file = "pydantic_core-2.27.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:40d02e7d45c9f8af700f3452f329ead92da4c5f4317ca9b896de7ce7199ea459"}, + {file = "pydantic_core-2.27.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1c1fd185014191700554795c99b347d64f2bb637966c4cfc16998a0ca700d048"}, + {file = "pydantic_core-2.27.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d81d2068e1c1228a565af076598f9e7451712700b673de8f502f0334f281387d"}, + {file = "pydantic_core-2.27.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:1a4207639fb02ec2dbb76227d7c751a20b1a6b4bc52850568e52260cae64ca3b"}, + {file = "pydantic_core-2.27.2-cp311-cp311-musllinux_1_1_armv7l.whl", hash = "sha256:3de3ce3c9ddc8bbd88f6e0e304dea0e66d843ec9de1b0042b0911c1663ffd474"}, + {file = "pydantic_core-2.27.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:30c5f68ded0c36466acede341551106821043e9afaad516adfb6e8fa80a4e6a6"}, + {file = "pydantic_core-2.27.2-cp311-cp311-win32.whl", hash = "sha256:c70c26d2c99f78b125a3459f8afe1aed4d9687c24fd677c6a4436bc042e50d6c"}, + {file = "pydantic_core-2.27.2-cp311-cp311-win_amd64.whl", hash = "sha256:08e125dbdc505fa69ca7d9c499639ab6407cfa909214d500897d02afb816e7cc"}, + {file = "pydantic_core-2.27.2-cp311-cp311-win_arm64.whl", hash = "sha256:26f0d68d4b235a2bae0c3fc585c585b4ecc51382db0e3ba402a22cbc440915e4"}, + {file = "pydantic_core-2.27.2-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:9e0c8cfefa0ef83b4da9588448b6d8d2a2bf1a53c3f1ae5fca39eb3061e2f0b0"}, + {file = "pydantic_core-2.27.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:83097677b8e3bd7eaa6775720ec8e0405f1575015a463285a92bfdfe254529ef"}, + {file = "pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:172fce187655fece0c90d90a678424b013f8fbb0ca8b036ac266749c09438cb7"}, + {file = 
"pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:519f29f5213271eeeeb3093f662ba2fd512b91c5f188f3bb7b27bc5973816934"}, + {file = "pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:05e3a55d124407fffba0dd6b0c0cd056d10e983ceb4e5dbd10dda135c31071d6"}, + {file = "pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9c3ed807c7b91de05e63930188f19e921d1fe90de6b4f5cd43ee7fcc3525cb8c"}, + {file = "pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6fb4aadc0b9a0c063206846d603b92030eb6f03069151a625667f982887153e2"}, + {file = "pydantic_core-2.27.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:28ccb213807e037460326424ceb8b5245acb88f32f3d2777427476e1b32c48c4"}, + {file = "pydantic_core-2.27.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:de3cd1899e2c279b140adde9357c4495ed9d47131b4a4eaff9052f23398076b3"}, + {file = "pydantic_core-2.27.2-cp312-cp312-musllinux_1_1_armv7l.whl", hash = "sha256:220f892729375e2d736b97d0e51466252ad84c51857d4d15f5e9692f9ef12be4"}, + {file = "pydantic_core-2.27.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:a0fcd29cd6b4e74fe8ddd2c90330fd8edf2e30cb52acda47f06dd615ae72da57"}, + {file = "pydantic_core-2.27.2-cp312-cp312-win32.whl", hash = "sha256:1e2cb691ed9834cd6a8be61228471d0a503731abfb42f82458ff27be7b2186fc"}, + {file = "pydantic_core-2.27.2-cp312-cp312-win_amd64.whl", hash = "sha256:cc3f1a99a4f4f9dd1de4fe0312c114e740b5ddead65bb4102884b384c15d8bc9"}, + {file = "pydantic_core-2.27.2-cp312-cp312-win_arm64.whl", hash = "sha256:3911ac9284cd8a1792d3cb26a2da18f3ca26c6908cc434a18f730dc0db7bfa3b"}, + {file = "pydantic_core-2.27.2-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:7d14bd329640e63852364c306f4d23eb744e0f8193148d4044dd3dacdaacbd8b"}, + {file = "pydantic_core-2.27.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:82f91663004eb8ed30ff478d77c4d1179b3563df6cdb15c0817cd1cdaf34d154"}, + {file = "pydantic_core-2.27.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:71b24c7d61131bb83df10cc7e687433609963a944ccf45190cfc21e0887b08c9"}, + {file = "pydantic_core-2.27.2-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:fa8e459d4954f608fa26116118bb67f56b93b209c39b008277ace29937453dc9"}, + {file = "pydantic_core-2.27.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ce8918cbebc8da707ba805b7fd0b382816858728ae7fe19a942080c24e5b7cd1"}, + {file = "pydantic_core-2.27.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:eda3f5c2a021bbc5d976107bb302e0131351c2ba54343f8a496dc8783d3d3a6a"}, + {file = "pydantic_core-2.27.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bd8086fa684c4775c27f03f062cbb9eaa6e17f064307e86b21b9e0abc9c0f02e"}, + {file = "pydantic_core-2.27.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:8d9b3388db186ba0c099a6d20f0604a44eabdeef1777ddd94786cdae158729e4"}, + {file = "pydantic_core-2.27.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:7a66efda2387de898c8f38c0cf7f14fca0b51a8ef0b24bfea5849f1b3c95af27"}, + {file = "pydantic_core-2.27.2-cp313-cp313-musllinux_1_1_armv7l.whl", hash = "sha256:18a101c168e4e092ab40dbc2503bdc0f62010e95d292b27827871dc85450d7ee"}, + {file = "pydantic_core-2.27.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = 
"sha256:ba5dd002f88b78a4215ed2f8ddbdf85e8513382820ba15ad5ad8955ce0ca19a1"}, + {file = "pydantic_core-2.27.2-cp313-cp313-win32.whl", hash = "sha256:1ebaf1d0481914d004a573394f4be3a7616334be70261007e47c2a6fe7e50130"}, + {file = "pydantic_core-2.27.2-cp313-cp313-win_amd64.whl", hash = "sha256:953101387ecf2f5652883208769a79e48db18c6df442568a0b5ccd8c2723abee"}, + {file = "pydantic_core-2.27.2-cp313-cp313-win_arm64.whl", hash = "sha256:ac4dbfd1691affb8f48c2c13241a2e3b60ff23247cbcf981759c768b6633cf8b"}, + {file = "pydantic_core-2.27.2-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:d3e8d504bdd3f10835468f29008d72fc8359d95c9c415ce6e767203db6127506"}, + {file = "pydantic_core-2.27.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:521eb9b7f036c9b6187f0b47318ab0d7ca14bd87f776240b90b21c1f4f149320"}, + {file = "pydantic_core-2.27.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:85210c4d99a0114f5a9481b44560d7d1e35e32cc5634c656bc48e590b669b145"}, + {file = "pydantic_core-2.27.2-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d716e2e30c6f140d7560ef1538953a5cd1a87264c737643d481f2779fc247fe1"}, + {file = "pydantic_core-2.27.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f66d89ba397d92f840f8654756196d93804278457b5fbede59598a1f9f90b228"}, + {file = "pydantic_core-2.27.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:669e193c1c576a58f132e3158f9dfa9662969edb1a250c54d8fa52590045f046"}, + {file = "pydantic_core-2.27.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9fdbe7629b996647b99c01b37f11170a57ae675375b14b8c13b8518b8320ced5"}, + {file = "pydantic_core-2.27.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d262606bf386a5ba0b0af3b97f37c83d7011439e3dc1a9298f21efb292e42f1a"}, + {file = "pydantic_core-2.27.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:cabb9bcb7e0d97f74df8646f34fc76fbf793b7f6dc2438517d7a9e50eee4f14d"}, + {file = "pydantic_core-2.27.2-cp38-cp38-musllinux_1_1_armv7l.whl", hash = "sha256:d2d63f1215638d28221f664596b1ccb3944f6e25dd18cd3b86b0a4c408d5ebb9"}, + {file = "pydantic_core-2.27.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:bca101c00bff0adb45a833f8451b9105d9df18accb8743b08107d7ada14bd7da"}, + {file = "pydantic_core-2.27.2-cp38-cp38-win32.whl", hash = "sha256:f6f8e111843bbb0dee4cb6594cdc73e79b3329b526037ec242a3e49012495b3b"}, + {file = "pydantic_core-2.27.2-cp38-cp38-win_amd64.whl", hash = "sha256:fd1aea04935a508f62e0d0ef1f5ae968774a32afc306fb8545e06f5ff5cdf3ad"}, + {file = "pydantic_core-2.27.2-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:c10eb4f1659290b523af58fa7cffb452a61ad6ae5613404519aee4bfbf1df993"}, + {file = "pydantic_core-2.27.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ef592d4bad47296fb11f96cd7dc898b92e795032b4894dfb4076cfccd43a9308"}, + {file = "pydantic_core-2.27.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c61709a844acc6bf0b7dce7daae75195a10aac96a596ea1b776996414791ede4"}, + {file = "pydantic_core-2.27.2-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:42c5f762659e47fdb7b16956c71598292f60a03aa92f8b6351504359dbdba6cf"}, + {file = "pydantic_core-2.27.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4c9775e339e42e79ec99c441d9730fccf07414af63eac2f0e48e08fd38a64d76"}, + {file = "pydantic_core-2.27.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:57762139821c31847cfb2df63c12f725788bd9f04bc2fb392790959b8f70f118"}, + {file = "pydantic_core-2.27.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0d1e85068e818c73e048fe28cfc769040bb1f475524f4745a5dc621f75ac7630"}, + {file = "pydantic_core-2.27.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:097830ed52fd9e427942ff3b9bc17fab52913b2f50f2880dc4a5611446606a54"}, + {file = "pydantic_core-2.27.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:044a50963a614ecfae59bb1eaf7ea7efc4bc62f49ed594e18fa1e5d953c40e9f"}, + {file = "pydantic_core-2.27.2-cp39-cp39-musllinux_1_1_armv7l.whl", hash = "sha256:4e0b4220ba5b40d727c7f879eac379b822eee5d8fff418e9d3381ee45b3b0362"}, + {file = "pydantic_core-2.27.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5e4f4bb20d75e9325cc9696c6802657b58bc1dbbe3022f32cc2b2b632c3fbb96"}, + {file = "pydantic_core-2.27.2-cp39-cp39-win32.whl", hash = "sha256:cca63613e90d001b9f2f9a9ceb276c308bfa2a43fafb75c8031c4f66039e8c6e"}, + {file = "pydantic_core-2.27.2-cp39-cp39-win_amd64.whl", hash = "sha256:77d1bca19b0f7021b3a982e6f903dcd5b2b06076def36a652e3907f596e29f67"}, + {file = "pydantic_core-2.27.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:2bf14caea37e91198329b828eae1618c068dfb8ef17bb33287a7ad4b61ac314e"}, + {file = "pydantic_core-2.27.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:b0cb791f5b45307caae8810c2023a184c74605ec3bcbb67d13846c28ff731ff8"}, + {file = "pydantic_core-2.27.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:688d3fd9fcb71f41c4c015c023d12a79d1c4c0732ec9eb35d96e3388a120dcf3"}, + {file = "pydantic_core-2.27.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3d591580c34f4d731592f0e9fe40f9cc1b430d297eecc70b962e93c5c668f15f"}, + {file = "pydantic_core-2.27.2-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:82f986faf4e644ffc189a7f1aafc86e46ef70372bb153e7001e8afccc6e54133"}, + {file = "pydantic_core-2.27.2-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:bec317a27290e2537f922639cafd54990551725fc844249e64c523301d0822fc"}, + {file = "pydantic_core-2.27.2-pp310-pypy310_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:0296abcb83a797db256b773f45773da397da75a08f5fcaef41f2044adec05f50"}, + {file = "pydantic_core-2.27.2-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:0d75070718e369e452075a6017fbf187f788e17ed67a3abd47fa934d001863d9"}, + {file = "pydantic_core-2.27.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:7e17b560be3c98a8e3aa66ce828bdebb9e9ac6ad5466fba92eb74c4c95cb1151"}, + {file = "pydantic_core-2.27.2-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:c33939a82924da9ed65dab5a65d427205a73181d8098e79b6b426bdf8ad4e656"}, + {file = "pydantic_core-2.27.2-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:00bad2484fa6bda1e216e7345a798bd37c68fb2d97558edd584942aa41b7d278"}, + {file = "pydantic_core-2.27.2-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c817e2b40aba42bac6f457498dacabc568c3b7a986fc9ba7c8d9d260b71485fb"}, + {file = "pydantic_core-2.27.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:251136cdad0cb722e93732cb45ca5299fb56e1344a833640bf93b2803f8d1bfd"}, + {file = "pydantic_core-2.27.2-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d2088237af596f0a524d3afc39ab3b036e8adb054ee57cbb1dcf8e09da5b29cc"}, + {file = 
"pydantic_core-2.27.2-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:d4041c0b966a84b4ae7a09832eb691a35aec90910cd2dbe7a208de59be77965b"}, + {file = "pydantic_core-2.27.2-pp39-pypy39_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:8083d4e875ebe0b864ffef72a4304827015cff328a1be6e22cc850753bfb122b"}, + {file = "pydantic_core-2.27.2-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:f141ee28a0ad2123b6611b6ceff018039df17f32ada8b534e6aa039545a3efb2"}, + {file = "pydantic_core-2.27.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:7d0c8399fcc1848491f00e0314bd59fb34a9c008761bcb422a057670c3f65e35"}, + {file = "pydantic_core-2.27.2.tar.gz", hash = "sha256:eb026e5a4c1fee05726072337ff51d1efb6f59090b7da90d30ea58625b1ffb39"}, ] [package.dependencies] @@ -2638,6 +2700,20 @@ files = [ [package.dependencies] six = ">=1.5" +[[package]] +name = "python-dotenv" +version = "1.0.1" +description = "Read key-value pairs from a .env file and set them as environment variables" +optional = false +python-versions = ">=3.8" +files = [ + {file = "python-dotenv-1.0.1.tar.gz", hash = "sha256:e324ee90a023d808f1959c46bcbc04446a10ced277783dc6ee09987c37ec10ca"}, + {file = "python_dotenv-1.0.1-py3-none-any.whl", hash = "sha256:f7b63ef50f1b690dddf550d03497b66d609393b40b564ed0d674909a68ebf16a"}, +] + +[package.extras] +cli = ["click (>=5.0)"] + [[package]] name = "pytz" version = "2024.1" @@ -2992,17 +3068,18 @@ mpmath = ">=0.19" [[package]] name = "testcontainers" -version = "4.8.1" +version = "4.9.0" description = "Python library for throwaway instances of anything that can run in a Docker container" optional = false python-versions = "<4.0,>=3.9" files = [ - {file = "testcontainers-4.8.1-py3-none-any.whl", hash = "sha256:d8ae43e8fe34060fcd5c3f494e0b7652b7774beabe94568a2283d0881e94d489"}, - {file = "testcontainers-4.8.1.tar.gz", hash = "sha256:5ded4820b7227ad526857eb3caaafcabce1bbac05d22ad194849b136ffae3cb0"}, + {file = "testcontainers-4.9.0-py3-none-any.whl", hash = "sha256:c6fee929990972c40bf6b91b7072c94064ff3649b405a14fde0274c8b2479d32"}, + {file = "testcontainers-4.9.0.tar.gz", hash = "sha256:2cd6af070109ff68c1ab5389dc89c86c2dc3ab30a21ca734b2cb8f0f80ad479e"}, ] [package.dependencies] docker = "*" +python-dotenv = "*" typing-extensions = "*" urllib3 = "*" wrapt = "*" @@ -3160,13 +3237,13 @@ files = [ [[package]] name = "typing-extensions" -version = "4.6.1" -description = "Backported and Experimental Type Hints for Python 3.7+" +version = "4.12.2" +description = "Backported and Experimental Type Hints for Python 3.8+" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "typing_extensions-4.6.1-py3-none-any.whl", hash = "sha256:6bac751f4789b135c43228e72de18637e9a6c29d12777023a703fd1a6858469f"}, - {file = "typing_extensions-4.6.1.tar.gz", hash = "sha256:558bc0c4145f01e6405f4a5fdbd82050bd221b119f4bf72a961a1cfd471349d6"}, + {file = "typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d"}, + {file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"}, ] [[package]] @@ -3309,16 +3386,6 @@ files = [ {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"}, {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"}, {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = 
"sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"}, - {file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"}, - {file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"}, - {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"}, - {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"}, - {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"}, - {file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"}, - {file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"}, @@ -3475,54 +3542,108 @@ propcache = ">=0.2.0" [[package]] name = "zstandard" -version = "0.21.0" +version = "0.23.0" description = "Zstandard bindings for Python" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "zstandard-0.21.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:649a67643257e3b2cff1c0a73130609679a5673bf389564bc6d4b164d822a7ce"}, - {file = "zstandard-0.21.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:144a4fe4be2e747bf9c646deab212666e39048faa4372abb6a250dab0f347a29"}, - {file = "zstandard-0.21.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b72060402524ab91e075881f6b6b3f37ab715663313030d0ce983da44960a86f"}, - {file = "zstandard-0.21.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8257752b97134477fb4e413529edaa04fc0457361d304c1319573de00ba796b1"}, - {file = "zstandard-0.21.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:c053b7c4cbf71cc26808ed67ae955836232f7638444d709bfc302d3e499364fa"}, - {file = "zstandard-0.21.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2769730c13638e08b7a983b32cb67775650024632cd0476bf1ba0e6360f5ac7d"}, - {file = "zstandard-0.21.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = 
"sha256:7d3bc4de588b987f3934ca79140e226785d7b5e47e31756761e48644a45a6766"}, - {file = "zstandard-0.21.0-cp310-cp310-win32.whl", hash = "sha256:67829fdb82e7393ca68e543894cd0581a79243cc4ec74a836c305c70a5943f07"}, - {file = "zstandard-0.21.0-cp310-cp310-win_amd64.whl", hash = "sha256:e6048a287f8d2d6e8bc67f6b42a766c61923641dd4022b7fd3f7439e17ba5a4d"}, - {file = "zstandard-0.21.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:7f2afab2c727b6a3d466faee6974a7dad0d9991241c498e7317e5ccf53dbc766"}, - {file = "zstandard-0.21.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ff0852da2abe86326b20abae912d0367878dd0854b8931897d44cfeb18985472"}, - {file = "zstandard-0.21.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d12fa383e315b62630bd407477d750ec96a0f438447d0e6e496ab67b8b451d39"}, - {file = "zstandard-0.21.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f1b9703fe2e6b6811886c44052647df7c37478af1b4a1a9078585806f42e5b15"}, - {file = "zstandard-0.21.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:df28aa5c241f59a7ab524f8ad8bb75d9a23f7ed9d501b0fed6d40ec3064784e8"}, - {file = "zstandard-0.21.0-cp311-cp311-win32.whl", hash = "sha256:0aad6090ac164a9d237d096c8af241b8dcd015524ac6dbec1330092dba151657"}, - {file = "zstandard-0.21.0-cp311-cp311-win_amd64.whl", hash = "sha256:48b6233b5c4cacb7afb0ee6b4f91820afbb6c0e3ae0fa10abbc20000acdf4f11"}, - {file = "zstandard-0.21.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:e7d560ce14fd209db6adacce8908244503a009c6c39eee0c10f138996cd66d3e"}, - {file = "zstandard-0.21.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1e6e131a4df2eb6f64961cea6f979cdff22d6e0d5516feb0d09492c8fd36f3bc"}, - {file = "zstandard-0.21.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e1e0c62a67ff425927898cf43da2cf6b852289ebcc2054514ea9bf121bec10a5"}, - {file = "zstandard-0.21.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:1545fb9cb93e043351d0cb2ee73fa0ab32e61298968667bb924aac166278c3fc"}, - {file = "zstandard-0.21.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fe6c821eb6870f81d73bf10e5deed80edcac1e63fbc40610e61f340723fd5f7c"}, - {file = "zstandard-0.21.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:ddb086ea3b915e50f6604be93f4f64f168d3fc3cef3585bb9a375d5834392d4f"}, - {file = "zstandard-0.21.0-cp37-cp37m-win32.whl", hash = "sha256:57ac078ad7333c9db7a74804684099c4c77f98971c151cee18d17a12649bc25c"}, - {file = "zstandard-0.21.0-cp37-cp37m-win_amd64.whl", hash = "sha256:1243b01fb7926a5a0417120c57d4c28b25a0200284af0525fddba812d575f605"}, - {file = "zstandard-0.21.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ea68b1ba4f9678ac3d3e370d96442a6332d431e5050223626bdce748692226ea"}, - {file = "zstandard-0.21.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:8070c1cdb4587a8aa038638acda3bd97c43c59e1e31705f2766d5576b329e97c"}, - {file = "zstandard-0.21.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4af612c96599b17e4930fe58bffd6514e6c25509d120f4eae6031b7595912f85"}, - {file = "zstandard-0.21.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cff891e37b167bc477f35562cda1248acc115dbafbea4f3af54ec70821090965"}, - {file = 
"zstandard-0.21.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:a9fec02ce2b38e8b2e86079ff0b912445495e8ab0b137f9c0505f88ad0d61296"}, - {file = "zstandard-0.21.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0bdbe350691dec3078b187b8304e6a9c4d9db3eb2d50ab5b1d748533e746d099"}, - {file = "zstandard-0.21.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:b69cccd06a4a0a1d9fb3ec9a97600055cf03030ed7048d4bcb88c574f7895773"}, - {file = "zstandard-0.21.0-cp38-cp38-win32.whl", hash = "sha256:9980489f066a391c5572bc7dc471e903fb134e0b0001ea9b1d3eff85af0a6f1b"}, - {file = "zstandard-0.21.0-cp38-cp38-win_amd64.whl", hash = "sha256:0e1e94a9d9e35dc04bf90055e914077c80b1e0c15454cc5419e82529d3e70728"}, - {file = "zstandard-0.21.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:d2d61675b2a73edcef5e327e38eb62bdfc89009960f0e3991eae5cc3d54718de"}, - {file = "zstandard-0.21.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:25fbfef672ad798afab12e8fd204d122fca3bc8e2dcb0a2ba73bf0a0ac0f5f07"}, - {file = "zstandard-0.21.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:62957069a7c2626ae80023998757e27bd28d933b165c487ab6f83ad3337f773d"}, - {file = "zstandard-0.21.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:14e10ed461e4807471075d4b7a2af51f5234c8f1e2a0c1d37d5ca49aaaad49e8"}, - {file = "zstandard-0.21.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:9cff89a036c639a6a9299bf19e16bfb9ac7def9a7634c52c257166db09d950e7"}, - {file = "zstandard-0.21.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:52b2b5e3e7670bd25835e0e0730a236f2b0df87672d99d3bf4bf87248aa659fb"}, - {file = "zstandard-0.21.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:b1367da0dde8ae5040ef0413fb57b5baeac39d8931c70536d5f013b11d3fc3a5"}, - {file = "zstandard-0.21.0-cp39-cp39-win32.whl", hash = "sha256:db62cbe7a965e68ad2217a056107cc43d41764c66c895be05cf9c8b19578ce9c"}, - {file = "zstandard-0.21.0-cp39-cp39-win_amd64.whl", hash = "sha256:a8d200617d5c876221304b0e3fe43307adde291b4a897e7b0617a61611dfff6a"}, - {file = "zstandard-0.21.0.tar.gz", hash = "sha256:f08e3a10d01a247877e4cb61a82a319ea746c356a3786558bed2481e6c405546"}, + {file = "zstandard-0.23.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:bf0a05b6059c0528477fba9054d09179beb63744355cab9f38059548fedd46a9"}, + {file = "zstandard-0.23.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:fc9ca1c9718cb3b06634c7c8dec57d24e9438b2aa9a0f02b8bb36bf478538880"}, + {file = "zstandard-0.23.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77da4c6bfa20dd5ea25cbf12c76f181a8e8cd7ea231c673828d0386b1740b8dc"}, + {file = "zstandard-0.23.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b2170c7e0367dde86a2647ed5b6f57394ea7f53545746104c6b09fc1f4223573"}, + {file = "zstandard-0.23.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c16842b846a8d2a145223f520b7e18b57c8f476924bda92aeee3a88d11cfc391"}, + {file = "zstandard-0.23.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:157e89ceb4054029a289fb504c98c6a9fe8010f1680de0201b3eb5dc20aa6d9e"}, + {file = 
"zstandard-0.23.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:203d236f4c94cd8379d1ea61db2fce20730b4c38d7f1c34506a31b34edc87bdd"}, + {file = "zstandard-0.23.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:dc5d1a49d3f8262be192589a4b72f0d03b72dcf46c51ad5852a4fdc67be7b9e4"}, + {file = "zstandard-0.23.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:752bf8a74412b9892f4e5b58f2f890a039f57037f52c89a740757ebd807f33ea"}, + {file = "zstandard-0.23.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:80080816b4f52a9d886e67f1f96912891074903238fe54f2de8b786f86baded2"}, + {file = "zstandard-0.23.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:84433dddea68571a6d6bd4fbf8ff398236031149116a7fff6f777ff95cad3df9"}, + {file = "zstandard-0.23.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:ab19a2d91963ed9e42b4e8d77cd847ae8381576585bad79dbd0a8837a9f6620a"}, + {file = "zstandard-0.23.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:59556bf80a7094d0cfb9f5e50bb2db27fefb75d5138bb16fb052b61b0e0eeeb0"}, + {file = "zstandard-0.23.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:27d3ef2252d2e62476389ca8f9b0cf2bbafb082a3b6bfe9d90cbcbb5529ecf7c"}, + {file = "zstandard-0.23.0-cp310-cp310-win32.whl", hash = "sha256:5d41d5e025f1e0bccae4928981e71b2334c60f580bdc8345f824e7c0a4c2a813"}, + {file = "zstandard-0.23.0-cp310-cp310-win_amd64.whl", hash = "sha256:519fbf169dfac1222a76ba8861ef4ac7f0530c35dd79ba5727014613f91613d4"}, + {file = "zstandard-0.23.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:34895a41273ad33347b2fc70e1bff4240556de3c46c6ea430a7ed91f9042aa4e"}, + {file = "zstandard-0.23.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:77ea385f7dd5b5676d7fd943292ffa18fbf5c72ba98f7d09fc1fb9e819b34c23"}, + {file = "zstandard-0.23.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:983b6efd649723474f29ed42e1467f90a35a74793437d0bc64a5bf482bedfa0a"}, + {file = "zstandard-0.23.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:80a539906390591dd39ebb8d773771dc4db82ace6372c4d41e2d293f8e32b8db"}, + {file = "zstandard-0.23.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:445e4cb5048b04e90ce96a79b4b63140e3f4ab5f662321975679b5f6360b90e2"}, + {file = "zstandard-0.23.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd30d9c67d13d891f2360b2a120186729c111238ac63b43dbd37a5a40670b8ca"}, + {file = "zstandard-0.23.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d20fd853fbb5807c8e84c136c278827b6167ded66c72ec6f9a14b863d809211c"}, + {file = "zstandard-0.23.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:ed1708dbf4d2e3a1c5c69110ba2b4eb6678262028afd6c6fbcc5a8dac9cda68e"}, + {file = "zstandard-0.23.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:be9b5b8659dff1f913039c2feee1aca499cfbc19e98fa12bc85e037c17ec6ca5"}, + {file = "zstandard-0.23.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:65308f4b4890aa12d9b6ad9f2844b7ee42c7f7a4fd3390425b242ffc57498f48"}, + {file = "zstandard-0.23.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:98da17ce9cbf3bfe4617e836d561e433f871129e3a7ac16d6ef4c680f13a839c"}, + {file = "zstandard-0.23.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:8ed7d27cb56b3e058d3cf684d7200703bcae623e1dcc06ed1e18ecda39fee003"}, + {file = "zstandard-0.23.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = 
"sha256:b69bb4f51daf461b15e7b3db033160937d3ff88303a7bc808c67bbc1eaf98c78"}, + {file = "zstandard-0.23.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:034b88913ecc1b097f528e42b539453fa82c3557e414b3de9d5632c80439a473"}, + {file = "zstandard-0.23.0-cp311-cp311-win32.whl", hash = "sha256:f2d4380bf5f62daabd7b751ea2339c1a21d1c9463f1feb7fc2bdcea2c29c3160"}, + {file = "zstandard-0.23.0-cp311-cp311-win_amd64.whl", hash = "sha256:62136da96a973bd2557f06ddd4e8e807f9e13cbb0bfb9cc06cfe6d98ea90dfe0"}, + {file = "zstandard-0.23.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b4567955a6bc1b20e9c31612e615af6b53733491aeaa19a6b3b37f3b65477094"}, + {file = "zstandard-0.23.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:1e172f57cd78c20f13a3415cc8dfe24bf388614324d25539146594c16d78fcc8"}, + {file = "zstandard-0.23.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b0e166f698c5a3e914947388c162be2583e0c638a4703fc6a543e23a88dea3c1"}, + {file = "zstandard-0.23.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:12a289832e520c6bd4dcaad68e944b86da3bad0d339ef7989fb7e88f92e96072"}, + {file = "zstandard-0.23.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d50d31bfedd53a928fed6707b15a8dbeef011bb6366297cc435accc888b27c20"}, + {file = "zstandard-0.23.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:72c68dda124a1a138340fb62fa21b9bf4848437d9ca60bd35db36f2d3345f373"}, + {file = "zstandard-0.23.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:53dd9d5e3d29f95acd5de6802e909ada8d8d8cfa37a3ac64836f3bc4bc5512db"}, + {file = "zstandard-0.23.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:6a41c120c3dbc0d81a8e8adc73312d668cd34acd7725f036992b1b72d22c1772"}, + {file = "zstandard-0.23.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:40b33d93c6eddf02d2c19f5773196068d875c41ca25730e8288e9b672897c105"}, + {file = "zstandard-0.23.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:9206649ec587e6b02bd124fb7799b86cddec350f6f6c14bc82a2b70183e708ba"}, + {file = "zstandard-0.23.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:76e79bc28a65f467e0409098fa2c4376931fd3207fbeb6b956c7c476d53746dd"}, + {file = "zstandard-0.23.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:66b689c107857eceabf2cf3d3fc699c3c0fe8ccd18df2219d978c0283e4c508a"}, + {file = "zstandard-0.23.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:9c236e635582742fee16603042553d276cca506e824fa2e6489db04039521e90"}, + {file = "zstandard-0.23.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:a8fffdbd9d1408006baaf02f1068d7dd1f016c6bcb7538682622c556e7b68e35"}, + {file = "zstandard-0.23.0-cp312-cp312-win32.whl", hash = "sha256:dc1d33abb8a0d754ea4763bad944fd965d3d95b5baef6b121c0c9013eaf1907d"}, + {file = "zstandard-0.23.0-cp312-cp312-win_amd64.whl", hash = "sha256:64585e1dba664dc67c7cdabd56c1e5685233fbb1fc1966cfba2a340ec0dfff7b"}, + {file = "zstandard-0.23.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:576856e8594e6649aee06ddbfc738fec6a834f7c85bf7cadd1c53d4a58186ef9"}, + {file = "zstandard-0.23.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:38302b78a850ff82656beaddeb0bb989a0322a8bbb1bf1ab10c17506681d772a"}, + {file = "zstandard-0.23.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d2240ddc86b74966c34554c49d00eaafa8200a18d3a5b6ffbf7da63b11d74ee2"}, + {file = 
"zstandard-0.23.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2ef230a8fd217a2015bc91b74f6b3b7d6522ba48be29ad4ea0ca3a3775bf7dd5"}, + {file = "zstandard-0.23.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:774d45b1fac1461f48698a9d4b5fa19a69d47ece02fa469825b442263f04021f"}, + {file = "zstandard-0.23.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f77fa49079891a4aab203d0b1744acc85577ed16d767b52fc089d83faf8d8ed"}, + {file = "zstandard-0.23.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ac184f87ff521f4840e6ea0b10c0ec90c6b1dcd0bad2f1e4a9a1b4fa177982ea"}, + {file = "zstandard-0.23.0-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:c363b53e257246a954ebc7c488304b5592b9c53fbe74d03bc1c64dda153fb847"}, + {file = "zstandard-0.23.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:e7792606d606c8df5277c32ccb58f29b9b8603bf83b48639b7aedf6df4fe8171"}, + {file = "zstandard-0.23.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a0817825b900fcd43ac5d05b8b3079937073d2b1ff9cf89427590718b70dd840"}, + {file = "zstandard-0.23.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:9da6bc32faac9a293ddfdcb9108d4b20416219461e4ec64dfea8383cac186690"}, + {file = "zstandard-0.23.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:fd7699e8fd9969f455ef2926221e0233f81a2542921471382e77a9e2f2b57f4b"}, + {file = "zstandard-0.23.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:d477ed829077cd945b01fc3115edd132c47e6540ddcd96ca169facff28173057"}, + {file = "zstandard-0.23.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:fa6ce8b52c5987b3e34d5674b0ab529a4602b632ebab0a93b07bfb4dfc8f8a33"}, + {file = "zstandard-0.23.0-cp313-cp313-win32.whl", hash = "sha256:a9b07268d0c3ca5c170a385a0ab9fb7fdd9f5fd866be004c4ea39e44edce47dd"}, + {file = "zstandard-0.23.0-cp313-cp313-win_amd64.whl", hash = "sha256:f3513916e8c645d0610815c257cbfd3242adfd5c4cfa78be514e5a3ebb42a41b"}, + {file = "zstandard-0.23.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:2ef3775758346d9ac6214123887d25c7061c92afe1f2b354f9388e9e4d48acfc"}, + {file = "zstandard-0.23.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4051e406288b8cdbb993798b9a45c59a4896b6ecee2f875424ec10276a895740"}, + {file = "zstandard-0.23.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e2d1a054f8f0a191004675755448d12be47fa9bebbcffa3cdf01db19f2d30a54"}, + {file = "zstandard-0.23.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f83fa6cae3fff8e98691248c9320356971b59678a17f20656a9e59cd32cee6d8"}, + {file = "zstandard-0.23.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:32ba3b5ccde2d581b1e6aa952c836a6291e8435d788f656fe5976445865ae045"}, + {file = "zstandard-0.23.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2f146f50723defec2975fb7e388ae3a024eb7151542d1599527ec2aa9cacb152"}, + {file = "zstandard-0.23.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1bfe8de1da6d104f15a60d4a8a768288f66aa953bbe00d027398b93fb9680b26"}, + {file = "zstandard-0.23.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:29a2bc7c1b09b0af938b7a8343174b987ae021705acabcbae560166567f5a8db"}, + {file = "zstandard-0.23.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:61f89436cbfede4bc4e91b4397eaa3e2108ebe96d05e93d6ccc95ab5714be512"}, + {file = 
"zstandard-0.23.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:53ea7cdc96c6eb56e76bb06894bcfb5dfa93b7adcf59d61c6b92674e24e2dd5e"}, + {file = "zstandard-0.23.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:a4ae99c57668ca1e78597d8b06d5af837f377f340f4cce993b551b2d7731778d"}, + {file = "zstandard-0.23.0-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:379b378ae694ba78cef921581ebd420c938936a153ded602c4fea612b7eaa90d"}, + {file = "zstandard-0.23.0-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:50a80baba0285386f97ea36239855f6020ce452456605f262b2d33ac35c7770b"}, + {file = "zstandard-0.23.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:61062387ad820c654b6a6b5f0b94484fa19515e0c5116faf29f41a6bc91ded6e"}, + {file = "zstandard-0.23.0-cp38-cp38-win32.whl", hash = "sha256:b8c0bd73aeac689beacd4e7667d48c299f61b959475cdbb91e7d3d88d27c56b9"}, + {file = "zstandard-0.23.0-cp38-cp38-win_amd64.whl", hash = "sha256:a05e6d6218461eb1b4771d973728f0133b2a4613a6779995df557f70794fd60f"}, + {file = "zstandard-0.23.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:3aa014d55c3af933c1315eb4bb06dd0459661cc0b15cd61077afa6489bec63bb"}, + {file = "zstandard-0.23.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:0a7f0804bb3799414af278e9ad51be25edf67f78f916e08afdb983e74161b916"}, + {file = "zstandard-0.23.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fb2b1ecfef1e67897d336de3a0e3f52478182d6a47eda86cbd42504c5cbd009a"}, + {file = "zstandard-0.23.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:837bb6764be6919963ef41235fd56a6486b132ea64afe5fafb4cb279ac44f259"}, + {file = "zstandard-0.23.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1516c8c37d3a053b01c1c15b182f3b5f5eef19ced9b930b684a73bad121addf4"}, + {file = "zstandard-0.23.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:48ef6a43b1846f6025dde6ed9fee0c24e1149c1c25f7fb0a0585572b2f3adc58"}, + {file = "zstandard-0.23.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:11e3bf3c924853a2d5835b24f03eeba7fc9b07d8ca499e247e06ff5676461a15"}, + {file = "zstandard-0.23.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:2fb4535137de7e244c230e24f9d1ec194f61721c86ebea04e1581d9d06ea1269"}, + {file = "zstandard-0.23.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8c24f21fa2af4bb9f2c492a86fe0c34e6d2c63812a839590edaf177b7398f700"}, + {file = "zstandard-0.23.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:a8c86881813a78a6f4508ef9daf9d4995b8ac2d147dcb1a450448941398091c9"}, + {file = "zstandard-0.23.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:fe3b385d996ee0822fd46528d9f0443b880d4d05528fd26a9119a54ec3f91c69"}, + {file = "zstandard-0.23.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:82d17e94d735c99621bf8ebf9995f870a6b3e6d14543b99e201ae046dfe7de70"}, + {file = "zstandard-0.23.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:c7c517d74bea1a6afd39aa612fa025e6b8011982a0897768a2f7c8ab4ebb78a2"}, + {file = "zstandard-0.23.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:1fd7e0f1cfb70eb2f95a19b472ee7ad6d9a0a992ec0ae53286870c104ca939e5"}, + {file = "zstandard-0.23.0-cp39-cp39-win32.whl", hash = "sha256:43da0f0092281bf501f9c5f6f3b4c975a8a0ea82de49ba3f7100e64d422a1274"}, + {file = "zstandard-0.23.0-cp39-cp39-win_amd64.whl", hash = "sha256:f8346bfa098532bc1fb6c7ef06783e969d87a99dd1d2a5a18a892c1d7a643c58"}, + {file = "zstandard-0.23.0.tar.gz", hash = 
"sha256:b2d8c62d08e7255f68f7a740bae85b3c9b8e5466baa9cbf7f57f1cde0ac6bc09"}, ] [package.dependencies] @@ -3534,4 +3655,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "9032c11f264f2f6d8a50230e5021c606d460aafdf370da0524784c3f0f1f31b1" +content-hash = "e6904aca09abc6c805604b21a5702a97e0056406f9ec7469b091d35ee10a6b16" diff --git a/pyproject.toml b/pyproject.toml index ba4ab0b1f7..735d12d756 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,11 +7,11 @@ package-mode = false python = "^3.11" pytest = "^7.4.4" psycopg2-binary = "^2.9.10" -typing-extensions = "^4.6.1" +typing-extensions = "^4.12.2" PyJWT = {version = "^2.1.0", extras = ["crypto"]} requests = "^2.32.3" pytest-xdist = "^3.3.1" -asyncpg = "^0.29.0" +asyncpg = "^0.30.0" aiopg = "^1.4.0" Jinja2 = "^3.1.5" types-requests = "^2.31.0.0" @@ -36,7 +36,7 @@ aiohttp = "3.10.11" pytest-rerunfailures = "^15.0" types-pytest-lazy-fixture = "^0.6.3.3" pytest-split = "^0.8.1" -zstandard = "^0.21.0" +zstandard = "^0.23.0" httpx = {extras = ["http2"], version = "^0.26.0"} pytest-repeat = "^0.9.3" websockets = "^12.0" @@ -47,8 +47,9 @@ h2 = "^4.1.0" types-jwcrypto = "^1.5.0.20240925" pyyaml = "^6.0.2" types-pyyaml = "^6.0.12.20240917" -testcontainers = "^4.8.1" -jsonnet = "^0.20.0" +testcontainers = "^4.9.0" +# Jsonnet doesn't support Python 3.13 yet +jsonnet = { version = "^0.20.0", markers = "python_version < '3.13'" } [tool.poetry.group.dev.dependencies] mypy = "==1.13.0" diff --git a/test_runner/regress/test_compute_metrics.py b/test_runner/regress/test_compute_metrics.py index 787790103f..71963355b7 100644 --- a/test_runner/regress/test_compute_metrics.py +++ b/test_runner/regress/test_compute_metrics.py @@ -3,12 +3,11 @@ from __future__ import annotations import enum import os import shutil +import sys from enum import StrEnum from pathlib import Path from typing import TYPE_CHECKING, cast -# Docs are available at https://jsonnet.org/ref/bindings.html#python_api -import _jsonnet import pytest import requests import yaml @@ -87,6 +86,10 @@ def jsonnet_evaluate_file( ext_vars: str | dict[str, str] | None = None, tla_vars: str | dict[str, str] | None = None, ) -> str: + # Jsonnet doesn't support Python 3.13 yet + # Docs are available at https://jsonnet.org/ref/bindings.html#python_api + import _jsonnet + return cast( "str", _jsonnet.evaluate_file( @@ -121,6 +124,7 @@ class SqlExporterProcess(StrEnum): AUTOSCALING = "autoscaling" +@pytest.mark.xfail(sys.version_info >= (3, 13), reason="Jsonnet doesn't support Python 3.13 yet") @pytest.mark.parametrize( "collector_name", ["neon_collector", "neon_collector_autoscaling"], @@ -352,6 +356,7 @@ else: self.__proc.wait() +@pytest.mark.xfail(sys.version_info >= (3, 13), reason="Jsonnet doesn't support Python 3.13 yet") @pytest.mark.parametrize( "exporter", [SqlExporterProcess.COMPUTE, SqlExporterProcess.AUTOSCALING], From 02f81b6469c88187ddda548c2dbe32c8c5a9a41d Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 6 Jan 2025 20:28:33 +0000 Subject: [PATCH 116/583] Fix clippy warning on macOS (#10282) ## Problem On macOS: ``` error: unused variable: `disable_lfc_resizing` --> compute_tools/src/bin/compute_ctl.rs:431:9 | 431 | disable_lfc_resizing, | ^^^^^^^^^^^^^^^^^^^^ help: try ignoring the field: `disable_lfc_resizing: _` | = note: `-D unused-variables` implied by `-D warnings` = help: to override `-D warnings` add `#[allow(unused_variables)]` ``` ## Summary of changes - Initialise `disable_lfc_resizing` only on Linux (because it's used on 
Linux only in further bloc) --- compute_tools/src/bin/compute_ctl.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 26ae25ec20..6ede5fdceb 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -428,6 +428,7 @@ fn start_postgres( let &ComputeSpec { swap_size_bytes, disk_quota_bytes, + #[cfg(target_os = "linux")] disable_lfc_resizing, .. } = &state.pspec.as_ref().unwrap().spec; From 30863c010421affd737977dbfe21ea08f18a43cb Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Tue, 7 Jan 2025 10:07:38 +0100 Subject: [PATCH 117/583] libpagestore: timeout = max(0, difference), not min(0, difference) (#10274) Using `min(0, ...)` causes us to fail to wait in most situations, so a lack of data would be a hot wait loop, which is bad. ## Problem We noticed high CPU usage in some situations --- pgxn/neon/libpagestore.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 88d0a5292b..fa2a570ea8 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -680,7 +680,7 @@ call_PQgetCopyData(shardno_t shard_no, char **buffer) * but in the cases that take exceptionally long, it's useful to log the * exact timestamps. */ -#define LOG_INTERVAL_US UINT64CONST(10 * 1000000) +#define LOG_INTERVAL_MS INT64CONST(10 * 1000) INSTR_TIME_SET_CURRENT(now); start_ts = last_log_ts = now; @@ -694,7 +694,7 @@ retry: WaitEvent event; long timeout; - timeout = Min(0, LOG_INTERVAL_US - INSTR_TIME_GET_MICROSEC(since_last_log)); + timeout = Max(0, LOG_INTERVAL_MS - INSTR_TIME_GET_MILLISEC(since_last_log)); /* Sleep until there's something to do */ (void) WaitEventSetWait(shard->wes_read, timeout, &event, 1, @@ -723,7 +723,7 @@ retry: INSTR_TIME_SET_CURRENT(now); since_last_log = now; INSTR_TIME_SUBTRACT(since_last_log, last_log_ts); - if (INSTR_TIME_GET_MICROSEC(since_last_log) >= LOG_INTERVAL_US) + if (INSTR_TIME_GET_MILLISEC(since_last_log) >= LOG_INTERVAL_MS) { since_start = now; INSTR_TIME_SUBTRACT(since_start, start_ts); From ea84ec357fa4caa5a48ec65a0aab9e37d1a9fda4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?JC=20Gr=C3=BCnhage?= Date: Tue, 7 Jan 2025 11:36:05 +0100 Subject: [PATCH 118/583] Split promote-images into promote-images-dev and promote-images-prod (#10267) ## Problem `trigger-e2e-tests` waits half an hour before starting to run. Nearly half of that time can be saved by promoting images before tests on them are complete, so the e2e tests can run in parallel. On `main` and `release{,-proxy,-compute}`, `promote-images` updates `latest` and pushes things to prod ecr, so we want to run `promote-images` only after `test-images` is done, but on other branches, there is no harm in promoting images that aren't tested yet. ## Summary of changes To promote images into dev container registries sooner, `promote-images` is split into `promote-images-dev` and `promote-images-prod`. The former pushes to dev container registries, the latter to prod ones. The latter also waits for `test-images`, while the former doesn't. This allows to run `trigger-e2e-tests` sooner. 
--- .github/workflows/actionlint.yml | 2 +- .github/workflows/build_and_test.yml | 35 +++++++++++++++++++------ .github/workflows/trigger-e2e-tests.yml | 8 +++--- 3 files changed, 32 insertions(+), 13 deletions(-) diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml index 85cfe7446e..0e53830040 100644 --- a/.github/workflows/actionlint.yml +++ b/.github/workflows/actionlint.yml @@ -33,7 +33,7 @@ jobs: # SC2086 - Double quote to prevent globbing and word splitting. - https://www.shellcheck.net/wiki/SC2086 SHELLCHECK_OPTS: --exclude=SC2046,SC2086 with: - fail_on_error: true + fail_level: error filter_mode: nofilter level: error diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 12b1ac98ac..5c2b397c82 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -538,7 +538,7 @@ jobs: trigger-e2e-tests: if: ${{ !github.event.pull_request.draft || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') || github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' }} - needs: [ check-permissions, promote-images, tag ] + needs: [ check-permissions, promote-images-dev, tag ] uses: ./.github/workflows/trigger-e2e-tests.yml secrets: inherit @@ -930,8 +930,8 @@ jobs: docker compose -f ./docker-compose/docker-compose.yml logs || 0 docker compose -f ./docker-compose/docker-compose.yml down - promote-images: - needs: [ check-permissions, tag, test-images, vm-compute-node-image ] + promote-images-dev: + needs: [ check-permissions, tag, vm-compute-node-image ] runs-on: ubuntu-22.04 permissions: @@ -965,6 +965,25 @@ jobs: neondatabase/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }} done + promote-images-prod: + needs: [ check-permissions, tag, test-images, vm-compute-node-image ] + runs-on: ubuntu-22.04 + if: github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' + + permissions: + id-token: write # aws-actions/configure-aws-credentials + statuses: write + contents: read + + env: + VERSIONS: v14 v15 v16 v17 + + steps: + - uses: docker/login-action@v3 + with: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + - name: Add latest tag to images if: github.ref_name == 'main' run: | @@ -1010,7 +1029,7 @@ jobs: push-to-acr-dev: if: github.ref_name == 'main' - needs: [ tag, promote-images ] + needs: [ tag, promote-images-dev ] uses: ./.github/workflows/_push-to-acr.yml with: client_id: ${{ vars.AZURE_DEV_CLIENT_ID }} @@ -1022,7 +1041,7 @@ jobs: push-to-acr-prod: if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' - needs: [ tag, promote-images ] + needs: [ tag, promote-images-prod ] uses: ./.github/workflows/_push-to-acr.yml with: client_id: ${{ vars.AZURE_PROD_CLIENT_ID }} @@ -1112,7 +1131,7 @@ jobs: exit 1 deploy: - needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait, push-to-acr-dev, push-to-acr-prod ] + needs: [ check-permissions, promote-images-prod, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait, push-to-acr-dev, push-to-acr-prod ] # `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `push-to-acr-dev` and `push-to-acr-prod` if: (github.ref_name == 'main' || 
github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute') && !failure() && !cancelled() permissions: @@ -1333,7 +1352,7 @@ jobs: done pin-build-tools-image: - needs: [ build-build-tools-image, promote-images, build-and-test-locally ] + needs: [ build-build-tools-image, promote-images-prod, build-and-test-locally ] if: github.ref_name == 'main' uses: ./.github/workflows/pin-build-tools-image.yml with: @@ -1356,7 +1375,7 @@ jobs: - build-and-test-locally - check-codestyle-python - check-codestyle-rust - - promote-images + - promote-images-dev - test-images - trigger-custom-extensions-build-and-wait runs-on: ubuntu-22.04 diff --git a/.github/workflows/trigger-e2e-tests.yml b/.github/workflows/trigger-e2e-tests.yml index 70c2e8549f..31696248b0 100644 --- a/.github/workflows/trigger-e2e-tests.yml +++ b/.github/workflows/trigger-e2e-tests.yml @@ -68,7 +68,7 @@ jobs: GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} TAG: ${{ needs.tag.outputs.build-tag }} steps: - - name: Wait for `promote-images` job to finish + - name: Wait for `promote-images-dev` job to finish # It's important to have a timeout here, the script in the step can run infinitely timeout-minutes: 60 run: | @@ -79,17 +79,17 @@ jobs: # For PRs we use the run id as the tag BUILD_AND_TEST_RUN_ID=${TAG} while true; do - conclusion=$(gh run --repo ${GITHUB_REPOSITORY} view ${BUILD_AND_TEST_RUN_ID} --json jobs --jq '.jobs[] | select(.name == "promote-images") | .conclusion') + conclusion=$(gh run --repo ${GITHUB_REPOSITORY} view ${BUILD_AND_TEST_RUN_ID} --json jobs --jq '.jobs[] | select(.name == "promote-images-dev") | .conclusion') case "$conclusion" in success) break ;; failure | cancelled | skipped) - echo "The 'promote-images' job didn't succeed: '${conclusion}'. Exiting..." + echo "The 'promote-images-dev' job didn't succeed: '${conclusion}'. Exiting..." exit 1 ;; *) - echo "The 'promote-images' hasn't succeed yet. Waiting..." + echo "The 'promote-images-dev' hasn't succeed yet. Waiting..." sleep 60 ;; esac From be38123e62b029dcd9f9cc0beb765ad2d3333906 Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Tue, 7 Jan 2025 11:41:52 +0100 Subject: [PATCH 119/583] Fix accounting of dropped prefetched GetPage requests (#10276) Apparently, we failed to do this bookkeeping in quite a few places... ## Problem Fixes https://github.com/neondatabase/cloud/issues/22364 ## Summary of changes Add accounting of dropped requests. Note that this includes prefetches dropped due to things like "PS connection dropped unexpectedly" or "prefetch queue is already full", but *not* (yet?) "dropped due to backend shutdown". 
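
To make the bookkeeping invariant concrete — every code path that throws away an in-flight prefetch must bump both the per-backend counter reported via EXPLAIN and the process-wide discard metric — here is a minimal, self-contained sketch. It is written in Rust purely for illustration; the actual change lands in the C extension (`pgxn/neon/pagestore_smgr.c`), and the struct and field names below are illustrative stand-ins rather than the extension's real symbols.

```rust
use std::sync::atomic::{AtomicU64, Ordering};

/// Illustrative stand-ins for the two counters the patch keeps in sync.
#[derive(Default)]
struct PrefetchCounters {
    /// Per-backend counter, surfaced through EXPLAIN output.
    expired: AtomicU64,
    /// Process-wide counter, exported as a metric.
    discards_total: AtomicU64,
}

impl PrefetchCounters {
    /// Called from every path that drops an in-flight prefetch
    /// (connection loss, full prefetch queue, slot reuse, ...).
    fn account_dropped(&self) {
        self.expired.fetch_add(1, Ordering::Relaxed);
        self.discards_total.fetch_add(1, Ordering::Relaxed);
    }
}

fn main() {
    let counters = PrefetchCounters::default();
    // e.g. a pageserver disconnect invalidates two queued prefetches
    counters.account_dropped();
    counters.account_dropped();
    println!(
        "expired={} discards_total={}",
        counters.expired.load(Ordering::Relaxed),
        counters.discards_total.load(Ordering::Relaxed),
    );
}
```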
--- pgxn/neon/pagestore_smgr.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 385905d9ce..b733807026 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -716,6 +716,8 @@ prefetch_on_ps_disconnect(void) MyPState->ring_receive += 1; prefetch_set_unused(ring_index); + pgBufferUsage.prefetch.expired += 1; + MyNeonCounters->getpage_prefetch_discards_total += 1; } /* @@ -935,7 +937,8 @@ Retry: prefetch_set_unused(ring_index); entry = NULL; slot = NULL; - MyNeonCounters->getpage_prefetch_discards_total++; + pgBufferUsage.prefetch.expired += 1; + MyNeonCounters->getpage_prefetch_discards_total += 1; } } @@ -1026,10 +1029,14 @@ Retry: if (!prefetch_wait_for(cleanup_index)) goto Retry; prefetch_set_unused(cleanup_index); + pgBufferUsage.prefetch.expired += 1; + MyNeonCounters->getpage_prefetch_discards_total += 1; break; case PRFS_RECEIVED: case PRFS_TAG_REMAINS: prefetch_set_unused(cleanup_index); + pgBufferUsage.prefetch.expired += 1; + MyNeonCounters->getpage_prefetch_discards_total += 1; break; default: pg_unreachable(); From 4aa9786c6bffe8094cfc82947504d2044e284d7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?JC=20Gr=C3=BCnhage?= Date: Tue, 7 Jan 2025 14:45:18 +0100 Subject: [PATCH 120/583] Fix promote-images-prod after splitting it out (#10292) ## Problem `promote-images` was split into `promote-images-dev` and `promote-images-prod` in https://github.com/neondatabase/neon/pull/10267. `dev` credentials were loaded in `promote-images-dev` and `prod` credentials were loaded in `promote-images-prod`, but `promote-images-prod` needs `dev` credentials as well to access the `dev` images to replicate them from `dev` to `prod`. ## Summary of changes Load `dev` credentials in `promote-images-prod` as well. --- .github/workflows/build_and_test.yml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 5c2b397c82..01f5c3ede9 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -979,6 +979,16 @@ jobs: VERSIONS: v14 v15 v16 v17 steps: + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 3600 + + - name: Login to Amazon Dev ECR + uses: aws-actions/amazon-ecr-login@v2 + - uses: docker/login-action@v3 with: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} From 0a117fb1f1a9c2a062c8a60d53f2eb00637392e8 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Tue, 7 Jan 2025 16:24:54 +0100 Subject: [PATCH 121/583] proxy: Parse Notification twice only for unknown topic (#10296) ## Problem We currently parse Notification twice even in the happy path. ## Summary of changes Use `#[serde(other)]` to catch unknown topics and defer the second parsing. --- proxy/src/redis/notifications.rs | 55 ++++++++++++-------------------- 1 file changed, 21 insertions(+), 34 deletions(-) diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index 4383d6be2c..bf9d61ded3 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -37,7 +37,6 @@ struct NotificationHeader<'a> { #[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] #[serde(tag = "topic", content = "data")] -// Message to contributors: Make sure to align these topic names with the list below. 
pub(crate) enum Notification { #[serde( rename = "/allowed_ips_updated", @@ -74,21 +73,9 @@ pub(crate) enum Notification { PasswordUpdate { password_update: PasswordUpdate }, #[serde(rename = "/cancel_session")] Cancel(CancelSession), -} -/// Returns true if the topic name given is a known topic that we can deserialize and action on. -/// Returns false otherwise. -fn known_topic(s: &str) -> bool { - // Message to contributors: Make sure to align these topic names with the enum above. - matches!( - s, - "/allowed_ips_updated" - | "/block_public_or_vpc_access_updated" - | "/allowed_vpc_endpoints_updated_for_org" - | "/allowed_vpc_endpoints_updated_for_projects" - | "/password_updated" - | "/cancel_session" - ) + #[serde(other, skip_serializing)] + UnknownTopic, } #[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] @@ -178,32 +165,29 @@ impl MessageHandler { let payload: String = msg.get_payload()?; tracing::debug!(?payload, "received a message payload"); - // For better error handling, we first parse the payload to extract the topic. - // If there's a topic we don't support, we can handle that error more gracefully. - let header: NotificationHeader = match serde_json::from_str(&payload) { - Ok(msg) => msg, - Err(e) => { - Metrics::get().proxy.redis_errors_total.inc(RedisErrors { - channel: msg.get_channel_name(), - }); - tracing::error!("broken message: {e}"); + let msg: Notification = match serde_json::from_str(&payload) { + Ok(Notification::UnknownTopic) => { + match serde_json::from_str::(&payload) { + // don't update the metric for redis errors if it's just a topic we don't know about. + Ok(header) => tracing::warn!(topic = header.topic, "unknown topic"), + Err(e) => { + Metrics::get().proxy.redis_errors_total.inc(RedisErrors { + channel: msg.get_channel_name(), + }); + tracing::error!("broken message: {e}"); + } + }; return Ok(()); } - }; - - if !known_topic(header.topic) { - // don't update the metric for redis errors if it's just a topic we don't know about. - tracing::warn!(topic = header.topic, "unknown topic"); - return Ok(()); - } - - let msg: Notification = match serde_json::from_str(&payload) { Ok(msg) => msg, Err(e) => { Metrics::get().proxy.redis_errors_total.inc(RedisErrors { channel: msg.get_channel_name(), }); - tracing::error!(topic = header.topic, "broken message: {e}"); + match serde_json::from_str::(&payload) { + Ok(header) => tracing::error!(topic = header.topic, "broken message: {e}"), + Err(_) => tracing::error!("broken message: {e}"), + }; return Ok(()); } }; @@ -278,6 +262,8 @@ impl MessageHandler { invalidate_cache(cache, msg); }); } + + Notification::UnknownTopic => unreachable!(), } Ok(()) @@ -304,6 +290,7 @@ fn invalidate_cache(cache: Arc, msg: Notification) { Notification::AllowedVpcEndpointsUpdatedForProjects { .. } => { // https://github.com/neondatabase/neon/pull/10073 } + Notification::UnknownTopic => unreachable!(), } } From 43a5e575d611c546c9bf5d538a57d8984560589d Mon Sep 17 00:00:00 2001 From: Fedor Dikarev Date: Tue, 7 Jan 2025 21:00:56 +0100 Subject: [PATCH 122/583] ci: use reusable workflow for MacOs build (#9889) Closes: https://github.com/neondatabase/cloud/issues/17784 ## Problem Currently, we run the whole CI pipeline for any changes. It's slow and expensive. 
## Suggestion Starting with MacOs builds: - check what files were changed - rebuild only needed parts - reuse results from previous builds when available - run builds in parallel when possible --------- Co-authored-by: Alexander Bayandin --- .github/file-filters.yaml | 12 ++ .github/workflows/build-macos.yml | 241 ++++++++++++++++++++++++ .github/workflows/neon_extra_builds.yml | 139 ++++---------- 3 files changed, 290 insertions(+), 102 deletions(-) create mode 100644 .github/file-filters.yaml create mode 100644 .github/workflows/build-macos.yml diff --git a/.github/file-filters.yaml b/.github/file-filters.yaml new file mode 100644 index 0000000000..886cd3919a --- /dev/null +++ b/.github/file-filters.yaml @@ -0,0 +1,12 @@ +rust_code: ['**/*.rs', '**/Cargo.toml', '**/Cargo.lock'] + +v14: ['vendor/postgres-v14/**', 'Makefile', 'pgxn/**'] +v15: ['vendor/postgres-v15/**', 'Makefile', 'pgxn/**'] +v16: ['vendor/postgres-v16/**', 'Makefile', 'pgxn/**'] +v17: ['vendor/postgres-v17/**', 'Makefile', 'pgxn/**'] + +rebuild_neon_extra: + - .github/workflows/neon_extra_builds.yml + +rebuild_macos: + - .github/workflows/build-macos.yml diff --git a/.github/workflows/build-macos.yml b/.github/workflows/build-macos.yml new file mode 100644 index 0000000000..01d82a1ed2 --- /dev/null +++ b/.github/workflows/build-macos.yml @@ -0,0 +1,241 @@ +name: Check neon with MacOS builds + +on: + workflow_call: + inputs: + pg_versions: + description: "Array of the pg versions to build for, for example: ['v14', 'v17']" + type: string + default: '[]' + required: false + rebuild_rust_code: + description: "Rebuild Rust code" + type: boolean + default: false + required: false + rebuild_everything: + description: "If true, rebuild for all versions" + type: boolean + default: false + required: false + +env: + RUST_BACKTRACE: 1 + COPT: '-Werror' + +# TODO: move `check-*` and `files-changed` jobs to the "Caller" Workflow +# We should care about that as Github has limitations: +# - You can connect up to four levels of workflows +# - You can call a maximum of 20 unique reusable workflows from a single workflow file. 
+# https://docs.github.com/en/actions/sharing-automations/reusing-workflows#limitations +jobs: + build-pgxn: + if: | + (inputs.pg_versions != '[]' || inputs.rebuild_everything) && ( + contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') || + contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || + github.ref_name == 'main' + ) + timeout-minutes: 30 + runs-on: macos-15 + strategy: + matrix: + postgres-version: ${{ inputs.rebuild_everything && fromJson('["v14", "v15", "v16", "v17"]') || fromJSON(inputs.pg_versions) }} + env: + # Use release build only, to have less debug info around + # Hence keeping target/ (and general cache size) smaller + BUILD_TYPE: release + steps: + - name: Checkout main repo + uses: actions/checkout@v4 + + - name: Set pg ${{ matrix.postgres-version }} for caching + id: pg_rev + run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-${{ matrix.postgres-version }}) | tee -a "${GITHUB_OUTPUT}" + + - name: Cache postgres ${{ matrix.postgres-version }} build + id: cache_pg + uses: actions/cache@v4 + with: + path: pg_install/${{ matrix.postgres-version }} + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ matrix.postgres-version }}-${{ steps.pg_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + + - name: Checkout submodule vendor/postgres-${{ matrix.postgres-version }} + if: steps.cache_pg.outputs.cache-hit != 'true' + run: | + git submodule init vendor/postgres-${{ matrix.postgres-version }} + git submodule update --depth 1 --recursive + + - name: Install build dependencies + if: steps.cache_pg.outputs.cache-hit != 'true' + run: | + brew install flex bison openssl protobuf icu4c + + - name: Set extra env for macOS + if: steps.cache_pg.outputs.cache-hit != 'true' + run: | + echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV + echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV + + - name: Build Postgres ${{ matrix.postgres-version }} + if: steps.cache_pg.outputs.cache-hit != 'true' + run: | + make postgres-${{ matrix.postgres-version }} -j$(sysctl -n hw.ncpu) + + - name: Build Neon Pg Ext ${{ matrix.postgres-version }} + if: steps.cache_pg.outputs.cache-hit != 'true' + run: | + make "neon-pg-ext-${{ matrix.postgres-version }}" -j$(sysctl -n hw.ncpu) + + - name: Get postgres headers ${{ matrix.postgres-version }} + if: steps.cache_pg.outputs.cache-hit != 'true' + run: | + make postgres-headers-${{ matrix.postgres-version }} -j$(sysctl -n hw.ncpu) + + build-walproposer-lib: + if: | + (inputs.pg_versions != '[]' || inputs.rebuild_everything) && ( + contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') || + contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || + github.ref_name == 'main' + ) + timeout-minutes: 30 + runs-on: macos-15 + needs: [build-pgxn] + env: + # Use release build only, to have less debug info around + # Hence keeping target/ (and general cache size) smaller + BUILD_TYPE: release + steps: + - name: Checkout main repo + uses: actions/checkout@v4 + + - name: Set pg v17 for caching + id: pg_rev + run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v17) | tee -a "${GITHUB_OUTPUT}" + + - name: Cache postgres v17 build + id: cache_pg + uses: actions/cache@v4 + with: + path: pg_install/v17 + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-v17-${{ steps.pg_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + + - name: Cache walproposer-lib + id: cache_walproposer_lib + uses: actions/cache@v4 + with: + path: 
pg_install/build/walproposer-lib + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-walproposer_lib-v17-${{ steps.pg_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + + - name: Checkout submodule vendor/postgres-v17 + if: steps.cache_walproposer_lib.outputs.cache-hit != 'true' + run: | + git submodule init vendor/postgres-v17 + git submodule update --depth 1 --recursive + + - name: Install build dependencies + if: steps.cache_walproposer_lib.outputs.cache-hit != 'true' + run: | + brew install flex bison openssl protobuf icu4c + + - name: Set extra env for macOS + if: steps.cache_walproposer_lib.outputs.cache-hit != 'true' + run: | + echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV + echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV + + - name: Build walproposer-lib (only for v17) + if: steps.cache_walproposer_lib.outputs.cache-hit != 'true' + run: + make walproposer-lib -j$(sysctl -n hw.ncpu) + + cargo-build: + if: | + (inputs.pg_versions != '[]' || inputs.rebuild_rust_code || inputs.rebuild_everything) && ( + contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') || + contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || + github.ref_name == 'main' + ) + timeout-minutes: 30 + runs-on: macos-15 + needs: [build-pgxn, build-walproposer-lib] + env: + # Use release build only, to have less debug info around + # Hence keeping target/ (and general cache size) smaller + BUILD_TYPE: release + steps: + - name: Checkout main repo + uses: actions/checkout@v4 + with: + submodules: true + + - name: Set pg v14 for caching + id: pg_rev_v14 + run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) | tee -a "${GITHUB_OUTPUT}" + - name: Set pg v15 for caching + id: pg_rev_v15 + run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) | tee -a "${GITHUB_OUTPUT}" + - name: Set pg v16 for caching + id: pg_rev_v16 + run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) | tee -a "${GITHUB_OUTPUT}" + - name: Set pg v17 for caching + id: pg_rev_v17 + run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v17) | tee -a "${GITHUB_OUTPUT}" + + - name: Cache postgres v14 build + id: cache_pg + uses: actions/cache@v4 + with: + path: pg_install/v14 + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-v14-${{ steps.pg_rev_v14.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + - name: Cache postgres v15 build + id: cache_pg_v15 + uses: actions/cache@v4 + with: + path: pg_install/v15 + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-v15-${{ steps.pg_rev_v15.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + - name: Cache postgres v16 build + id: cache_pg_v16 + uses: actions/cache@v4 + with: + path: pg_install/v16 + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-v16-${{ steps.pg_rev_v16.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + - name: Cache postgres v17 build + id: cache_pg_v17 + uses: actions/cache@v4 + with: + path: pg_install/v17 + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-v17-${{ steps.pg_rev_v17.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + + - name: Cache cargo deps (only for v17) + uses: actions/cache@v4 + with: + path: | + ~/.cargo/registry + !~/.cargo/registry/src + ~/.cargo/git + target + key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust + + - name: Cache walproposer-lib + id: cache_walproposer_lib + uses: actions/cache@v4 + with: + path: 
pg_install/build/walproposer-lib + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-walproposer_lib-v17-${{ steps.pg_rev_v17.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + + - name: Install build dependencies + run: | + brew install flex bison openssl protobuf icu4c + + - name: Set extra env for macOS + run: | + echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV + echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV + + - name: Run cargo build (only for v17) + run: PQ_LIB_DIR=$(pwd)/pg_install/v17/lib cargo build --all --release -j$(sysctl -n hw.ncpu) + + - name: Check that no warnings are produced (only for v17) + run: ./run_clippy.sh diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 1f85c2e102..5b5910badf 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -31,19 +31,15 @@ jobs: uses: ./.github/workflows/build-build-tools-image.yml secrets: inherit - check-macos-build: - needs: [ check-permissions ] - if: | - contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') || - contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || - github.ref_name == 'main' - timeout-minutes: 90 - runs-on: macos-15 - - env: - # Use release build only, to have less debug info around - # Hence keeping target/ (and general cache size) smaller - BUILD_TYPE: release + files-changed: + name: Detect what files changed + runs-on: ubuntu-22.04 + timeout-minutes: 3 + outputs: + v17: ${{ steps.files_changed.outputs.v17 }} + postgres_changes: ${{ steps.postgres_changes.outputs.changes }} + rebuild_rust_code: ${{ steps.files_changed.outputs.rust_code }} + rebuild_everything: ${{ steps.files_changed.outputs.rebuild_neon_extra || steps.files_changed.outputs.rebuild_macos }} steps: - name: Checkout @@ -51,106 +47,45 @@ jobs: with: submodules: true - - name: Install macOS postgres dependencies - run: brew install flex bison openssl protobuf icu4c - - - name: Set pg 14 revision for caching - id: pg_v14_rev - run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT - - - name: Set pg 15 revision for caching - id: pg_v15_rev - run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT - - - name: Set pg 16 revision for caching - id: pg_v16_rev - run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT - - - name: Set pg 17 revision for caching - id: pg_v17_rev - run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v17) >> $GITHUB_OUTPUT - - - name: Cache postgres v14 build - id: cache_pg_14 - uses: actions/cache@v4 + - name: Check for Postgres changes + uses: dorny/paths-filter@1441771bbfdd59dcd748680ee64ebd8faab1a242 #v3 + id: files_changed with: - path: pg_install/v14 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + token: ${{ github.token }} + filters: .github/file-filters.yaml + base: ${{ github.event_name != 'pull_request' && (github.event.merge_group.base_ref || github.ref_name) || '' }} + ref: ${{ github.event_name != 'pull_request' && (github.event.merge_group.head_ref || github.ref) || '' }} - - name: Cache postgres v15 build - id: cache_pg_15 - uses: actions/cache@v4 - with: - path: pg_install/v15 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - - - name: Cache postgres v16 build - id: cache_pg_16 - uses: 
actions/cache@v4 - with: - path: pg_install/v16 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - - - name: Cache postgres v17 build - id: cache_pg_17 - uses: actions/cache@v4 - with: - path: pg_install/v17 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - - - name: Set extra env for macOS + - name: Filter out only v-string for build matrix + id: postgres_changes run: | - echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV - echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV + v_strings_only_as_json_array=$(echo ${{ steps.files_changed.outputs.chnages }} | jq '.[]|select(test("v\\d+"))' | jq --slurp -c) + echo "changes=${v_strings_only_as_json_array}" | tee -a "${GITHUB_OUTPUT}" - - name: Cache cargo deps - uses: actions/cache@v4 - with: - path: | - ~/.cargo/registry - !~/.cargo/registry/src - ~/.cargo/git - target - key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust - - - name: Build postgres v14 - if: steps.cache_pg_14.outputs.cache-hit != 'true' - run: make postgres-v14 -j$(sysctl -n hw.ncpu) - - - name: Build postgres v15 - if: steps.cache_pg_15.outputs.cache-hit != 'true' - run: make postgres-v15 -j$(sysctl -n hw.ncpu) - - - name: Build postgres v16 - if: steps.cache_pg_16.outputs.cache-hit != 'true' - run: make postgres-v16 -j$(sysctl -n hw.ncpu) - - - name: Build postgres v17 - if: steps.cache_pg_17.outputs.cache-hit != 'true' - run: make postgres-v17 -j$(sysctl -n hw.ncpu) - - - name: Build neon extensions - run: make neon-pg-ext -j$(sysctl -n hw.ncpu) - - - name: Build walproposer-lib - run: make walproposer-lib -j$(sysctl -n hw.ncpu) - - - name: Run cargo build - run: PQ_LIB_DIR=$(pwd)/pg_install/v16/lib cargo build --all --release - - - name: Check that no warnings are produced - run: ./run_clippy.sh + check-macos-build: + needs: [ check-permissions, files-changed ] + if: | + contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') || + contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || + github.ref_name == 'main' + uses: ./.github/workflows/build-macos.yml + with: + pg_versions: ${{ needs.files-changed.outputs.postgres_changes }} + rebuild_rust_code: ${{ needs.files-changed.outputs.rebuild_rust_code }} + rebuild_everything: ${{ fromJson(needs.files-changed.outputs.rebuild_everything) }} gather-rust-build-stats: - needs: [ check-permissions, build-build-tools-image ] + needs: [ check-permissions, build-build-tools-image, files-changed ] permissions: id-token: write # aws-actions/configure-aws-credentials statuses: write contents: write if: | - contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') || - contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || - github.ref_name == 'main' + (needs.files-changed.outputs.v17 == 'true' || needs.files-changed.outputs.rebuild_everything == 'true') && ( + contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') || + contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || + github.ref_name == 'main' + ) runs-on: [ self-hosted, large ] container: image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm From 237dae71a1bd00d1611fb2298b2e3cb13883155b Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 7 Jan 2025 23:49:00 +0100 Subject: 
[PATCH 123/583] Revert "pageserver,safekeeper: disable heap profiling (#10268)" (#10303) This reverts commit b33299dc37d9269fe55bd3256b7a4a72c129b81c. Heap profiles weren't the culprit after all. Touches #10225. --- pageserver/src/bin/pageserver.rs | 10 ++++------ safekeeper/src/bin/safekeeper.rs | 10 ++++------ 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index b92ff4ebf9..567a69da3b 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -53,12 +53,10 @@ project_build_tag!(BUILD_TAG); #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; -// Configure jemalloc to sample allocations for profiles every 1 MB (1 << 20). -// TODO: disabled because concurrent CPU profiles cause seg faults. See: -// https://github.com/neondatabase/neon/issues/10225. -//#[allow(non_upper_case_globals)] -//#[export_name = "malloc_conf"] -//pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0"; +/// Configure jemalloc to sample allocations for profiles every 1 MB (1 << 20). +#[allow(non_upper_case_globals)] +#[export_name = "malloc_conf"] +pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0"; const PID_FILE_NAME: &str = "pageserver.pid"; diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index e0ba38d638..13f6e34575 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -51,12 +51,10 @@ use utils::{ #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; -// Configure jemalloc to sample allocations for profiles every 1 MB (1 << 20). -// TODO: disabled because concurrent CPU profiles cause seg faults. See: -// https://github.com/neondatabase/neon/issues/10225. -//#[allow(non_upper_case_globals)] -//#[export_name = "malloc_conf"] -//pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0"; +/// Configure jemalloc to sample allocations for profiles every 1 MB (1 << 20). +#[allow(non_upper_case_globals)] +#[export_name = "malloc_conf"] +pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0"; const PID_FILE_NAME: &str = "safekeeper.pid"; const ID_FILE_NAME: &str = "safekeeper.id"; From 5c76e2a983295f3123631b0178309a942f584596 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Tue, 7 Jan 2025 18:24:17 -0500 Subject: [PATCH 124/583] fix(storage-scrubber): ignore errors if index_part is not consistent (#10304) ## Problem Consider the pageserver is doing the following sequence of operations: * upload X files * update index_part to add X and remove Y * delete Y files When storage scrubber obtains the initial timeline snapshot before "update index_part" (that is the old version that contains Y but not X), and then obtains the index_part file after it gets updated, it will report all Y files are missing. ## Summary of changes Do not report layer file missing if index_part listed and downloaded are not the same (i.e. 
different last_modified times) Signed-off-by: Alex Chi Z --- storage_scrubber/src/checks.rs | 24 +++++++++++++------ storage_scrubber/src/lib.rs | 7 +++--- .../src/pageserver_physical_gc.rs | 6 ++++- .../src/scan_pageserver_metadata.rs | 10 ++++++-- storage_scrubber/src/tenant_snapshot.rs | 2 ++ 5 files changed, 36 insertions(+), 13 deletions(-) diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs index 32c86052ef..b42709868b 100644 --- a/storage_scrubber/src/checks.rs +++ b/storage_scrubber/src/checks.rs @@ -1,4 +1,5 @@ use std::collections::{HashMap, HashSet}; +use std::time::SystemTime; use itertools::Itertools; use pageserver::tenant::checks::check_valid_layermap; @@ -88,9 +89,14 @@ pub(crate) async fn branch_cleanup_and_check_errors( match s3_data.blob_data { BlobDataParseResult::Parsed { index_part, - index_part_generation: _index_part_generation, - s3_layers: _s3_layers, + index_part_generation: _, + s3_layers: _, + index_part_last_modified_time, + index_part_snapshot_time, } => { + // Ignore missing file error if index_part downloaded is different from the one when listing the layer files. + let ignore_error = index_part_snapshot_time < index_part_last_modified_time + && !cfg!(debug_assertions); if !IndexPart::KNOWN_VERSIONS.contains(&index_part.version()) { result .errors @@ -171,7 +177,7 @@ pub(crate) async fn branch_cleanup_and_check_errors( is_l0, ); - if is_l0 { + if is_l0 || ignore_error { result.warnings.push(msg); } else { result.errors.push(msg); @@ -308,6 +314,8 @@ pub(crate) enum BlobDataParseResult { Parsed { index_part: Box, index_part_generation: Generation, + index_part_last_modified_time: SystemTime, + index_part_snapshot_time: SystemTime, s3_layers: HashSet<(LayerName, Generation)>, }, /// The remains of an uncleanly deleted Timeline or aborted timeline creation(e.g. an initdb archive only, or some layer without an index) @@ -484,9 +492,9 @@ async fn list_timeline_blobs_impl( } if let Some(index_part_object_key) = index_part_object.as_ref() { - let index_part_bytes = + let (index_part_bytes, index_part_last_modified_time) = match download_object_with_retries(remote_client, &index_part_object_key.key).await { - Ok(index_part_bytes) => index_part_bytes, + Ok(data) => data, Err(e) => { // It is possible that the branch gets deleted in-between we list the objects // and we download the index part file. @@ -500,7 +508,7 @@ async fn list_timeline_blobs_impl( )); } }; - + let index_part_snapshot_time = index_part_object_key.last_modified; match serde_json::from_slice(&index_part_bytes) { Ok(index_part) => { return Ok(ListTimelineBlobsResult::Ready(RemoteTimelineBlobData { @@ -508,6 +516,8 @@ async fn list_timeline_blobs_impl( index_part: Box::new(index_part), index_part_generation, s3_layers, + index_part_last_modified_time, + index_part_snapshot_time, }, unused_index_keys: index_part_keys, unknown_keys, @@ -625,7 +635,7 @@ pub(crate) async fn list_tenant_manifests( let manifest_bytes = match download_object_with_retries(remote_client, &latest_listing_object.key).await { - Ok(bytes) => bytes, + Ok((bytes, _)) => bytes, Err(e) => { // It is possible that the tenant gets deleted in-between we list the objects // and we download the manifest file. 
diff --git a/storage_scrubber/src/lib.rs b/storage_scrubber/src/lib.rs index be526daaf0..224235098c 100644 --- a/storage_scrubber/src/lib.rs +++ b/storage_scrubber/src/lib.rs @@ -13,7 +13,7 @@ pub mod tenant_snapshot; use std::env; use std::fmt::Display; use std::sync::Arc; -use std::time::Duration; +use std::time::{Duration, SystemTime}; use anyhow::Context; use aws_config::retry::{RetryConfigBuilder, RetryMode}; @@ -509,10 +509,11 @@ async fn list_objects_with_retries( panic!("MAX_RETRIES is not allowed to be 0"); } +/// Returns content, last modified time async fn download_object_with_retries( remote_client: &GenericRemoteStorage, key: &RemotePath, -) -> anyhow::Result> { +) -> anyhow::Result<(Vec, SystemTime)> { let cancel = CancellationToken::new(); for trial in 0..MAX_RETRIES { let mut buf = Vec::new(); @@ -535,7 +536,7 @@ async fn download_object_with_retries( { Ok(bytes_read) => { tracing::debug!("Downloaded {bytes_read} bytes for object {key}"); - return Ok(buf); + return Ok((buf, download.last_modified)); } Err(e) => { error!("Failed to stream object body for key {key}: {e}"); diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs index d19b8a5f91..a997373375 100644 --- a/storage_scrubber/src/pageserver_physical_gc.rs +++ b/storage_scrubber/src/pageserver_physical_gc.rs @@ -450,6 +450,8 @@ async fn gc_ancestor( index_part: _, index_part_generation: _, s3_layers, + index_part_last_modified_time: _, + index_part_snapshot_time: _, } => s3_layers, BlobDataParseResult::Relic => { // Post-deletion tenant location: don't try and GC it. @@ -586,7 +588,9 @@ async fn gc_timeline( BlobDataParseResult::Parsed { index_part, index_part_generation, - s3_layers: _s3_layers, + s3_layers: _, + index_part_last_modified_time: _, + index_part_snapshot_time: _, } => (index_part, *index_part_generation, data.unused_index_keys), BlobDataParseResult::Relic => { // Post-deletion tenant location: don't try and GC it. 
diff --git a/storage_scrubber/src/scan_pageserver_metadata.rs b/storage_scrubber/src/scan_pageserver_metadata.rs index c8de6e46b3..a31fb5b242 100644 --- a/storage_scrubber/src/scan_pageserver_metadata.rs +++ b/storage_scrubber/src/scan_pageserver_metadata.rs @@ -47,6 +47,8 @@ impl MetadataSummary { index_part, index_part_generation: _, s3_layers: _, + index_part_last_modified_time: _, + index_part_snapshot_time: _, } = &data.blob_data { *self @@ -195,7 +197,9 @@ pub async fn scan_pageserver_metadata( if let BlobDataParseResult::Parsed { index_part, index_part_generation, - s3_layers: _s3_layers, + s3_layers: _, + index_part_last_modified_time: _, + index_part_snapshot_time: _, } = &data.blob_data { if index_part.deleted_at.is_some() { @@ -318,9 +322,11 @@ pub async fn scan_pageserver_metadata( match &data.blob_data { BlobDataParseResult::Parsed { - index_part: _index_part, + index_part: _, index_part_generation: _index_part_generation, s3_layers, + index_part_last_modified_time: _, + index_part_snapshot_time: _, } => { tenant_objects.push(ttid, s3_layers.clone()); } diff --git a/storage_scrubber/src/tenant_snapshot.rs b/storage_scrubber/src/tenant_snapshot.rs index 39e0b5c9b4..60e79fb859 100644 --- a/storage_scrubber/src/tenant_snapshot.rs +++ b/storage_scrubber/src/tenant_snapshot.rs @@ -268,6 +268,8 @@ impl SnapshotDownloader { index_part, index_part_generation, s3_layers: _, + index_part_last_modified_time: _, + index_part_snapshot_time: _, } => { self.download_timeline( ttid, From dc284247a5b0d4fd442868c9dc555dd0ab50c0c3 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 8 Jan 2025 10:26:53 +0000 Subject: [PATCH 125/583] storage_controller: fix node flap detach race (#10298) ## Problem The observed state removal may race with the inline updates of the observed state done from `Service::node_activate_reconcile`. This was intended to work as follows: 1. Detaches while the node is unavailable remove the entry from the observed state. 2. `Service::node_activate_reconcile` diffs the locations returned by the pageserver with the observed state and detaches in-line when required. ## Summary of changes This PR removes step (1) and lets background reconciliations deal with the mismatch between the intent and observed state. A follow up will attempt to remove `Service::node_activate_reconcile` altogether. 
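
The behavioural change can be reduced to one decision: when a detach targets a node that is currently unavailable, the reconciler no longer erases that node's entry from the shard's observed state; it skips the pageserver call and leaves the intent/observed mismatch for a later background reconciliation, which runs once the node reports as active again. A minimal sketch of that decision follows, using hypothetical names rather than the storage controller's real types:

```rust
/// Hypothetical outcome of planning a detach against one node.
#[derive(Debug, PartialEq)]
enum DetachPlan {
    /// Node is reachable: issue the detach to the pageserver now.
    CallPageserver,
    /// Node is offline: keep the observed entry and let a background
    /// reconciliation perform the detach once the node is active again.
    DeferToBackgroundReconcile,
}

fn plan_detach(node_available: bool) -> DetachPlan {
    if node_available {
        DetachPlan::CallPageserver
    } else {
        // Previously the observed entry was removed here, which could race
        // with the node coming back online; now it is deliberately left alone.
        DetachPlan::DeferToBackgroundReconcile
    }
}

fn main() {
    assert_eq!(plan_detach(true), DetachPlan::CallPageserver);
    assert_eq!(plan_detach(false), DetachPlan::DeferToBackgroundReconcile);
    println!("detach planning sketch ok");
}
```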
Closes https://github.com/neondatabase/neon/issues/10253 --- storage_controller/src/reconciler.rs | 16 ++- storage_controller/src/service.rs | 4 + test_runner/fixtures/neon_fixtures.py | 7 +- .../regress/test_storage_controller.py | 122 +++++++++++++++++- 4 files changed, 139 insertions(+), 10 deletions(-) diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 475f91eff4..e0a854fff7 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -14,7 +14,6 @@ use std::sync::Arc; use std::time::{Duration, Instant}; use tokio_util::sync::CancellationToken; use utils::backoff::exponential_backoff; -use utils::failpoint_support; use utils::generation::Generation; use utils::id::{NodeId, TimelineId}; use utils::lsn::Lsn; @@ -212,11 +211,12 @@ impl Reconciler { lazy: bool, ) -> Result<(), ReconcileError> { if !node.is_available() && config.mode == LocationConfigMode::Detached { - // Attempts to detach from offline nodes may be imitated without doing I/O: a node which is offline - // will get fully reconciled wrt the shard's intent state when it is reactivated, irrespective of - // what we put into `observed`, in [`crate::service::Service::node_activate_reconcile`] - tracing::info!("Node {node} is unavailable during detach: proceeding anyway, it will be detached on next activation"); - self.observed.locations.remove(&node.get_id()); + // [`crate::service::Service::node_activate_reconcile`] will update the observed state + // when the node comes back online. At that point, the intent and observed states will + // be mismatched and a background reconciliation will detach. + tracing::info!( + "Node {node} is unavailable during detach: proceeding anyway, it will be detached via background reconciliation" + ); return Ok(()); } @@ -749,6 +749,8 @@ impl Reconciler { }; if increment_generation { + pausable_failpoint!("reconciler-pre-increment-generation"); + let generation = self .persistence .increment_generation(self.tenant_shard_id, node.get_id()) @@ -824,7 +826,7 @@ impl Reconciler { .handle_detach(self.tenant_shard_id, self.shard.stripe_size); } - failpoint_support::sleep_millis_async!("sleep-on-reconcile-epilogue"); + pausable_failpoint!("reconciler-epilogue"); Ok(()) } diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 222cb9fdd4..359fcb3288 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -83,6 +83,7 @@ use utils::{ generation::Generation, http::error::ApiError, id::{NodeId, TenantId, TimelineId}, + pausable_failpoint, sync::gate::Gate, }; @@ -1024,6 +1025,8 @@ impl Service { ) .await; + pausable_failpoint!("heartbeat-pre-node-state-configure"); + // This is the code path for geniune availability transitions (i.e node // goes unavailable and/or comes back online). let res = self @@ -2492,6 +2495,7 @@ impl Service { // Persist updates // Ordering: write to the database before applying changes in-memory, so that // we will not appear time-travel backwards on a restart. 
+ let mut schedule_context = ScheduleContext::default(); for ShardUpdate { tenant_shard_id, diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 8fd9eec8ce..00fdda2998 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2521,6 +2521,7 @@ class NeonPageserver(PgProtocol, LogUtils): self, extra_env_vars: dict[str, str] | None = None, timeout_in_seconds: int | None = None, + await_active: bool = True, ) -> Self: """ Start the page server. @@ -2547,8 +2548,10 @@ class NeonPageserver(PgProtocol, LogUtils): ) self.running = True - if self.env.storage_controller.running and self.env.storage_controller.node_registered( - self.id + if ( + await_active + and self.env.storage_controller.running + and self.env.storage_controller.node_registered(self.id) ): self.env.storage_controller.poll_node_status( self.id, PageserverAvailability.ACTIVE, None, max_attempts=200, backoff=0.1 diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 7062c35e05..973d0cdf82 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -17,6 +17,7 @@ from fixtures.compute_reconfigure import ComputeReconfigure from fixtures.log_helper import log from fixtures.neon_fixtures import ( DEFAULT_AZ_ID, + LogCursor, NeonEnv, NeonEnvBuilder, NeonPageserver, @@ -2406,7 +2407,14 @@ def test_storage_controller_step_down(neon_env_builder: NeonEnvBuilder): env.storage_controller.tenant_create(tid) env.storage_controller.reconcile_until_idle() - env.storage_controller.configure_failpoints(("sleep-on-reconcile-epilogue", "return(10000)")) + env.storage_controller.configure_failpoints(("reconciler-epilogue", "pause")) + + def unpause_failpoint(): + time.sleep(2) + env.storage_controller.configure_failpoints(("reconciler-epilogue", "off")) + + thread = threading.Thread(target=unpause_failpoint) + thread.start() # Make a change to the tenant config to trigger a slow reconcile virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) @@ -2421,6 +2429,8 @@ def test_storage_controller_step_down(neon_env_builder: NeonEnvBuilder): observed_state = env.storage_controller.step_down() log.info(f"Storage controller stepped down with {observed_state=}") + thread.join() + # Validate that we waited for the slow reconcile to complete # and updated the observed state in the storcon before stepping down. node_id = str(env.pageserver.id) @@ -3294,3 +3304,113 @@ def test_storage_controller_detached_stopped( # Confirm the detach happened assert env.pageserver.http_client().tenant_list_locations()["tenant_shards"] == [] + + +@run_only_on_default_postgres("Postgres version makes no difference here") +def test_storage_controller_node_flap_detach_race( + neon_env_builder: NeonEnvBuilder, +): + """ + Reproducer for https://github.com/neondatabase/neon/issues/10253. + + When a node's availability flaps, the reconciliations spawned by the node + going offline may race with the reconciliation done when then node comes + back online. 
+ """ + neon_env_builder.num_pageservers = 4 + + env = neon_env_builder.init_configs() + env.start() + + tenant_id = TenantId.generate() + env.storage_controller.tenant_create( + tenant_id, + shard_count=2, + ) + env.storage_controller.reconcile_until_idle() + + stopped_nodes = [s["node_id"] for s in env.storage_controller.locate(tenant_id)] + + def has_hit_failpoint(failpoint: str, offset: LogCursor | None = None) -> LogCursor: + res = env.storage_controller.log_contains(f"at failpoint {failpoint}", offset=offset) + assert res + return res[1] + + # Stop the nodes which host attached shards. + # This will trigger reconciliations which pause before incrmenenting the generation, + # and, more importantly, updating the `generation_pageserver` of the shards. + env.storage_controller.configure_failpoints(("reconciler-pre-increment-generation", "pause")) + for node_id in stopped_nodes: + env.get_pageserver(node_id).stop(immediate=True) + + def failure_handled() -> LogCursor: + stop_offset = None + + for node_id in stopped_nodes: + res = env.storage_controller.log_contains(f"node {node_id} going offline") + assert res + stop_offset = res[1] + + assert stop_offset + return stop_offset + + offset = wait_until(failure_handled) + + # Now restart the nodes and make them pause before marking themselves as available + # or running the activation reconciliation. + env.storage_controller.configure_failpoints(("heartbeat-pre-node-state-configure", "pause")) + + for node_id in stopped_nodes: + env.get_pageserver(node_id).start(await_active=False) + + offset = wait_until( + lambda: has_hit_failpoint("heartbeat-pre-node-state-configure", offset=offset) + ) + + # The nodes have restarted and are waiting to perform activaction reconciliation. + # Unpause the initial reconciliation triggered by the nodes going offline. + # It will attempt to detach from the old location, but notice that the old location + # is not yet available, and then stop before processing the results of the reconciliation. + env.storage_controller.configure_failpoints(("reconciler-epilogue", "pause")) + env.storage_controller.configure_failpoints(("reconciler-pre-increment-generation", "off")) + + offset = wait_until(lambda: has_hit_failpoint("reconciler-epilogue", offset=offset)) + + # Let the nodes perform activation reconciliation while still holding up processing the result + # from the initial reconcile triggered by going offline. + env.storage_controller.configure_failpoints(("heartbeat-pre-node-state-configure", "off")) + + def activate_reconciliation_done(): + for node_id in stopped_nodes: + assert env.storage_controller.log_contains( + f"Node {node_id} transition to active", offset=offset + ) + + wait_until(activate_reconciliation_done) + + # Finally, allow the initial reconcile to finish up. 
+ env.storage_controller.configure_failpoints(("reconciler-epilogue", "off")) + + # Give things a chance to settle and validate that no stale locations exist + env.storage_controller.reconcile_until_idle() + + def validate_locations(): + shard_locations = defaultdict(list) + for ps in env.pageservers: + locations = ps.http_client().tenant_list_locations()["tenant_shards"] + for loc in locations: + shard_locations[loc[0]].append( + {"generation": loc[1]["generation"], "mode": loc[1]["mode"], "node": ps.id} + ) + + log.info(f"Shard locations: {shard_locations}") + + attached_locations = { + k: list(filter(lambda loc: loc["mode"] == "AttachedSingle", v)) + for k, v in shard_locations.items() + } + + for shard, locs in attached_locations.items(): + assert len(locs) == 1, f"{shard} has {len(locs)} attached locations" + + wait_until(validate_locations, timeout=10) From 68d8acfd058b7b2d0deb041d14252f17d50ad05f Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 8 Jan 2025 18:12:09 +0000 Subject: [PATCH 126/583] storage controller: don't hold detached tenants in memory (#10264) ## Problem Typical deployments of neon have some tenants that stay in use continuously, and a background churning population of tenants that are created and then fall idle, and are configured to Detached state. Currently, this churn of short lived tenants results in an ever-increasing memory footprint. Closes: https://github.com/neondatabase/neon/issues/9712 ## Summary of changes - At startup, filter to only load shards that don't have Detached policy - In process_result, check if a tenant's shards are all Detached and observed=={}, and if so drop them from memory - In tenant_location_conf and other tenant mutators, load the tenants' shards on-demand if they are not present --- storage_controller/src/id_lock_map.rs | 8 + storage_controller/src/persistence.rs | 34 ++- storage_controller/src/service.rs | 195 +++++++++++++++--- storage_controller/src/tenant_shard.rs | 4 + .../regress/test_storage_controller.py | 105 +++++++++- 5 files changed, 318 insertions(+), 28 deletions(-) diff --git a/storage_controller/src/id_lock_map.rs b/storage_controller/src/id_lock_map.rs index fcd3eb57e2..2d8b674f86 100644 --- a/storage_controller/src/id_lock_map.rs +++ b/storage_controller/src/id_lock_map.rs @@ -112,6 +112,14 @@ where } } + pub(crate) fn try_exclusive(&self, key: T, operation: I) -> Option> { + let mut locked = self.entities.lock().unwrap(); + let entry = locked.entry(key).or_default().clone(); + let mut guard = TracingExclusiveGuard::new(entry.try_write_owned().ok()?); + *guard.guard = Some(operation); + Some(guard) + } + /// Rather than building a lock guard that re-takes the [`Self::entities`] lock, we just do /// periodic housekeeping to avoid the map growing indefinitely pub(crate) fn housekeeping(&self) { diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index cc377e606e..c5eb106f24 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -97,6 +97,7 @@ pub(crate) enum DatabaseOperation { TenantGenerations, ShardGenerations, ListTenantShards, + LoadTenant, InsertTenantShards, UpdateTenantShard, DeleteTenant, @@ -330,11 +331,40 @@ impl Persistence { /// At startup, load the high level state for shards, such as their config + policy. This will /// be enriched at runtime with state discovered on pageservers. - pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult> { + /// + /// We exclude shards configured to be detached. 
During startup, if we see any attached locations + /// for such shards, they will automatically be detached as 'orphans'. + pub(crate) async fn load_active_tenant_shards( + &self, + ) -> DatabaseResult> { + use crate::schema::tenant_shards::dsl::*; self.with_measured_conn( DatabaseOperation::ListTenantShards, move |conn| -> DatabaseResult<_> { - Ok(crate::schema::tenant_shards::table.load::(conn)?) + let query = tenant_shards.filter( + placement_policy.ne(serde_json::to_string(&PlacementPolicy::Detached).unwrap()), + ); + let result = query.load::(conn)?; + + Ok(result) + }, + ) + .await + } + + /// When restoring a previously detached tenant into memory, load it from the database + pub(crate) async fn load_tenant( + &self, + filter_tenant_id: TenantId, + ) -> DatabaseResult> { + use crate::schema::tenant_shards::dsl::*; + self.with_measured_conn( + DatabaseOperation::LoadTenant, + move |conn| -> DatabaseResult<_> { + let query = tenant_shards.filter(tenant_id.eq(filter_tenant_id.to_string())); + let result = query.load::(conn)?; + + Ok(result) }, ) .await diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 359fcb3288..fd4ee7fd10 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -155,6 +155,7 @@ enum TenantOperations { TimelineArchivalConfig, TimelineDetachAncestor, TimelineGcBlockUnblock, + DropDetached, } #[derive(Clone, strum_macros::Display)] @@ -416,8 +417,8 @@ pub struct Service { /// Queue of tenants who are waiting for concurrency limits to permit them to reconcile /// Send into this queue to promptly attempt to reconcile this shard next time units are available. /// - /// Note that this state logically lives inside ServiceInner, but carrying Sender here makes the code simpler - /// by avoiding needing a &mut ref to something inside the ServiceInner. This could be optimized to + /// Note that this state logically lives inside ServiceState, but carrying Sender here makes the code simpler + /// by avoiding needing a &mut ref to something inside the ServiceState. This could be optimized to /// use a VecDeque instead of a channel to reduce synchronization overhead, at the cost of some code complexity. delayed_reconcile_tx: tokio::sync::mpsc::Sender, @@ -1165,6 +1166,20 @@ impl Service { } } + // If we just finished detaching all shards for a tenant, it might be time to drop it from memory. + if tenant.policy == PlacementPolicy::Detached { + // We may only drop a tenant from memory while holding the exclusive lock on the tenant ID: this protects us + // from concurrent execution wrt a request handler that might expect the tenant to remain in memory for the + // duration of the request. + let guard = self.tenant_op_locks.try_exclusive( + tenant.tenant_shard_id.tenant_id, + TenantOperations::DropDetached, + ); + if let Some(guard) = guard { + self.maybe_drop_tenant(tenant.tenant_shard_id.tenant_id, &mut locked, &guard); + } + } + // Maybe some other work can proceed now that this job finished. 
if self.reconciler_concurrency.available_permits() > 0 { while let Ok(tenant_shard_id) = locked.delayed_reconcile_rx.try_recv() { @@ -1294,7 +1309,7 @@ impl Service { .set(nodes.len() as i64); tracing::info!("Loading shards from database..."); - let mut tenant_shard_persistence = persistence.list_tenant_shards().await?; + let mut tenant_shard_persistence = persistence.load_active_tenant_shards().await?; tracing::info!( "Loaded {} shards from database.", tenant_shard_persistence.len() @@ -1546,8 +1561,14 @@ impl Service { // the pageserver API (not via this service), we will auto-create any missing tenant // shards with default state. let insert = { - let locked = self.inner.write().unwrap(); - !locked.tenants.contains_key(&attach_req.tenant_shard_id) + match self + .maybe_load_tenant(attach_req.tenant_shard_id.tenant_id, &_tenant_lock) + .await + { + Ok(_) => false, + Err(ApiError::NotFound(_)) => true, + Err(e) => return Err(e.into()), + } }; if insert { @@ -2439,6 +2460,99 @@ impl Service { } } + /// For APIs that might act on tenants with [`PlacementPolicy::Detached`], first check if + /// the tenant is present in memory. If not, load it from the database. If it is found + /// in neither location, return a NotFound error. + /// + /// Caller must demonstrate they hold a lock guard, as otherwise two callers might try and load + /// it at the same time, or we might race with [`Self::maybe_drop_tenant`] + async fn maybe_load_tenant( + &self, + tenant_id: TenantId, + _guard: &TracingExclusiveGuard, + ) -> Result<(), ApiError> { + let present_in_memory = { + let locked = self.inner.read().unwrap(); + locked + .tenants + .range(TenantShardId::tenant_range(tenant_id)) + .next() + .is_some() + }; + + if present_in_memory { + return Ok(()); + } + + let tenant_shards = self.persistence.load_tenant(tenant_id).await?; + if tenant_shards.is_empty() { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant {} not found", tenant_id).into(), + )); + } + + // TODO: choose a fresh AZ to use for this tenant when un-detaching: there definitely isn't a running + // compute, so no benefit to making AZ sticky across detaches. + + let mut locked = self.inner.write().unwrap(); + tracing::info!( + "Loaded {} shards for tenant {}", + tenant_shards.len(), + tenant_id + ); + + locked.tenants.extend(tenant_shards.into_iter().map(|p| { + let intent = IntentState::new(); + let shard = + TenantShard::from_persistent(p, intent).expect("Corrupt shard row in database"); + + // Sanity check: when loading on-demand, we should always be loaded something Detached + debug_assert!(shard.policy == PlacementPolicy::Detached); + if shard.policy != PlacementPolicy::Detached { + tracing::error!( + "Tenant shard {} loaded on-demand, but has non-Detached policy {:?}", + shard.tenant_shard_id, + shard.policy + ); + } + + (shard.tenant_shard_id, shard) + })); + + Ok(()) + } + + /// If all shards for a tenant are detached, and in a fully quiescent state (no observed locations on pageservers), + /// and have no reconciler running, then we can drop the tenant from memory. It will be reloaded on-demand + /// if we are asked to attach it again (see [`Self::maybe_load_tenant`]). + /// + /// Caller must demonstrate they hold a lock guard, as otherwise it is unsafe to drop a tenant from + /// memory while some other function might assume it continues to exist while not holding the lock on Self::inner. 
+ fn maybe_drop_tenant( + &self, + tenant_id: TenantId, + locked: &mut std::sync::RwLockWriteGuard, + _guard: &TracingExclusiveGuard, + ) { + let mut tenant_shards = locked.tenants.range(TenantShardId::tenant_range(tenant_id)); + if tenant_shards.all(|(_id, shard)| { + shard.policy == PlacementPolicy::Detached + && shard.reconciler.is_none() + && shard.observed.is_empty() + }) { + let keys = locked + .tenants + .range(TenantShardId::tenant_range(tenant_id)) + .map(|(id, _)| id) + .copied() + .collect::>(); + for key in keys { + tracing::info!("Dropping detached tenant shard {} from memory", key); + locked.tenants.remove(&key); + } + } + } + /// This API is used by the cloud control plane to migrate unsharded tenants that it created /// directly with pageservers into this service. /// @@ -2465,14 +2579,26 @@ impl Service { ) .await; - if !tenant_shard_id.is_unsharded() { + let tenant_id = if !tenant_shard_id.is_unsharded() { return Err(ApiError::BadRequest(anyhow::anyhow!( "This API is for importing single-sharded or unsharded tenants" ))); - } + } else { + tenant_shard_id.tenant_id + }; + + // In case we are waking up a Detached tenant + match self.maybe_load_tenant(tenant_id, &_tenant_lock).await { + Ok(()) | Err(ApiError::NotFound(_)) => { + // This is a creation or an update + } + Err(e) => { + return Err(e); + } + }; // First check if this is a creation or an update - let create_or_update = self.tenant_location_config_prepare(tenant_shard_id.tenant_id, req); + let create_or_update = self.tenant_location_config_prepare(tenant_id, req); let mut result = TenantLocationConfigResponse { shards: Vec::new(), @@ -2600,6 +2726,8 @@ impl Service { let tenant_id = req.tenant_id; let patch = req.config; + self.maybe_load_tenant(tenant_id, &_tenant_lock).await?; + let base = { let locked = self.inner.read().unwrap(); let shards = locked @@ -2644,19 +2772,7 @@ impl Service { ) .await; - let tenant_exists = { - let locked = self.inner.read().unwrap(); - let mut r = locked - .tenants - .range(TenantShardId::tenant_range(req.tenant_id)); - r.next().is_some() - }; - - if !tenant_exists { - return Err(ApiError::NotFound( - anyhow::anyhow!("Tenant {} not found", req.tenant_id).into(), - )); - } + self.maybe_load_tenant(req.tenant_id, &_tenant_lock).await?; self.set_tenant_config_and_reconcile(req.tenant_id, req.config) .await @@ -2949,6 +3065,8 @@ impl Service { let _tenant_lock = trace_exclusive_lock(&self.tenant_op_locks, tenant_id, TenantOperations::Delete).await; + self.maybe_load_tenant(tenant_id, &_tenant_lock).await?; + // Detach all shards. This also deletes local pageserver shard data. 
let (detach_waiters, node) = { let mut detach_waiters = Vec::new(); @@ -3068,6 +3186,8 @@ impl Service { ) .await; + self.maybe_load_tenant(tenant_id, &_tenant_lock).await?; + failpoint_support::sleep_millis_async!("tenant-update-policy-exclusive-lock"); let TenantPolicyRequest { @@ -5150,11 +5270,13 @@ impl Service { ))); } - let mut shards = self.persistence.list_tenant_shards().await?; - shards.sort_by_key(|tsp| (tsp.tenant_id.clone(), tsp.shard_number, tsp.shard_count)); + let mut persistent_shards = self.persistence.load_active_tenant_shards().await?; + persistent_shards + .sort_by_key(|tsp| (tsp.tenant_id.clone(), tsp.shard_number, tsp.shard_count)); + expect_shards.sort_by_key(|tsp| (tsp.tenant_id.clone(), tsp.shard_number, tsp.shard_count)); - if shards != expect_shards { + if persistent_shards != expect_shards { tracing::error!("Consistency check failed on shards."); tracing::error!( "Shards in memory: {}", @@ -5163,7 +5285,7 @@ impl Service { ); tracing::error!( "Shards in database: {}", - serde_json::to_string(&shards) + serde_json::to_string(&persistent_shards) .map_err(|e| ApiError::InternalServerError(e.into()))? ); return Err(ApiError::InternalServerError(anyhow::anyhow!( @@ -6119,6 +6241,10 @@ impl Service { let mut pending_reconciles = 0; let mut az_violations = 0; + // If we find any tenants to drop from memory, stash them to offload after + // we're done traversing the map of tenants. + let mut drop_detached_tenants = Vec::new(); + let mut reconciles_spawned = 0; for shard in tenants.values_mut() { // Accumulate scheduling statistics @@ -6152,6 +6278,25 @@ impl Service { // Shard wanted to reconcile but for some reason couldn't. pending_reconciles += 1; } + + // If this tenant is detached, try dropping it from memory. This is usually done + // proactively in [`Self::process_results`], but we do it here to handle the edge + // case where a reconcile completes while someone else is holding an op lock for the tenant. 
+ if shard.tenant_shard_id.shard_number == ShardNumber(0) + && shard.policy == PlacementPolicy::Detached + { + if let Some(guard) = self.tenant_op_locks.try_exclusive( + shard.tenant_shard_id.tenant_id, + TenantOperations::DropDetached, + ) { + drop_detached_tenants.push((shard.tenant_shard_id.tenant_id, guard)); + } + } + } + + // Process any deferred tenant drops + for (tenant_id, guard) in drop_detached_tenants { + self.maybe_drop_tenant(tenant_id, &mut locked, &guard); } metrics::METRICS_REGISTRY diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index cba579e8a7..c17989a316 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -465,6 +465,10 @@ impl ObservedState { locations: HashMap::new(), } } + + pub(crate) fn is_empty(&self) -> bool { + self.locations.is_empty() + } } impl TenantShard { diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 973d0cdf82..207f55a214 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -3299,13 +3299,116 @@ def test_storage_controller_detached_stopped( "generation": None, }, ) - + env.storage_controller.reconcile_until_idle() env.storage_controller.consistency_check() # Confirm the detach happened assert env.pageserver.http_client().tenant_list_locations()["tenant_shards"] == [] +@run_only_on_default_postgres("Postgres version makes no difference here") +def test_storage_controller_detach_lifecycle( + neon_env_builder: NeonEnvBuilder, +): + """ + Test that detached tenants are handled properly through their lifecycle: getting dropped + from memory when detached, then getting loaded back on-demand. + """ + + remote_storage_kind = s3_storage() + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) + + neon_env_builder.num_pageservers = 1 + + env = neon_env_builder.init_configs() + env.start() + virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + env.storage_controller.tenant_create( + tenant_id, + shard_count=1, + ) + virtual_ps_http.timeline_create(PgVersion.NOT_SET, tenant_id, timeline_id) + + remote_prefix = "/".join( + ( + "tenants", + str(tenant_id), + ) + ) + # We will later check data is gone after deletion, so as a control check that it is present to begin with + assert_prefix_not_empty( + neon_env_builder.pageserver_remote_storage, + prefix=remote_prefix, + ) + + assert len(env.pageserver.http_client().tenant_list_locations()["tenant_shards"]) == 1 + assert len(env.storage_controller.tenant_list()) == 1 + + # Detach the tenant + virtual_ps_http.tenant_location_conf( + tenant_id, + { + "mode": "Detached", + "secondary_conf": None, + "tenant_conf": {}, + "generation": None, + }, + ) + # Ensure reconciles are done (the one we do inline in location_conf is advisory and if it takes too long that API just succeeds anyway) + env.storage_controller.reconcile_until_idle() + env.storage_controller.consistency_check() + + # Confirm the detach happened on pageserver + assert env.pageserver.http_client().tenant_list_locations()["tenant_shards"] == [] + # Confirm the tenant is not in memory on the controller + assert env.storage_controller.tenant_list() == [] + + # The detached tenant does not get loaded into memory across a controller restart + env.storage_controller.stop() + env.storage_controller.start() + assert 
env.storage_controller.tenant_list() == [] + env.storage_controller.consistency_check() + + # The detached tenant can be re-attached + virtual_ps_http.tenant_location_conf( + tenant_id, + { + "mode": "AttachedSingle", + "secondary_conf": None, + "tenant_conf": {}, + "generation": None, + }, + ) + assert len(env.pageserver.http_client().tenant_list_locations()["tenant_shards"]) == 1 + assert len(env.storage_controller.tenant_list()) == 1 + env.storage_controller.consistency_check() + + # Detach it again before doing deletion + virtual_ps_http.tenant_location_conf( + tenant_id, + { + "mode": "Detached", + "secondary_conf": None, + "tenant_conf": {}, + "generation": None, + }, + ) + env.storage_controller.reconcile_until_idle() + env.storage_controller.consistency_check() + + # A detached tenant can be deleted + virtual_ps_http.tenant_delete(tenant_id) + + # Such deletions really work (empty remote storage) + assert_prefix_empty( + neon_env_builder.pageserver_remote_storage, + prefix=remote_prefix, + ) + + @run_only_on_default_postgres("Postgres version makes no difference here") def test_storage_controller_node_flap_detach_race( neon_env_builder: NeonEnvBuilder, From 0ad0db6ff8b5b491244d251fa09c8093f725a1d3 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Wed, 8 Jan 2025 18:55:04 +0000 Subject: [PATCH 127/583] compute: dropdb DROP SUBSCRIPTION fix (#10066) ## Problem Project gets stuck if database with subscriptions was deleted via API / UI. https://github.com/neondatabase/cloud/issues/18646 ## Summary of changes Before dropping the database, drop all the subscriptions in it. Do not drop slot on publisher, because we have no guarantee that the slot still exists or that the publisher is reachable. Add `DropSubscriptionsForDeletedDatabases` phase to run these operations in all databases, we're about to delete. Ignore the error if the database does not exist. 
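For illustration, the cleanup performed in this new phase boils down to the following per-subscription sequence (a minimal sketch: the change generates it for every row in `pg_subscription` of each database scheduled for deletion; `mysub` is a placeholder subscription name):

```sql
-- Disable the subscription and detach it from its replication slot first, so
-- that DROP SUBSCRIPTION never tries to contact the publisher (which may be
-- unreachable or already gone). The slot on the publisher side is left as-is.
ALTER SUBSCRIPTION mysub DISABLE;
ALTER SUBSCRIPTION mysub SET (slot_name = NONE);
DROP SUBSCRIPTION mysub;
```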
--- compute_tools/src/compute.rs | 95 ++++++++++++-- compute_tools/src/spec_apply.rs | 32 ++++- .../sql/drop_subscription_for_drop_dbs.sql | 11 ++ test_runner/fixtures/neon_fixtures.py | 27 +++- test_runner/regress/test_compute_catalog.py | 116 +++++++++++++++++- 5 files changed, 262 insertions(+), 19 deletions(-) create mode 100644 compute_tools/src/sql/drop_subscription_for_drop_dbs.sql diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 78f6033429..1ac97a378b 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -15,7 +15,7 @@ use std::time::Instant; use anyhow::{Context, Result}; use chrono::{DateTime, Utc}; -use compute_api::spec::{PgIdent, Role}; +use compute_api::spec::{Database, PgIdent, Role}; use futures::future::join_all; use futures::stream::FuturesUnordered; use futures::StreamExt; @@ -45,8 +45,10 @@ use crate::spec_apply::ApplySpecPhase::{ DropInvalidDatabases, DropRoles, HandleNeonExtension, HandleOtherExtensions, RenameAndDeleteDatabases, RenameRoles, RunInEachDatabase, }; +use crate::spec_apply::PerDatabasePhase; use crate::spec_apply::PerDatabasePhase::{ - ChangeSchemaPerms, DeleteDBRoleReferences, HandleAnonExtension, + ChangeSchemaPerms, DeleteDBRoleReferences, DropSubscriptionsForDeletedDatabases, + HandleAnonExtension, }; use crate::spec_apply::{apply_operations, MutableApplyContext, DB}; use crate::sync_sk::{check_if_synced, ping_safekeeper}; @@ -834,7 +836,7 @@ impl ComputeNode { conf } - async fn get_maintenance_client( + pub async fn get_maintenance_client( conf: &tokio_postgres::Config, ) -> Result { let mut conf = conf.clone(); @@ -943,6 +945,78 @@ impl ComputeNode { dbs: databases, })); + // Apply special pre drop database phase. + // NOTE: we use the code of RunInEachDatabase phase for parallelism + // and connection management, but we don't really run it in *each* database, + // only in databases, we're about to drop. + info!("Applying PerDatabase (pre-dropdb) phase"); + let concurrency_token = Arc::new(tokio::sync::Semaphore::new(concurrency)); + + // Run the phase for each database that we're about to drop. + let db_processes = spec + .delta_operations + .iter() + .flatten() + .filter_map(move |op| { + if op.action.as_str() == "delete_db" { + Some(op.name.clone()) + } else { + None + } + }) + .map(|dbname| { + let spec = spec.clone(); + let ctx = ctx.clone(); + let jwks_roles = jwks_roles.clone(); + let mut conf = conf.as_ref().clone(); + let concurrency_token = concurrency_token.clone(); + // We only need dbname field for this phase, so set other fields to dummy values + let db = DB::UserDB(Database { + name: dbname.clone(), + owner: "cloud_admin".to_string(), + options: None, + restrict_conn: false, + invalid: false, + }); + + debug!("Applying per-database phases for Database {:?}", &db); + + match &db { + DB::SystemDB => {} + DB::UserDB(db) => { + conf.dbname(db.name.as_str()); + } + } + + let conf = Arc::new(conf); + let fut = Self::apply_spec_sql_db( + spec.clone(), + conf, + ctx.clone(), + jwks_roles.clone(), + concurrency_token.clone(), + db, + [DropSubscriptionsForDeletedDatabases].to_vec(), + ); + + Ok(spawn(fut)) + }) + .collect::>>(); + + for process in db_processes.into_iter() { + let handle = process?; + if let Err(e) = handle.await? { + // Handle the error case where the database does not exist + // We do not check whether the DB exists or not in the deletion phase, + // so we shouldn't be strict about it in pre-deletion cleanup as well. 
+ if e.to_string().contains("does not exist") { + warn!("Error dropping subscription: {}", e); + } else { + return Err(e); + } + }; + } + for phase in [ CreateSuperUser, DropInvalidDatabases, @@ -962,7 +1036,7 @@ impl ComputeNode { .await?; } - info!("Applying RunInEachDatabase phase"); + info!("Applying RunInEachDatabase2 phase"); let concurrency_token = Arc::new(tokio::sync::Semaphore::new(concurrency)); let db_processes = spec @@ -997,6 +1071,12 @@ impl ComputeNode { jwks_roles.clone(), concurrency_token.clone(), db, + [ + DeleteDBRoleReferences, + ChangeSchemaPerms, + HandleAnonExtension, + ] + .to_vec(), ); Ok(spawn(fut)) @@ -1043,16 +1123,13 @@ impl ComputeNode { jwks_roles: Arc>, concurrency_token: Arc, db: DB, + subphases: Vec, ) -> Result<()> { let _permit = concurrency_token.acquire().await?; let mut client_conn = None; - for subphase in [ - DeleteDBRoleReferences, - ChangeSchemaPerms, - HandleAnonExtension, - ] { + for subphase in subphases { apply_operations( spec.clone(), ctx.clone(), diff --git a/compute_tools/src/spec_apply.rs b/compute_tools/src/spec_apply.rs index 7308d5d36e..695a722d6d 100644 --- a/compute_tools/src/spec_apply.rs +++ b/compute_tools/src/spec_apply.rs @@ -47,6 +47,7 @@ pub enum PerDatabasePhase { DeleteDBRoleReferences, ChangeSchemaPerms, HandleAnonExtension, + DropSubscriptionsForDeletedDatabases, } #[derive(Clone, Debug)] @@ -326,13 +327,12 @@ async fn get_operations<'a>( // Use FORCE to drop database even if there are active connections. // We run this from `cloud_admin`, so it should have enough privileges. + // // NB: there could be other db states, which prevent us from dropping // the database. For example, if db is used by any active subscription // or replication slot. - // TODO: deal with it once we allow logical replication. Proper fix should - // involve returning an error code to the control plane, so it could - // figure out that this is a non-retryable error, return it to the user - // and fail operation permanently. + // Such cases are handled in the DropSubscriptionsForDeletedDatabases + // phase. We do all the cleanup before actually dropping the database. 
let drop_db_query: String = format!( "DROP DATABASE IF EXISTS {} WITH (FORCE)", &op.name.pg_quote() @@ -444,6 +444,30 @@ async fn get_operations<'a>( } ApplySpecPhase::RunInEachDatabase { db, subphase } => { match subphase { + PerDatabasePhase::DropSubscriptionsForDeletedDatabases => { + match &db { + DB::UserDB(db) => { + let drop_subscription_query: String = format!( + include_str!("sql/drop_subscription_for_drop_dbs.sql"), + datname_str = escape_literal(&db.name), + ); + + let operations = vec![Operation { + query: drop_subscription_query, + comment: Some(format!( + "optionally dropping subscriptions for DB {}", + db.name, + )), + }] + .into_iter(); + + Ok(Box::new(operations)) + } + // skip this cleanup for the system databases + // because users can't drop them + DB::SystemDB => Ok(Box::new(empty())), + } + } PerDatabasePhase::DeleteDBRoleReferences => { let ctx = ctx.read().await; diff --git a/compute_tools/src/sql/drop_subscription_for_drop_dbs.sql b/compute_tools/src/sql/drop_subscription_for_drop_dbs.sql new file mode 100644 index 0000000000..dfb925e48e --- /dev/null +++ b/compute_tools/src/sql/drop_subscription_for_drop_dbs.sql @@ -0,0 +1,11 @@ +DO $$ +DECLARE + subname TEXT; +BEGIN + FOR subname IN SELECT pg_subscription.subname FROM pg_subscription WHERE subdbid = (SELECT oid FROM pg_database WHERE datname = {datname_str}) LOOP + EXECUTE format('ALTER SUBSCRIPTION %I DISABLE;', subname); + EXECUTE format('ALTER SUBSCRIPTION %I SET (slot_name = NONE);', subname); + EXECUTE format('DROP SUBSCRIPTION %I;', subname); + END LOOP; +END; +$$; diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 00fdda2998..e22e452a52 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -4933,13 +4933,30 @@ def check_restored_datadir_content( assert (mismatch, error) == ([], []) -def logical_replication_sync(subscriber: PgProtocol, publisher: PgProtocol) -> Lsn: +def logical_replication_sync( + subscriber: PgProtocol, + publisher: PgProtocol, + sub_dbname: str | None = None, + pub_dbname: str | None = None, +) -> Lsn: """Wait logical replication subscriber to sync with publisher.""" - publisher_lsn = Lsn(publisher.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + if pub_dbname is not None: + publisher_lsn = Lsn( + publisher.safe_psql("SELECT pg_current_wal_flush_lsn()", dbname=pub_dbname)[0][0] + ) + else: + publisher_lsn = Lsn(publisher.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + while True: - res = subscriber.safe_psql("select latest_end_lsn from pg_catalog.pg_stat_subscription")[0][ - 0 - ] + if sub_dbname is not None: + res = subscriber.safe_psql( + "select latest_end_lsn from pg_catalog.pg_stat_subscription", dbname=sub_dbname + )[0][0] + else: + res = subscriber.safe_psql( + "select latest_end_lsn from pg_catalog.pg_stat_subscription" + )[0][0] + if res: log.info(f"subscriber_lsn={res}") subscriber_lsn = Lsn(res) diff --git a/test_runner/regress/test_compute_catalog.py b/test_runner/regress/test_compute_catalog.py index b3719a45ed..e411aad97d 100644 --- a/test_runner/regress/test_compute_catalog.py +++ b/test_runner/regress/test_compute_catalog.py @@ -1,7 +1,9 @@ from __future__ import annotations +import logging + import requests -from fixtures.neon_fixtures import NeonEnv +from fixtures.neon_fixtures import NeonEnv, logical_replication_sync TEST_DB_NAMES = [ { @@ -136,3 +138,115 @@ def test_compute_create_databases(neon_simple_env: NeonEnv): assert curr_db is not None assert len(curr_db) 
== 1 assert curr_db[0] == db["name"] + + +def test_dropdb_with_subscription(neon_simple_env: NeonEnv): + """ + Test that compute_ctl can drop a database that has a logical replication subscription. + """ + env = neon_simple_env + + # Create and start endpoint so that neon_local put all the generated + # stuff into the spec.json file. + endpoint = env.endpoints.create_start("main") + + TEST_DB_NAMES = [ + { + "name": "neondb", + "owner": "cloud_admin", + }, + { + "name": "subscriber_db", + "owner": "cloud_admin", + }, + { + "name": "publisher_db", + "owner": "cloud_admin", + }, + ] + + # Update the spec.json file to create the databases + # and reconfigure the endpoint to apply the changes. + endpoint.respec_deep( + **{ + "skip_pg_catalog_updates": False, + "cluster": { + "databases": TEST_DB_NAMES, + }, + } + ) + endpoint.reconfigure() + + # connect to the publisher_db and create a publication + with endpoint.cursor(dbname="publisher_db") as cursor: + cursor.execute("CREATE PUBLICATION mypub FOR ALL TABLES") + cursor.execute("select pg_catalog.pg_create_logical_replication_slot('mysub', 'pgoutput');") + cursor.execute("CREATE TABLE t(a int)") + cursor.execute("INSERT INTO t VALUES (1)") + + # connect to the subscriber_db and create a subscription + # Note that we need to create subscription with + connstr = endpoint.connstr(dbname="publisher_db").replace("'", "''") + with endpoint.cursor(dbname="subscriber_db") as cursor: + cursor.execute("CREATE TABLE t(a int)") + cursor.execute( + f"CREATE SUBSCRIPTION mysub CONNECTION '{connstr}' PUBLICATION mypub WITH (create_slot = false) " + ) + + # wait for the subscription to be active + logical_replication_sync( + endpoint, endpoint, sub_dbname="subscriber_db", pub_dbname="publisher_db" + ) + + # Check that replication is working + with endpoint.cursor(dbname="subscriber_db") as cursor: + cursor.execute("SELECT * FROM t") + rows = cursor.fetchall() + assert len(rows) == 1 + assert rows[0][0] == 1 + + # drop the subscriber_db from the list + TEST_DB_NAMES_NEW = [ + { + "name": "neondb", + "owner": "cloud_admin", + }, + { + "name": "publisher_db", + "owner": "cloud_admin", + }, + ] + # Update the spec.json file to drop the database + # and reconfigure the endpoint to apply the changes. 
+ endpoint.respec_deep( + **{ + "skip_pg_catalog_updates": False, + "cluster": { + "databases": TEST_DB_NAMES_NEW, + }, + "delta_operations": [ + {"action": "delete_db", "name": "subscriber_db"}, + # also test the case when we try to delete a non-existent database + # shouldn't happen in normal operation, + # but can occur when failed operations are retried + {"action": "delete_db", "name": "nonexistent_db"}, + ], + } + ) + + logging.info("Reconfiguring the endpoint to drop the subscriber_db") + endpoint.reconfigure() + + # Check that the subscriber_db is dropped + with endpoint.cursor() as cursor: + cursor.execute("SELECT datname FROM pg_database WHERE datname = %s", ("subscriber_db",)) + catalog_db = cursor.fetchone() + assert catalog_db is None + + # Check that we can still connect to the publisher_db + with endpoint.cursor(dbname="publisher_db") as cursor: + cursor.execute("SELECT * FROM current_database()") + curr_db = cursor.fetchone() + assert curr_db is not None + assert len(curr_db) == 1 + assert curr_db[0] == "publisher_db" From fcfff724547cda260e884d2d680f199fdbc9471c Mon Sep 17 00:00:00 2001 From: Ivan Efremov Date: Wed, 8 Jan 2025 21:34:53 +0200 Subject: [PATCH 128/583] impr(proxy): Decouple ip_allowlist from the CancelClosure (#10199) This PR removes the direct dependency of the IP allowlist from CancelClosure, allowing for more scalable and flexible IP restrictions and enabling the future use of Redis-based CancelMap storage. Changes: - Introduce a new BackendAuth async trait that retrieves the IP allowlist through existing authentication methods; - Improve cancellation error handling by instrument() async cancel_sesion() rather than dropping it. - Set and store IP allowlist for SCRAM Proxy to consistently perform IP allowance check Relates to #9660 --- proxy/src/auth/backend/console_redirect.rs | 32 ++++- proxy/src/auth/backend/mod.rs | 47 ++++++-- proxy/src/bin/proxy.rs | 54 ++++++++- proxy/src/cancellation.rs | 111 +++++++++++++++++- proxy/src/compute.rs | 28 ++++- proxy/src/console_redirect_proxy.rs | 28 ++--- .../control_plane/client/cplane_proxy_v1.rs | 36 ++++-- proxy/src/proxy/mod.rs | 31 +++-- 8 files changed, 307 insertions(+), 60 deletions(-) diff --git a/proxy/src/auth/backend/console_redirect.rs b/proxy/src/auth/backend/console_redirect.rs index c3de77b352..dbfda588cc 100644 --- a/proxy/src/auth/backend/console_redirect.rs +++ b/proxy/src/auth/backend/console_redirect.rs @@ -1,16 +1,18 @@ use async_trait::async_trait; use postgres_client::config::SslMode; use pq_proto::BeMessage as Be; +use std::fmt; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, info_span}; -use super::ComputeCredentialKeys; +use super::{ComputeCredentialKeys, ControlPlaneApi}; +use crate::auth::backend::{BackendIpAllowlist, ComputeUserInfo}; use crate::auth::IpPattern; use crate::cache::Cached; use crate::config::AuthenticationConfig; use crate::context::RequestContext; -use crate::control_plane::{self, CachedNodeInfo, NodeInfo}; +use crate::control_plane::{self, client::cplane_proxy_v1, CachedNodeInfo, NodeInfo}; use crate::error::{ReportableError, UserFacingError}; use crate::proxy::connect_compute::ComputeConnectBackend; use crate::stream::PqStream; @@ -31,6 +33,13 @@ pub(crate) enum ConsoleRedirectError { #[derive(Debug)] pub struct ConsoleRedirectBackend { console_uri: reqwest::Url, + api: cplane_proxy_v1::NeonControlPlaneClient, +} + +impl fmt::Debug for cplane_proxy_v1::NeonControlPlaneClient { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result 
{ + write!(f, "NeonControlPlaneClient") + } } impl UserFacingError for ConsoleRedirectError { @@ -71,9 +80,24 @@ pub(crate) fn new_psql_session_id() -> String { hex::encode(rand::random::<[u8; 8]>()) } +#[async_trait] +impl BackendIpAllowlist for ConsoleRedirectBackend { + async fn get_allowed_ips( + &self, + ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> auth::Result> { + self.api + .get_allowed_ips_and_secret(ctx, user_info) + .await + .map(|(ips, _)| ips.as_ref().clone()) + .map_err(|e| e.into()) + } +} + impl ConsoleRedirectBackend { - pub fn new(console_uri: reqwest::Url) -> Self { - Self { console_uri } + pub fn new(console_uri: reqwest::Url, api: cplane_proxy_v1::NeonControlPlaneClient) -> Self { + Self { console_uri, api } } pub(crate) async fn authenticate( diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs index 0c9a7f7825..de48be2952 100644 --- a/proxy/src/auth/backend/mod.rs +++ b/proxy/src/auth/backend/mod.rs @@ -16,7 +16,9 @@ use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{debug, info, warn}; use crate::auth::credentials::check_peer_addr_is_in_list; -use crate::auth::{self, validate_password_and_exchange, AuthError, ComputeUserInfoMaybeEndpoint}; +use crate::auth::{ + self, validate_password_and_exchange, AuthError, ComputeUserInfoMaybeEndpoint, IpPattern, +}; use crate::cache::Cached; use crate::config::AuthenticationConfig; use crate::context::RequestContext; @@ -131,7 +133,7 @@ pub(crate) struct ComputeUserInfoNoEndpoint { pub(crate) options: NeonOptions, } -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Default)] pub(crate) struct ComputeUserInfo { pub(crate) endpoint: EndpointId, pub(crate) user: RoleName, @@ -244,6 +246,15 @@ impl AuthenticationConfig { } } +#[async_trait::async_trait] +pub(crate) trait BackendIpAllowlist { + async fn get_allowed_ips( + &self, + ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> auth::Result>; +} + /// True to its name, this function encapsulates our current auth trade-offs. /// Here, we choose the appropriate auth flow based on circumstances. /// @@ -256,7 +267,7 @@ async fn auth_quirks( allow_cleartext: bool, config: &'static AuthenticationConfig, endpoint_rate_limiter: Arc, -) -> auth::Result { +) -> auth::Result<(ComputeCredentials, Option>)> { // If there's no project so far, that entails that client doesn't // support SNI or other means of passing the endpoint (project) name. // We now expect to see a very specific payload in the place of password. @@ -315,7 +326,7 @@ async fn auth_quirks( ) .await { - Ok(keys) => Ok(keys), + Ok(keys) => Ok((keys, Some(allowed_ips.as_ref().clone()))), Err(e) => { if e.is_password_failed() { // The password could have been changed, so we invalidate the cache. 
@@ -385,7 +396,7 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint> { allow_cleartext: bool, config: &'static AuthenticationConfig, endpoint_rate_limiter: Arc, - ) -> auth::Result> { + ) -> auth::Result<(Backend<'a, ComputeCredentials>, Option>)> { let res = match self { Self::ControlPlane(api, user_info) => { debug!( @@ -394,7 +405,7 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint> { "performing authentication using the console" ); - let credentials = auth_quirks( + let (credentials, ip_allowlist) = auth_quirks( ctx, &*api, user_info, @@ -404,7 +415,7 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint> { endpoint_rate_limiter, ) .await?; - Backend::ControlPlane(api, credentials) + Ok((Backend::ControlPlane(api, credentials), ip_allowlist)) } Self::Local(_) => { return Err(auth::AuthError::bad_auth_method("invalid for local proxy")) @@ -413,7 +424,7 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint> { // TODO: replace with some metric info!("user successfully authenticated"); - Ok(res) + res } } @@ -441,6 +452,24 @@ impl Backend<'_, ComputeUserInfo> { } } +#[async_trait::async_trait] +impl BackendIpAllowlist for Backend<'_, ()> { + async fn get_allowed_ips( + &self, + ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> auth::Result> { + let auth_data = match self { + Self::ControlPlane(api, ()) => api.get_allowed_ips_and_secret(ctx, user_info).await, + Self::Local(_) => Ok((Cached::new_uncached(Arc::new(vec![])), None)), + }; + + auth_data + .map(|(ips, _)| ips.as_ref().clone()) + .map_err(|e| e.into()) + } +} + #[async_trait::async_trait] impl ComputeConnectBackend for Backend<'_, ComputeCredentials> { async fn wake_compute( @@ -786,7 +815,7 @@ mod tests { .await .unwrap(); - assert_eq!(creds.info.endpoint, "my-endpoint"); + assert_eq!(creds.0.info.endpoint, "my-endpoint"); handle.await.unwrap(); } diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 3b122d771c..70b50436bf 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -744,9 +744,59 @@ fn build_auth_backend( } AuthBackendType::ConsoleRedirect => { - let url = args.uri.parse()?; - let backend = ConsoleRedirectBackend::new(url); + let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; + let project_info_cache_config: ProjectInfoCacheOptions = + args.project_info_cache.parse()?; + let endpoint_cache_config: config::EndpointCacheConfig = + args.endpoint_cache_config.parse()?; + info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}"); + info!( + "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}" + ); + info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}"); + let caches = Box::leak(Box::new(control_plane::caches::ApiCaches::new( + wake_compute_cache_config, + project_info_cache_config, + endpoint_cache_config, + ))); + + let config::ConcurrencyLockOptions { + shards, + limiter, + epoch, + timeout, + } = args.wake_compute_lock.parse()?; + info!(?limiter, shards, ?epoch, "Using NodeLocks (wake_compute)"); + let locks = Box::leak(Box::new(control_plane::locks::ApiLocks::new( + "wake_compute_lock", + limiter, + shards, + timeout, + epoch, + &Metrics::get().wake_compute_lock, + )?)); + + let url = args.uri.clone().parse()?; + let ep_url: proxy::url::ApiUrl = args.auth_endpoint.parse()?; + let endpoint = http::Endpoint::new(ep_url, http::new_client()); + let mut wake_compute_rps_limit = args.wake_compute_limit.clone(); + RateBucketInfo::validate(&mut wake_compute_rps_limit)?; + let 
wake_compute_endpoint_rate_limiter = + Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit)); + + // Since we use only get_allowed_ips_and_secret() wake_compute_endpoint_rate_limiter + // and locks are not used in ConsoleRedirectBackend, + // but they are required by the NeonControlPlaneClient + let api = control_plane::client::cplane_proxy_v1::NeonControlPlaneClient::new( + endpoint, + args.control_plane_token.clone(), + caches, + locks, + wake_compute_endpoint_rate_limiter, + ); + + let backend = ConsoleRedirectBackend::new(url, api); let config = Box::leak(Box::new(backend)); Ok(Either::Right(config)) diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index df618cf242..a96c43f2ce 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -12,8 +12,10 @@ use tokio::sync::Mutex; use tracing::{debug, info}; use uuid::Uuid; -use crate::auth::{check_peer_addr_is_in_list, IpPattern}; +use crate::auth::backend::{BackendIpAllowlist, ComputeUserInfo}; +use crate::auth::{check_peer_addr_is_in_list, AuthError, IpPattern}; use crate::config::ComputeConfig; +use crate::context::RequestContext; use crate::error::ReportableError; use crate::ext::LockExt; use crate::metrics::{CancellationRequest, CancellationSource, Metrics}; @@ -56,6 +58,9 @@ pub(crate) enum CancelError { #[error("IP is not allowed")] IpNotAllowed, + + #[error("Authentication backend error")] + AuthError(#[from] AuthError), } impl ReportableError for CancelError { @@ -68,6 +73,7 @@ impl ReportableError for CancelError { CancelError::Postgres(_) => crate::error::ErrorKind::Compute, CancelError::RateLimit => crate::error::ErrorKind::RateLimit, CancelError::IpNotAllowed => crate::error::ErrorKind::User, + CancelError::AuthError(_) => crate::error::ErrorKind::ControlPlane, } } } @@ -102,10 +108,7 @@ impl CancellationHandler
{ } } - /// Try to cancel a running query for the corresponding connection. - /// If the cancellation key is not found, it will be published to Redis. - /// check_allowed - if true, check if the IP is allowed to cancel the query - /// return Result primarily for tests + /// Cancelling only in notification, will be removed pub(crate) async fn cancel_session( &self, key: CancelKeyData, @@ -134,7 +137,8 @@ impl CancellationHandler
{ } // NB: we should immediately release the lock after cloning the token. - let Some(cancel_closure) = self.map.get(&key).and_then(|x| x.clone()) else { + let cancel_state = self.map.get(&key).and_then(|x| x.clone()); + let Some(cancel_closure) = cancel_state else { tracing::warn!("query cancellation key not found: {key}"); Metrics::get() .proxy @@ -185,6 +189,96 @@ impl CancellationHandler
{ cancel_closure.try_cancel_query(self.compute_config).await } + /// Try to cancel a running query for the corresponding connection. + /// If the cancellation key is not found, it will be published to Redis. + /// check_allowed - if true, check if the IP is allowed to cancel the query. + /// Will fetch IP allowlist internally. + /// + /// return Result primarily for tests + pub(crate) async fn cancel_session_auth( + &self, + key: CancelKeyData, + ctx: RequestContext, + check_allowed: bool, + auth_backend: &T, + ) -> Result<(), CancelError> { + // TODO: check for unspecified address is only for backward compatibility, should be removed + if !ctx.peer_addr().is_unspecified() { + let subnet_key = match ctx.peer_addr() { + IpAddr::V4(ip) => IpNet::V4(Ipv4Net::new_assert(ip, 24).trunc()), // use defaut mask here + IpAddr::V6(ip) => IpNet::V6(Ipv6Net::new_assert(ip, 64).trunc()), + }; + if !self.limiter.lock_propagate_poison().check(subnet_key, 1) { + // log only the subnet part of the IP address to know which subnet is rate limited + tracing::warn!("Rate limit exceeded. Skipping cancellation message, {subnet_key}"); + Metrics::get() + .proxy + .cancellation_requests_total + .inc(CancellationRequest { + source: self.from, + kind: crate::metrics::CancellationOutcome::RateLimitExceeded, + }); + return Err(CancelError::RateLimit); + } + } + + // NB: we should immediately release the lock after cloning the token. + let cancel_state = self.map.get(&key).and_then(|x| x.clone()); + let Some(cancel_closure) = cancel_state else { + tracing::warn!("query cancellation key not found: {key}"); + Metrics::get() + .proxy + .cancellation_requests_total + .inc(CancellationRequest { + source: self.from, + kind: crate::metrics::CancellationOutcome::NotFound, + }); + + if ctx.session_id() == Uuid::nil() { + // was already published, do not publish it again + return Ok(()); + } + + match self + .client + .try_publish(key, ctx.session_id(), ctx.peer_addr()) + .await + { + Ok(()) => {} // do nothing + Err(e) => { + // log it here since cancel_session could be spawned in a task + tracing::error!("failed to publish cancellation key: {key}, error: {e}"); + return Err(CancelError::IO(std::io::Error::new( + std::io::ErrorKind::Other, + e.to_string(), + ))); + } + } + return Ok(()); + }; + + let ip_allowlist = auth_backend + .get_allowed_ips(&ctx, &cancel_closure.user_info) + .await + .map_err(CancelError::AuthError)?; + + if check_allowed && !check_peer_addr_is_in_list(&ctx.peer_addr(), &ip_allowlist) { + // log it here since cancel_session could be spawned in a task + tracing::warn!("IP is not allowed to cancel the query: {key}"); + return Err(CancelError::IpNotAllowed); + } + + Metrics::get() + .proxy + .cancellation_requests_total + .inc(CancellationRequest { + source: self.from, + kind: crate::metrics::CancellationOutcome::Found, + }); + info!("cancelling query per user's request using key {key}"); + cancel_closure.try_cancel_query(self.compute_config).await + } + #[cfg(test)] fn contains(&self, session: &Session
) -> bool { self.map.contains_key(&session.key) @@ -248,6 +342,7 @@ pub struct CancelClosure { cancel_token: CancelToken, ip_allowlist: Vec, hostname: String, // for pg_sni router + user_info: ComputeUserInfo, } impl CancelClosure { @@ -256,12 +351,14 @@ impl CancelClosure { cancel_token: CancelToken, ip_allowlist: Vec, hostname: String, + user_info: ComputeUserInfo, ) -> Self { Self { socket_addr, cancel_token, ip_allowlist, hostname, + user_info, } } /// Cancels the query running on user's compute node. @@ -288,6 +385,8 @@ impl CancelClosure { debug!("query was cancelled"); Ok(()) } + + /// Obsolete (will be removed after moving CancelMap to Redis), only for notifications pub(crate) fn set_ip_allowlist(&mut self, ip_allowlist: Vec) { self.ip_allowlist = ip_allowlist; } diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 89de6692ad..788bd63fee 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -13,6 +13,7 @@ use thiserror::Error; use tokio::net::TcpStream; use tracing::{debug, error, info, warn}; +use crate::auth::backend::ComputeUserInfo; use crate::auth::parse_endpoint_param; use crate::cancellation::CancelClosure; use crate::config::ComputeConfig; @@ -23,8 +24,10 @@ use crate::control_plane::messages::MetricsAuxInfo; use crate::error::{ReportableError, UserFacingError}; use crate::metrics::{Metrics, NumDbConnectionsGuard}; use crate::proxy::neon_option; +use crate::proxy::NeonOptions; use crate::tls::postgres_rustls::MakeRustlsConnect; use crate::types::Host; +use crate::types::{EndpointId, RoleName}; pub const COULD_NOT_CONNECT: &str = "Couldn't connect to compute node"; @@ -284,6 +287,28 @@ impl ConnCfg { self.0.get_ssl_mode() ); + let compute_info = match parameters.get("user") { + Some(user) => { + match parameters.get("database") { + Some(database) => { + ComputeUserInfo { + user: RoleName::from(user), + options: NeonOptions::default(), // just a shim, we don't need options + endpoint: EndpointId::from(database), + } + } + None => { + warn!("compute node didn't return database name"); + ComputeUserInfo::default() + } + } + } + None => { + warn!("compute node didn't return user name"); + ComputeUserInfo::default() + } + }; + // NB: CancelToken is supposed to hold socket_addr, but we use connect_raw. // Yet another reason to rework the connection establishing code. 
let cancel_closure = CancelClosure::new( @@ -294,8 +319,9 @@ impl ConnCfg { process_id, secret_key, }, - vec![], + vec![], // TODO: deprecated, will be removed host.to_string(), + compute_info, ); let connection = PostgresConnection { diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index 25a549039c..846f55f9e1 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -159,6 +159,7 @@ pub(crate) async fn handle_client( let request_gauge = metrics.connection_requests.guard(proto); let tls = config.tls_config.as_ref(); + let record_handshake_error = !ctx.has_private_peer_addr(); let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Client); let do_handshake = handshake(ctx, stream, tls, record_handshake_error); @@ -171,23 +172,20 @@ pub(crate) async fn handle_client( // spawn a task to cancel the session, but don't wait for it cancellations.spawn({ let cancellation_handler_clone = Arc::clone(&cancellation_handler); - let session_id = ctx.session_id(); - let peer_ip = ctx.peer_addr(); - let cancel_span = tracing::span!(parent: None, tracing::Level::INFO, "cancel_session", session_id = ?session_id); + let ctx = ctx.clone(); + let cancel_span = tracing::span!(parent: None, tracing::Level::INFO, "cancel_session", session_id = ?ctx.session_id()); cancel_span.follows_from(tracing::Span::current()); async move { - drop( - cancellation_handler_clone - .cancel_session( - cancel_key_data, - session_id, - peer_ip, - config.authentication_config.ip_allowlist_check_enabled, - ) - .instrument(cancel_span) - .await, - ); - } + cancellation_handler_clone + .cancel_session_auth( + cancel_key_data, + ctx, + config.authentication_config.ip_allowlist_check_enabled, + backend, + ) + .await + .inspect_err(|e | debug!(error = ?e, "cancel_session failed")).ok(); + }.instrument(cancel_span) }); return Ok(None); diff --git a/proxy/src/control_plane/client/cplane_proxy_v1.rs b/proxy/src/control_plane/client/cplane_proxy_v1.rs index 00038a6ac6..ece03156d1 100644 --- a/proxy/src/control_plane/client/cplane_proxy_v1.rs +++ b/proxy/src/control_plane/client/cplane_proxy_v1.rs @@ -29,7 +29,7 @@ use crate::rate_limiter::WakeComputeRateLimiter; use crate::types::{EndpointCacheKey, EndpointId}; use crate::{compute, http, scram}; -const X_REQUEST_ID: HeaderName = HeaderName::from_static("x-request-id"); +pub(crate) const X_REQUEST_ID: HeaderName = HeaderName::from_static("x-request-id"); #[derive(Clone)] pub struct NeonControlPlaneClient { @@ -78,15 +78,30 @@ impl NeonControlPlaneClient { info!("endpoint is not valid, skipping the request"); return Ok(AuthInfo::default()); } - let request_id = ctx.session_id().to_string(); - let application_name = ctx.console_application_name(); + self.do_get_auth_req(user_info, &ctx.session_id(), Some(ctx)) + .await + } + + async fn do_get_auth_req( + &self, + user_info: &ComputeUserInfo, + session_id: &uuid::Uuid, + ctx: Option<&RequestContext>, + ) -> Result { + let request_id: String = session_id.to_string(); + let application_name = if let Some(ctx) = ctx { + ctx.console_application_name() + } else { + "auth_cancellation".to_string() + }; + async { let request = self .endpoint .get_path("get_endpoint_access_control") .header(X_REQUEST_ID, &request_id) .header(AUTHORIZATION, format!("Bearer {}", &self.jwt)) - .query(&[("session_id", ctx.session_id())]) + .query(&[("session_id", session_id)]) .query(&[ ("application_name", application_name.as_str()), ("endpointish", user_info.endpoint.as_str()), @@ -96,9 +111,16 
@@ impl NeonControlPlaneClient { debug!(url = request.url().as_str(), "sending http request"); let start = Instant::now(); - let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane); - let response = self.endpoint.execute(request).await?; - drop(pause); + let response = match ctx { + Some(ctx) => { + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane); + let rsp = self.endpoint.execute(request).await; + drop(pause); + rsp? + } + None => self.endpoint.execute(request).await?, + }; + info!(duration = ?start.elapsed(), "received http response"); let body = match parse_body::(response).await { Ok(body) => body, diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index 3926c56fec..1f7dba2f9a 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -273,23 +273,20 @@ pub(crate) async fn handle_client( // spawn a task to cancel the session, but don't wait for it cancellations.spawn({ let cancellation_handler_clone = Arc::clone(&cancellation_handler); - let session_id = ctx.session_id(); - let peer_ip = ctx.peer_addr(); - let cancel_span = tracing::span!(parent: None, tracing::Level::INFO, "cancel_session", session_id = ?session_id); + let ctx = ctx.clone(); + let cancel_span = tracing::span!(parent: None, tracing::Level::INFO, "cancel_session", session_id = ?ctx.session_id()); cancel_span.follows_from(tracing::Span::current()); async move { - drop( - cancellation_handler_clone - .cancel_session( - cancel_key_data, - session_id, - peer_ip, - config.authentication_config.ip_allowlist_check_enabled, - ) - .instrument(cancel_span) - .await, - ); - } + cancellation_handler_clone + .cancel_session_auth( + cancel_key_data, + ctx, + config.authentication_config.ip_allowlist_check_enabled, + auth_backend, + ) + .await + .inspect_err(|e | debug!(error = ?e, "cancel_session failed")).ok(); + }.instrument(cancel_span) }); return Ok(None); @@ -315,7 +312,7 @@ pub(crate) async fn handle_client( }; let user = user_info.get_user().to_owned(); - let user_info = match user_info + let (user_info, ip_allowlist) = match user_info .authenticate( ctx, &mut stream, @@ -356,6 +353,8 @@ pub(crate) async fn handle_client( .or_else(|e| stream.throw_error(e)) .await?; + node.cancel_closure + .set_ip_allowlist(ip_allowlist.unwrap_or_default()); let session = cancellation_handler.get_session(); prepare_client_connection(&node, &session, &mut stream).await?; From 5b2751397dee2234bfc97d2241f05ab47b95f5ee Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Thu, 9 Jan 2025 01:05:07 -0600 Subject: [PATCH 129/583] Refactor MigrationRunner::run_migrations() to call a helper (#10232) This will make it easier to add per-db migrations, such as that for CVE-2024-4317. 
Link: https://www.postgresql.org/support/security/CVE-2024-4317/ Signed-off-by: Tristan Partin Signed-off-by: Tristan Partin --- compute_tools/src/migration.rs | 94 ++++++++++++++++------------------ 1 file changed, 44 insertions(+), 50 deletions(-) diff --git a/compute_tools/src/migration.rs b/compute_tools/src/migration.rs index 1f3de65806..45c33172f7 100644 --- a/compute_tools/src/migration.rs +++ b/compute_tools/src/migration.rs @@ -1,6 +1,6 @@ use anyhow::{Context, Result}; use fail::fail_point; -use postgres::Client; +use postgres::{Client, Transaction}; use tracing::info; /// Runs a series of migrations on a target database @@ -20,11 +20,9 @@ impl<'m> MigrationRunner<'m> { /// Get the current value neon_migration.migration_id fn get_migration_id(&mut self) -> Result { - let query = "SELECT id FROM neon_migration.migration_id"; let row = self .client - .query_one(query, &[]) - .context("run_migrations get migration_id")?; + .query_one("SELECT id FROM neon_migration.migration_id", &[])?; Ok(row.get::<&str, i64>("id")) } @@ -34,7 +32,7 @@ impl<'m> MigrationRunner<'m> { /// This function has a fail point called compute-migration, which can be /// used if you would like to fail the application of a series of migrations /// at some point. - fn update_migration_id(&mut self, migration_id: i64) -> Result<()> { + fn update_migration_id(txn: &mut Transaction, migration_id: i64) -> Result<()> { // We use this fail point in order to check that failing in the // middle of applying a series of migrations fails in an expected // manner @@ -55,12 +53,11 @@ impl<'m> MigrationRunner<'m> { } } - self.client - .query( - "UPDATE neon_migration.migration_id SET id = $1", - &[&migration_id], - ) - .context("run_migrations update id")?; + txn.query( + "UPDATE neon_migration.migration_id SET id = $1", + &[&migration_id], + ) + .with_context(|| format!("update neon_migration.migration_id to {migration_id}"))?; Ok(()) } @@ -81,53 +78,50 @@ impl<'m> MigrationRunner<'m> { Ok(()) } - /// Run the configrured set of migrations + /// Run an individual migration + fn run_migration(txn: &mut Transaction, migration_id: i64, migration: &str) -> Result<()> { + if migration.starts_with("-- SKIP") { + info!("Skipping migration id={}", migration_id); + + // Even though we are skipping the migration, updating the + // migration ID should help keep logic easy to understand when + // trying to understand the state of a cluster. + Self::update_migration_id(txn, migration_id)?; + } else { + info!("Running migration id={}:\n{}\n", migration_id, migration); + + txn.simple_query(migration) + .with_context(|| format!("apply migration {migration_id}"))?; + + Self::update_migration_id(txn, migration_id)?; + } + + Ok(()) + } + + /// Run the configured set of migrations pub fn run_migrations(mut self) -> Result<()> { - self.prepare_database()?; + self.prepare_database() + .context("prepare database to handle migrations")?; let mut current_migration = self.get_migration_id()? as usize; while current_migration < self.migrations.len() { - macro_rules! 
migration_id { - ($cm:expr) => { - ($cm + 1) as i64 - }; - } + // The index lags the migration ID by 1, so the current migration + // ID is also the next index + let migration_id = (current_migration + 1) as i64; - let migration = self.migrations[current_migration]; + let mut txn = self + .client + .transaction() + .with_context(|| format!("begin transaction for migration {migration_id}"))?; - if migration.starts_with("-- SKIP") { - info!("Skipping migration id={}", migration_id!(current_migration)); + Self::run_migration(&mut txn, migration_id, self.migrations[current_migration]) + .with_context(|| format!("running migration {migration_id}"))?; - // Even though we are skipping the migration, updating the - // migration ID should help keep logic easy to understand when - // trying to understand the state of a cluster. - self.update_migration_id(migration_id!(current_migration))?; - } else { - info!( - "Running migration id={}:\n{}\n", - migration_id!(current_migration), - migration - ); + txn.commit() + .with_context(|| format!("commit transaction for migration {migration_id}"))?; - self.client - .simple_query("BEGIN") - .context("begin migration")?; - - self.client.simple_query(migration).with_context(|| { - format!( - "run_migrations migration id={}", - migration_id!(current_migration) - ) - })?; - - self.update_migration_id(migration_id!(current_migration))?; - - self.client - .simple_query("COMMIT") - .context("commit migration")?; - - info!("Finished migration id={}", migration_id!(current_migration)); - } + info!("Finished migration id={}", migration_id); current_migration += 1; } From 5baa4e7f0a4821802dccd98e936b361cbc7a544b Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 9 Jan 2025 11:47:55 +0000 Subject: [PATCH 130/583] docker: don't set LD_LIBRARY_PATH (#10321) ## Problem This was causing storage controller to still use neon-built libpq instead of vanilla libpq. Since https://github.com/neondatabase/neon/pull/10269 we have a vanilla postgres in the system path -- anything that wants a postgres library will use that. ## Summary of changes - Remove LD_LIBRARY_PATH assignment in Dockerfile --- Dockerfile | 5 ----- 1 file changed, 5 deletions(-) diff --git a/Dockerfile b/Dockerfile index 2c157b3b2a..d3659f917a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -103,11 +103,6 @@ RUN mkdir -p /data/.neon/ && \ > /data/.neon/pageserver.toml && \ chown -R neon:neon /data/.neon -# When running a binary that links with libpq, default to using our most recent postgres version. Binaries -# that want a particular postgres version will select it explicitly: this is just a default. -ENV LD_LIBRARY_PATH=/usr/local/v${DEFAULT_PG_VERSION}/lib - - VOLUME ["/data"] USER neon EXPOSE 6400 From 030ab1c0e8dc9f4e20b118ee77361a4b34918652 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 9 Jan 2025 15:26:17 +0300 Subject: [PATCH 131/583] TLA+ spec for safekeeper membership change (#9966) ## Problem We want to define the algorithm for safekeeper membership change. ## Summary of changes Add spec for it, several models and logs of checking them. 
ref https://github.com/neondatabase/neon/issues/8699 --- .../spec/MCProposerAcceptorReconfig.tla | 41 ++ safekeeper/spec/MCProposerAcceptorStatic.tla | 3 + safekeeper/spec/ProposerAcceptorReconfig.tla | 350 ++++++++++++++++++ safekeeper/spec/ProposerAcceptorStatic.tla | 234 +++++++----- safekeeper/spec/modelcheck.sh | 3 + ...roposerAcceptorReconfig_p2_a2_t2_l2_c3.cfg | 21 ++ ...roposerAcceptorReconfig_p2_a2_t2_l2_c5.cfg | 19 + ...roposerAcceptorReconfig_p2_a3_t2_l2_c3.cfg | 20 + ...roposerAcceptorReconfig_p2_a4_t2_l2_c3.cfg | 19 + safekeeper/spec/remove_interm_progress.awk | 25 ++ safekeeper/spec/remove_interm_progress.sh | 3 + ...2_a2_t2_l2_c3.cfg-2024-12-11--04-24-12.log | 65 ++++ ...2_a2_t2_l2_c5.cfg-2024-12-11--04-26-11.log | 64 ++++ 13 files changed, 784 insertions(+), 83 deletions(-) create mode 100644 safekeeper/spec/MCProposerAcceptorReconfig.tla create mode 100644 safekeeper/spec/ProposerAcceptorReconfig.tla create mode 100644 safekeeper/spec/models/MCProposerAcceptorReconfig_p2_a2_t2_l2_c3.cfg create mode 100644 safekeeper/spec/models/MCProposerAcceptorReconfig_p2_a2_t2_l2_c5.cfg create mode 100644 safekeeper/spec/models/MCProposerAcceptorReconfig_p2_a3_t2_l2_c3.cfg create mode 100644 safekeeper/spec/models/MCProposerAcceptorReconfig_p2_a4_t2_l2_c3.cfg create mode 100644 safekeeper/spec/remove_interm_progress.awk create mode 100755 safekeeper/spec/remove_interm_progress.sh create mode 100644 safekeeper/spec/tlc-results/MCProposerAcceptorReconfig.tla-MCProposerAcceptorReconfig_p2_a2_t2_l2_c3.cfg-2024-12-11--04-24-12.log create mode 100644 safekeeper/spec/tlc-results/MCProposerAcceptorReconfig.tla-MCProposerAcceptorReconfig_p2_a2_t2_l2_c5.cfg-2024-12-11--04-26-11.log diff --git a/safekeeper/spec/MCProposerAcceptorReconfig.tla b/safekeeper/spec/MCProposerAcceptorReconfig.tla new file mode 100644 index 0000000000..a4b25e383a --- /dev/null +++ b/safekeeper/spec/MCProposerAcceptorReconfig.tla @@ -0,0 +1,41 @@ +---- MODULE MCProposerAcceptorReconfig ---- +EXTENDS TLC, ProposerAcceptorReconfig + +\* Augments the spec with model checking constraints. + +\* It slightly duplicates MCProposerAcceptorStatic, but we can't EXTENDS it +\* because it EXTENDS ProposerAcceptorStatic in turn. The duplication isn't big +\* anyway. + +\* For model checking. +CONSTANTS + max_entries, \* model constraint: max log entries acceptor/proposer can hold + max_term, \* model constraint: max allowed term + max_generation \* mode constraint: max config generation + +ASSUME max_entries \in Nat /\ max_term \in Nat /\ max_generation \in Nat + +\* Model space constraint. +StateConstraint == /\ \A p \in proposers: + /\ prop_state[p].term <= max_term + /\ Len(prop_state[p].wal) <= max_entries + /\ conf_store.generation <= max_generation + +\* Sets of proposers and acceptors and symmetric because we don't take any +\* actions depending on some concrete proposer/acceptor (like IF p = p1 THEN +\* ...) +ProposerAcceptorSymmetry == Permutations(proposers) \union Permutations(acceptors) + +\* enforce order of the vars in the error trace with ALIAS +\* Note that ALIAS is supported only since version 1.8.0 which is pre-release +\* as of writing this. 
+Alias == [ + prop_state |-> prop_state, + prop_conf |-> prop_conf, + acc_state |-> acc_state, + acc_conf |-> acc_conf, + committed |-> committed, + conf_store |-> conf_store + ] + +==== diff --git a/safekeeper/spec/MCProposerAcceptorStatic.tla b/safekeeper/spec/MCProposerAcceptorStatic.tla index be3d99c697..b4eca1965a 100644 --- a/safekeeper/spec/MCProposerAcceptorStatic.tla +++ b/safekeeper/spec/MCProposerAcceptorStatic.tla @@ -3,6 +3,9 @@ EXTENDS TLC, ProposerAcceptorStatic \* Augments the spec with model checking constraints. +\* Note that MCProposerAcceptorReconfig duplicates it and might need to +\* be updated as well. + \* For model checking. CONSTANTS max_entries, \* model constraint: max log entries acceptor/proposer can hold diff --git a/safekeeper/spec/ProposerAcceptorReconfig.tla b/safekeeper/spec/ProposerAcceptorReconfig.tla new file mode 100644 index 0000000000..78de231a39 --- /dev/null +++ b/safekeeper/spec/ProposerAcceptorReconfig.tla @@ -0,0 +1,350 @@ +---- MODULE ProposerAcceptorReconfig ---- + +(* + Spec for https://github.com/neondatabase/neon/blob/538e2312a617c65d489d391892c70b2e4d7407b5/docs/rfcs/035-safekeeper-dynamic-membership-change.md + + Simplifications: + - The ones inherited from ProposerAcceptorStatic. + - We don't model transient state of the configuration change driver process + (storage controller in the implementation). Its actions StartChange and FinishChange + are taken based on the persistent state of safekeepers and conf store. The + justification for that is the following: once new configuration n is + created (e.g with StartChange or FinishChange), any old configuration + change driver working on older conf < n will never be able to commit + it to the conf store because it is protected by CAS. The + propagation of these older confs is still possible though, and + spec allows to do it through acceptors. + Plus the model is already pretty huge. + - Previous point also means that the FinishChange action is + based only on the current state of safekeepers, not from + the past. That's ok because while individual + acceptor may go down, + quorum one never does. So the FinishChange + condition which collects max of the quorum may get + only more strict over time. + + The invariants expectedly break if any of FinishChange + required conditions are removed. +*) + +EXTENDS Integers, Sequences, FiniteSets, TLC + +VARIABLES + \* state which is the same in the static spec + prop_state, + acc_state, + committed, + elected_history, + \* reconfiguration only state + prop_conf, \* prop_conf[p] is current configuration of proposer p + acc_conf, \* acc_conf[a] is current configuration of acceptor a + conf_store \* configuration in the configuration store. + +CONSTANT + acceptors, + proposers + +CONSTANT NULL + +\* Import ProposerAcceptorStatic under PAS. +\* +\* Note that all vars and consts are named the same and thus substituted +\* implicitly. +PAS == INSTANCE ProposerAcceptorStatic + +\******************************************************************************** +\* Helpers +\******************************************************************************** + +\******************************************************************************** +\* Type assertion +\******************************************************************************** + +\* Is c a valid config? +IsConfig(c) == + /\ DOMAIN c = {"generation", "members", "newMembers"} + \* Unique id of the configuration. 
+ /\ c.generation \in Nat + /\ c.members \in SUBSET acceptors + \* newMembers is NULL when it is not a joint conf. + /\ \/ c.newMembers = NULL + \/ c.newMembers \in SUBSET acceptors + +TypeOk == + /\ PAS!TypeOk + /\ \A p \in proposers: IsConfig(prop_conf[p]) + /\ \A a \in acceptors: IsConfig(acc_conf[a]) + /\ IsConfig(conf_store) + +\******************************************************************************** +\* Initial +\******************************************************************************** + +Init == + /\ PAS!Init + /\ \E init_members \in SUBSET acceptors: + LET init_conf == [generation |-> 1, members |-> init_members, newMembers |-> NULL] IN + \* refer to RestartProposer why it is not NULL + /\ prop_conf = [p \in proposers |-> init_conf] + /\ acc_conf = [a \in acceptors |-> init_conf] + /\ conf_store = init_conf + \* We could start with anything, but to reduce state space state with + \* the most reasonable total acceptors - 1 conf size, which e.g. + \* makes basic {a1} -> {a2} change in {a1, a2} acceptors and {a1, a2, + \* a3} -> {a2, a3, a4} in {a1, a2, a3, a4} acceptors models even in + \* the smallest models with single change. + /\ Cardinality(init_members) = Cardinality(acceptors) - 1 + +\******************************************************************************** +\* Actions +\******************************************************************************** + +\* Proposer p loses all state, restarting. In the static spec we bump restarted +\* proposer term to max of some quorum + 1 which is a minimal term which can win +\* election. With reconfigurations it's harder to calculate such a term, so keep +\* it simple and take random acceptor one + 1. +\* +\* Also make proposer to adopt configuration of another random acceptor. In the +\* impl proposer starts with NULL configuration until handshake with first +\* acceptor. Removing this NULL special case makes the spec a bit simpler. +RestartProposer(p) == + /\ \E a \in acceptors: PAS!RestartProposerWithTerm(p, acc_state[a].term + 1) + /\ \E a \in acceptors: prop_conf' = [prop_conf EXCEPT ![p] = acc_conf[a]] + /\ UNCHANGED <> + +\* Acceptor a immediately votes for proposer p. +Vote(p, a) == + \* Configuration must be the same. + /\ prop_conf[p].generation = acc_conf[a].generation + \* And a is expected be a member of it. This is likely redundant as long as + \* becoming leader checks membership (though vote also contributes to max + \* calculation). + /\ \/ a \in prop_conf[p].members + \/ (prop_conf[p].newMembers /= NULL) /\ (a \in prop_conf[p].newMembers) + /\ PAS!Vote(p, a) + /\ UNCHANGED <> + +\* Proposer p gets elected. +BecomeLeader(p) == + /\ prop_state[p].state = "campaign" + \* Votes must form quorum in both sets (if the newMembers exists). + /\ PAS!FormsQuorum(DOMAIN prop_state[p].votes, prop_conf[p].members) + /\ \/ prop_conf[p].newMembers = NULL + \* TLA+ disjunction evaluation doesn't short-circuit for a good reason: + \* https://groups.google.com/g/tlaplus/c/U6tOJ4dsjVM/m/UdOznPCVBwAJ + \* so repeat the null check. + \/ (prop_conf[p].newMembers /= NULL) /\ (PAS!FormsQuorum(DOMAIN prop_state[p].votes, prop_conf[p].newMembers)) + \* DoBecomeLeader will copy WAL of the highest voter to proposer's WAL, so + \* ensure its conf is still the same. In the impl WAL fetching also has to + \* check the configuration. 
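To restate the election rule encoded above and continued just below: in a joint configuration the collected votes must form a majority of the old member set and, when newMembers is present, a majority of the new set as well, and only votes from actual members count. A minimal illustrative check, in Rust with made-up types rather than the spec's notation:

    use std::collections::BTreeSet;

    type NodeId = u32;

    // A set of votes forms a quorum in `members` if a strict majority of `members`
    // is among the voters; voters outside `members` do not count.
    fn forms_quorum(votes: &BTreeSet<NodeId>, members: &BTreeSet<NodeId>) -> bool {
        votes.intersection(members).count() >= members.len() / 2 + 1
    }

    // Election in a joint configuration needs a majority in every active member set.
    fn can_become_leader(
        votes: &BTreeSet<NodeId>,
        members: &BTreeSet<NodeId>,
        new_members: Option<&BTreeSet<NodeId>>,
    ) -> bool {
        forms_quorum(votes, members) && new_members.map_or(true, |nm| forms_quorum(votes, nm))
    }

    fn main() {
        let members = BTreeSet::from([1, 2, 3]);
        let new_members = BTreeSet::from([3, 4, 5]);
        // {2, 3, 4} is a majority of {1, 2, 3} (via {2, 3}) and of {3, 4, 5} (via {3, 4}).
        assert!(can_become_leader(&BTreeSet::from([2, 3, 4]), &members, Some(&new_members)));
        // {1, 2, 3} alone is not enough: only {3} intersects the new member set.
        assert!(!can_become_leader(&BTreeSet::from([1, 2, 3]), &members, Some(&new_members)));
    }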
+ /\ prop_conf[p].generation = acc_conf[PAS!MaxVoteAcc(p)].generation + /\ \A a \in DOMAIN prop_state[p].votes: prop_conf[p].generation = acc_conf[a].generation + /\ PAS!DoBecomeLeader(p) + /\ UNCHANGED <> + +UpdateTerm(p, a) == + /\ PAS!UpdateTerm(p, a) + /\ UNCHANGED <> + +TruncateWal(p, a) == + /\ prop_state[p].state = "leader" + \* Configuration must be the same. + /\ prop_conf[p].generation = acc_conf[a].generation + /\ PAS!TruncateWal(p, a) + /\ UNCHANGED <> + +NewEntry(p) == + /\ PAS!NewEntry(p) + /\ UNCHANGED <> + +AppendEntry(p, a) == + /\ prop_state[p].state = "leader" + \* Configuration must be the same. + /\ prop_conf[p].generation = acc_conf[a].generation + \* And a is member of it. Ignoring this likely wouldn't hurt, but not useful + \* either. + /\ \/ a \in prop_conf[p].members + \/ (prop_conf[p].newMembers /= NULL) /\ (a \in prop_conf[p].newMembers) + /\ PAS!AppendEntry(p, a) + /\ UNCHANGED <> + +\* see PAS!CommitEntries for comments. +CommitEntries(p) == + /\ prop_state[p].state = "leader" + /\ \E q1 \in PAS!AllMinQuorums(prop_conf[p].members): + LET q1_commit_lsn == PAS!QuorumCommitLsn(p, q1) IN + \* Configuration must be the same. + /\ \A a \in q1: prop_conf[p].generation = acc_conf[a].generation + /\ q1_commit_lsn /= NULL + \* We must collect acks from both quorums, if newMembers is present. + /\ IF prop_conf[p].newMembers = NULL THEN + PAS!DoCommitEntries(p, q1_commit_lsn) + ELSE + \E q2 \in PAS!AllMinQuorums(prop_conf[p].newMembers): + LET q2_commit_lsn == PAS!QuorumCommitLsn(p, q2) IN + \* Configuration must be the same. + /\ \A a \in q1: prop_conf[p].generation = acc_conf[a].generation + /\ q2_commit_lsn /= NULL + /\ PAS!DoCommitEntries(p, PAS!Min(q1_commit_lsn, q2_commit_lsn)) + /\ UNCHANGED <> + +\* Proposer p adopts higher conf c from conf store or from some acceptor. +ProposerSwitchConf(p) == + /\ \E c \in ({conf_store} \union {acc_conf[a]: a \in acceptors}): + \* p's conf is lower than c. + /\ (c.generation > prop_conf[p].generation) + \* We allow to bump conf without restart only when wp is already elected. + \* If it isn't, the votes it has already collected are from the previous + \* configuration and can't be used. + \* + \* So if proposer is in 'campaign' in the impl we would restart preserving + \* conf and increasing term. In the spec this transition is already covered + \* by more a generic RestartProposer, so we don't specify it here. + /\ prop_state[p].state = "leader" + /\ prop_conf' = [prop_conf EXCEPT ![p] = c] + /\ UNCHANGED <> + +\* Do CAS on the conf store, starting change into the new_members conf. +StartChange(new_members) == + \* Possible only if we don't already have the change in progress. + /\ conf_store.newMembers = NULL + \* Not necessary, but reduces space a bit. + /\ new_members /= conf_store.members + /\ conf_store' = [generation |-> conf_store.generation + 1, members |-> conf_store.members, newMembers |-> new_members] + /\ UNCHANGED <> + +\* Acceptor's last_log_term. +AccLastLogTerm(acc) == + PAS!LastLogTerm(PAS!AcceptorTermHistory(acc)) + +\* Do CAS on the conf store, transferring joint conf into the newMembers only. +FinishChange == + \* have joint conf + /\ conf_store.newMembers /= NULL + \* The conditions for finishing the change are: + /\ \E qo \in PAS!AllMinQuorums(conf_store.members): + \* 1) Old majority must be aware of the joint conf. + \* Note: generally the driver can't know current acceptor + \* generation, it can only know that it once had been the + \* expected one, but it might have advanced since then. 
+ \* But as explained at the top of the file if acceptor gen + \* advanced, FinishChange will never be able to complete + \* due to CAS anyway. We use strict equality here because + \* that's what makes sense conceptually (old driver should + \* abandon its attempt if it observes that conf has advanced). + /\ \A a \in qo: conf_store.generation = acc_conf[a].generation + \* 2) New member set must have log synced, i.e. some its majority needs + \* to have at least as high as max of some + \* old majority. + \* 3) Term must be synced, i.e. some majority of the new set must + \* have term >= than max term of some old majority. + \* This ensures that two leaders are never elected with the same + \* term even after config change (which would be bad unless we treat + \* generation as a part of term which we don't). + \* 4) A majority of the new set must be aware of the joint conf. + \* This allows to safely destoy acceptor state if it is not a + \* member of its current conf (which is useful for cleanup after + \* migration as well as for aborts). + /\ LET sync_pos == PAS!MaxTermLsn({[term |-> AccLastLogTerm(a), lsn |-> PAS!FlushLsn(a)]: a \in qo}) + sync_term == PAS!Maximum({acc_state[a].term: a \in qo}) + IN + \E qn \in PAS!AllMinQuorums(conf_store.newMembers): + \A a \in qn: + /\ PAS!TermLsnGE([term |-> AccLastLogTerm(a), lsn |-> PAS!FlushLsn(a)], sync_pos) + /\ acc_state[a].term >= sync_term + \* The same note as above about strict equality applies here. + /\ conf_store.generation = acc_conf[a].generation + /\ conf_store' = [generation |-> conf_store.generation + 1, members |-> conf_store.newMembers, newMembers |-> NULL] + /\ UNCHANGED <> + +\* Do CAS on the conf store, aborting the change in progress. +AbortChange == + \* have joint conf + /\ conf_store.newMembers /= NULL + /\ conf_store' = [generation |-> conf_store.generation + 1, members |-> conf_store.members, newMembers |-> NULL] + /\ UNCHANGED <> + +\* Acceptor a switches to higher configuration from the conf store +\* or from some proposer. +AccSwitchConf(a) == + /\ \E c \in ({conf_store} \union {prop_conf[p]: p \in proposers}): + /\ acc_conf[a].generation < c.generation + /\ acc_conf' = [acc_conf EXCEPT ![a] = c] + /\ UNCHANGED <> + +\* Nuke all acceptor state if it is not a member of its current conf. Models +\* cleanup after migration/abort. +AccReset(a) == + /\ \/ (acc_conf[a].newMembers = NULL) /\ (a \notin acc_conf[a].members) + \/ (acc_conf[a].newMembers /= NULL) /\ (a \notin (acc_conf[a].members \union acc_conf[a].newMembers)) + /\ acc_state' = [acc_state EXCEPT ![a] = PAS!InitAcc] + \* Set nextSendLsn to `a` to NULL everywhere. nextSendLsn serves as a mark + \* that elected proposer performed TruncateWal on the acceptor, which isn't + \* true anymore after state reset. In the impl local deletion is expected to + \* terminate all existing connections. 
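For concreteness, the FinishChange conditions above amount to the following predicate, sketched here in Rust with illustrative types (quorums are assumed non-empty, and (term, lsn) pairs compare lexicographically, matching TermLsnGE):

    type Term = u64;
    type Lsn = u64;

    #[derive(Clone, Copy)]
    struct AccState {
        generation: u64,     // configuration generation the acceptor is on
        term: Term,          // acceptor's current term
        last_log_term: Term, // term of the last record in its WAL
        flush_lsn: Lsn,      // end of the acceptor's WAL
    }

    /// Can the joint configuration with generation `store_generation` be collapsed,
    /// given the states of one old-set quorum and one candidate new-set quorum?
    fn finish_change_ok(
        store_generation: u64,
        old_quorum: &[AccState],
        new_quorum: &[AccState],
    ) -> bool {
        // 1) The old quorum must be exactly on the joint configuration.
        if !old_quorum.iter().all(|a| a.generation == store_generation) {
            return false;
        }
        // Position and term the new set has to catch up to.
        let sync_pos = old_quorum
            .iter()
            .map(|a| (a.last_log_term, a.flush_lsn))
            .max()
            .unwrap();
        let sync_term = old_quorum.iter().map(|a| a.term).max().unwrap();
        // 2)-4) New quorum: log synced, term synced, and aware of the joint conf.
        new_quorum.iter().all(|a| {
            (a.last_log_term, a.flush_lsn) >= sync_pos
                && a.term >= sync_term
                && a.generation == store_generation
        })
    }

Only when some old-set quorum and some new-set quorum satisfy this does the driver attempt the generation-guarded CAS that collapses the joint configuration.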
+ /\ prop_state' = [p \in proposers |-> [prop_state[p] EXCEPT !.nextSendLsn[a] = NULL]] + /\ UNCHANGED <> + +\******************************************************************************* +\* Final spec +\******************************************************************************* + +Next == + \/ \E p \in proposers: RestartProposer(p) + \/ \E p \in proposers: \E a \in acceptors: Vote(p, a) + \/ \E p \in proposers: BecomeLeader(p) + \/ \E p \in proposers: \E a \in acceptors: UpdateTerm(p, a) + \/ \E p \in proposers: \E a \in acceptors: TruncateWal(p, a) + \/ \E p \in proposers: NewEntry(p) + \/ \E p \in proposers: \E a \in acceptors: AppendEntry(p, a) + \/ \E p \in proposers: CommitEntries(p) + \/ \E new_members \in SUBSET acceptors: StartChange(new_members) + \/ FinishChange + \/ AbortChange + \/ \E p \in proposers: ProposerSwitchConf(p) + \/ \E a \in acceptors: AccSwitchConf(a) + \/ \E a \in acceptors: AccReset(a) + +Spec == Init /\ [][Next]_<> + +\******************************************************************************** +\* Invariants +\******************************************************************************** + +AllConfs == + {conf_store} \union {prop_conf[p]: p \in proposers} \union {acc_conf[a]: a \in acceptors} + +\* Fairly trivial (given the conf store) invariant that different configurations +\* with the same generation are never issued. +ConfigSafety == + \A c1, c2 \in AllConfs: + (c1.generation = c2.generation) => (c1 = c2) + +ElectionSafety == PAS!ElectionSafety + +ElectionSafetyFull == PAS!ElectionSafetyFull + +LogIsMonotonic == PAS!LogIsMonotonic + +LogSafety == PAS!LogSafety + +\******************************************************************************** +\* Invariants which don't need to hold, but useful for playing/debugging. +\******************************************************************************** + +\* Check that we ever switch into non joint conf. +MaxAccConf == ~ \E a \in acceptors: + /\ acc_conf[a].generation = 3 + /\ acc_conf[a].newMembers /= NULL + +CommittedNotTruncated == PAS!CommittedNotTruncated + +MaxTerm == PAS!MaxTerm + +MaxStoreConf == conf_store.generation <= 1 + +MaxAccWalLen == PAS!MaxAccWalLen + +MaxCommitLsn == PAS!MaxCommitLsn + +==== diff --git a/safekeeper/spec/ProposerAcceptorStatic.tla b/safekeeper/spec/ProposerAcceptorStatic.tla index b2d2f005db..fab085bc2e 100644 --- a/safekeeper/spec/ProposerAcceptorStatic.tla +++ b/safekeeper/spec/ProposerAcceptorStatic.tla @@ -18,7 +18,7 @@ \* - old WAL is immediately copied to proposer on its election, without on-demand fetch later. \* Some ideas how to break it to play around to get a feeling: -\* - replace Quorums with BadQuorums. +\* - replace Quorum with BadQuorum. \* - remove 'don't commit entries from previous terms separately' rule in \* CommitEntries and observe figure 8 from the raft paper. \* With p2a3t4l4 32 steps error was found in 1h on 80 cores. @@ -69,16 +69,26 @@ Upsert(f, k, v, l(_)) == \***************** -NumAccs == Cardinality(acceptors) +\* Does set of acceptors `acc_set` form the quorum in the member set `members`? +\* Acceptors not from `members` are excluded (matters only for reconfig). +FormsQuorum(acc_set, members) == + Cardinality(acc_set \intersect members) >= (Cardinality(members) \div 2 + 1) -\* does acc_set form the quorum? -Quorum(acc_set) == Cardinality(acc_set) >= (NumAccs \div 2 + 1) -\* all quorums of acceptors -Quorums == {subset \in SUBSET acceptors: Quorum(subset)} +\* Like FormsQuorum, but for minimal quorum. 
+FormsMinQuorum(acc_set, members) == + Cardinality(acc_set \intersect members) = (Cardinality(members) \div 2 + 1) -\* For substituting Quorums and seeing what happens. -BadQuorum(acc_set) == Cardinality(acc_set) >= (NumAccs \div 2) -BadQuorums == {subset \in SUBSET acceptors: BadQuorum(subset)} +\* All sets of acceptors forming minimal quorums in the member set `members`. +AllQuorums(members) == {subset \in SUBSET members: FormsQuorum(subset, members)} +AllMinQuorums(members) == {subset \in SUBSET acceptors: FormsMinQuorum(subset, members)} + +\* For substituting Quorum and seeing what happens. +FormsBadQuorum(acc_set, members) == + Cardinality(acc_set \intersect members) >= (Cardinality(members) \div 2) +FormsMinBadQuorum(acc_set, members) == + Cardinality(acc_set \intersect members) = (Cardinality(members) \div 2) +AllBadQuorums(members) == {subset \in SUBSET acceptors: FormsBadQuorum(subset, members)} +AllMinBadQuorums(members) == {subset \in SUBSET acceptors: FormsMinBadQuorum(subset, members)} \* flushLsn (end of WAL, i.e. index of next entry) of acceptor a. FlushLsn(a) == Len(acc_state[a].wal) + 1 @@ -135,10 +145,11 @@ TypeOk == /\ IsWal(prop_state[p].wal) \* Map of acceptor -> next lsn to send. It is set when truncate_wal is \* done so sending entries is allowed only after that. In the impl TCP - \* ensures this ordering. + \* ensures this ordering. We use NULL instead of missing value to use + \* EXCEPT in AccReset. /\ \A a \in DOMAIN prop_state[p].nextSendLsn: /\ a \in acceptors - /\ prop_state[p].nextSendLsn[a] \in Lsns + /\ prop_state[p].nextSendLsn[a] \in Lsns \union {NULL} /\ \A a \in acceptors: /\ DOMAIN acc_state[a] = {"term", "termHistory", "wal"} /\ acc_state[a].term \in Terms @@ -167,6 +178,19 @@ TypeOk == \* Initial \******************************************************************************** +InitAcc == + [ + \* There will be no leader in zero term, 1 is the first + \* real. + term |-> 0, + \* Again, leader in term 0 doesn't exist, but we initialize + \* term histories with it to always have common point in + \* them. Lsn is 1 because TLA+ sequences are indexed from 1 + \* (we don't want to truncate WAL out of range). + termHistory |-> << [term |-> 0, lsn |-> 1] >>, + wal |-> << >> + ] + Init == /\ prop_state = [p \in proposers |-> [ state |-> "campaign", @@ -174,19 +198,9 @@ Init == votes |-> EmptyF, termHistory |-> << >>, wal |-> << >>, - nextSendLsn |-> EmptyF + nextSendLsn |-> [a \in acceptors |-> NULL] ]] - /\ acc_state = [a \in acceptors |-> [ - \* There will be no leader in zero term, 1 is the first - \* real. - term |-> 0, - \* Again, leader in term 0 doesn't exist, but we initialize - \* term histories with it to always have common point in - \* them. Lsn is 1 because TLA+ sequences are indexed from 1 - \* (we don't want to truncate WAL out of range). - termHistory |-> << [term |-> 0, lsn |-> 1] >>, - wal |-> << >> - ]] + /\ acc_state = [a \in acceptors |-> InitAcc] /\ committed = {} /\ elected_history = EmptyF @@ -195,23 +209,35 @@ Init == \* Actions \******************************************************************************** -\* Proposer loses all state. +RestartProposerWithTerm(p, new_term) == + /\ prop_state' = [prop_state EXCEPT ![p].state = "campaign", + ![p].term = new_term, + ![p].votes = EmptyF, + ![p].termHistory = << >>, + ![p].wal = << >>, + ![p].nextSendLsn = [a \in acceptors |-> NULL]] + /\ UNCHANGED <> + +\* Proposer p loses all state, restarting. 
\* For simplicity (and to reduct state space), we assume it immediately gets \* current state from quorum q of acceptors determining the term he will request \* to vote for. -RestartProposer(p, q) == - /\ Quorum(q) - /\ LET new_term == Maximum({acc_state[a].term : a \in q}) + 1 IN - /\ prop_state' = [prop_state EXCEPT ![p].state = "campaign", - ![p].term = new_term, - ![p].votes = EmptyF, - ![p].termHistory = << >>, - ![p].wal = << >>, - ![p].nextSendLsn = EmptyF] - /\ UNCHANGED <> +RestartProposer(p) == + \E q \in AllQuorums(acceptors): + LET new_term == Maximum({acc_state[a].term : a \in q}) + 1 IN + RestartProposerWithTerm(p, new_term) \* Term history of acceptor a's WAL: the one saved truncated to contain only <= -\* local FlushLsn entries. +\* local FlushLsn entries. Note that FlushLsn is the end LSN of the last entry +\* (and begin LSN of the next). The mental model for non strict comparison is +\* that once proposer is elected it immediately writes log record with zero +\* length. This allows leader to commit existing log without writing any new +\* entries. For example, assume acceptor has WAL +\* 1.1, 1.2 +\* written by prop with term 1; its current +\* is <1, 3>. Now prop with term 2 and max vote from this acc is elected. +\* Once TruncateWAL is done, becomes <2, 3> +\* without any new records explicitly written. AcceptorTermHistory(a) == SelectSeq(acc_state[a].termHistory, LAMBDA th_entry: th_entry.lsn <= FlushLsn(a)) @@ -230,35 +256,52 @@ Vote(p, a) == \* Get lastLogTerm from term history th. LastLogTerm(th) == th[Len(th)].term +\* Compares pairs: returns true if tl1 >= tl2. +TermLsnGE(tl1, tl2) == + /\ tl1.term >= tl2.term + /\ (tl1.term = tl2.term => tl1.lsn >= tl2.lsn) + +\* Choose max pair in the non empty set of them. +MaxTermLsn(term_lsn_set) == + CHOOSE max_tl \in term_lsn_set: \A tl \in term_lsn_set: TermLsnGE(max_tl, tl) + +\* Find acceptor with the highest vote in proposer p's votes. +MaxVoteAcc(p) == + CHOOSE a \in DOMAIN prop_state[p].votes: + LET a_vote == prop_state[p].votes[a] + a_vote_term_lsn == [term |-> LastLogTerm(a_vote.termHistory), lsn |-> a_vote.flushLsn] + vote_term_lsns == {[term |-> LastLogTerm(v.termHistory), lsn |-> v.flushLsn]: v \in Range(prop_state[p].votes)} + IN + a_vote_term_lsn = MaxTermLsn(vote_term_lsns) + +\* Workhorse for BecomeLeader. +\* Assumes the check prop_state[p] votes is quorum has been done *outside*. +DoBecomeLeader(p) == + LET + \* Find acceptor with the highest vote. + max_vote_acc == MaxVoteAcc(p) + max_vote == prop_state[p].votes[max_vote_acc] + prop_th == Append(max_vote.termHistory, [term |-> prop_state[p].term, lsn |-> max_vote.flushLsn]) + IN + \* We copy all log preceding proposer's term from the max vote node so + \* make sure it is still on one term with us. This is a model + \* simplification which can be removed, in impl we fetch WAL on demand + \* from safekeeper which has it later. Note though that in case of on + \* demand fetch we must check on donor not only term match, but that + \* truncate_wal had already been done (if it is not max_vote_acc). + /\ acc_state[max_vote_acc].term = prop_state[p].term + /\ prop_state' = [prop_state EXCEPT ![p].state = "leader", + ![p].termHistory = prop_th, + ![p].wal = acc_state[max_vote_acc].wal + ] + /\ elected_history' = Upsert(elected_history, prop_state[p].term, 1, LAMBDA c: c + 1) + /\ UNCHANGED <> + \* Proposer p gets elected. BecomeLeader(p) == /\ prop_state[p].state = "campaign" - /\ Quorum(DOMAIN prop_state[p].votes) - /\ LET - \* Find acceptor with the highest vote. 
- max_vote_acc == - CHOOSE a \in DOMAIN prop_state[p].votes: - LET v == prop_state[p].votes[a] - IN \A v2 \in Range(prop_state[p].votes): - /\ LastLogTerm(v.termHistory) >= LastLogTerm(v2.termHistory) - /\ (LastLogTerm(v.termHistory) = LastLogTerm(v2.termHistory) => v.flushLsn >= v2.flushLsn) - max_vote == prop_state[p].votes[max_vote_acc] - prop_th == Append(max_vote.termHistory, [term |-> prop_state[p].term, lsn |-> max_vote.flushLsn]) - IN - \* We copy all log preceding proposer's term from the max vote node so - \* make sure it is still on one term with us. This is a model - \* simplification which can be removed, in impl we fetch WAL on demand - \* from safekeeper which has it later. Note though that in case of on - \* demand fetch we must check on donor not only term match, but that - \* truncate_wal had already been done (if it is not max_vote_acc). - /\ acc_state[max_vote_acc].term = prop_state[p].term - /\ prop_state' = [prop_state EXCEPT ![p].state = "leader", - ![p].termHistory = prop_th, - ![p].wal = acc_state[max_vote_acc].wal - ] - /\ elected_history' = Upsert(elected_history, prop_state[p].term, 1, LAMBDA c: c + 1) - /\ UNCHANGED <> - + /\ FormsQuorum(DOMAIN prop_state[p].votes, acceptors) + /\ DoBecomeLeader(p) \* Acceptor a learns about elected proposer p's term. In impl it matches to \* VoteRequest/VoteResponse exchange when leader is already elected and is not @@ -287,10 +330,11 @@ FindHighestCommonPoint(prop_th, acc_th, acc_flush_lsn) == IN [term |-> last_common_term, lsn |-> Min(acc_common_term_end, prop_common_term_end)] -\* Elected proposer p immediately truncates WAL (and term history) of acceptor a -\* before starting streaming. Establishes nextSendLsn for a. +\* Elected proposer p immediately truncates WAL (and sets term history) of +\* acceptor a before starting streaming. Establishes nextSendLsn for a. \* -\* In impl this happens at each reconnection, here we also allow to do it multiple times. +\* In impl this happens at each reconnection, here we also allow to do it +\* multiple times. TruncateWal(p, a) == /\ prop_state[p].state = "leader" /\ acc_state[a].term = prop_state[p].term @@ -321,8 +365,8 @@ NewEntry(p) == AppendEntry(p, a) == /\ prop_state[p].state = "leader" /\ acc_state[a].term = prop_state[p].term - /\ a \in DOMAIN prop_state[p].nextSendLsn \* did TruncateWal - /\ prop_state[p].nextSendLsn[a] <= Len(prop_state[p].wal) \* have smth to send + /\ prop_state[p].nextSendLsn[a] /= NULL \* did TruncateWal + /\ prop_state[p].nextSendLsn[a] <= Len(prop_state[p].wal) \* have smth to send /\ LET send_lsn == prop_state[p].nextSendLsn[a] entry == prop_state[p].wal[send_lsn] @@ -337,41 +381,65 @@ AppendEntry(p, a) == PropStartLsn(p) == IF prop_state[p].state = "leader" THEN prop_state[p].termHistory[Len(prop_state[p].termHistory)].lsn ELSE NULL -\* Proposer p commits all entries it can using quorum q. Note that unlike -\* will62794/logless-reconfig this allows to commit entries from previous terms -\* (when conditions for that are met). -CommitEntries(p, q) == - /\ prop_state[p].state = "leader" - /\ \A a \in q: +\* LSN which can be committed by proposer p using min quorum q (check that q +\* forms quorum must have been done outside). NULL if there is none. +QuorumCommitLsn(p, q) == + IF + /\ prop_state[p].state = "leader" + /\ \A a \in q: + \* Without explicit responses to appends this ensures that append + \* up to FlushLsn has been accepted. 
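The commit rule being reshaped here is the usual one: take the minimum flush LSN over a quorum whose members are on the leader's term and have been through TruncateWal, and only treat it as committable once it has reached the start LSN of the leader's own term, so entries from previous terms are never committed on their own. A minimal Rust illustration with hypothetical types, ignoring the nextSendLsn bookkeeping:

    type Lsn = u64;
    type Term = u64;

    struct AckedAcceptor {
        term: Term,     // acceptor's current term
        flush_lsn: Lsn, // how far its WAL is durably written
    }

    /// Commit LSN a leader at `leader_term` (whose term starts at `prop_start_lsn`)
    /// may advance to, given one quorum of acceptors; None if nothing is committable yet.
    fn quorum_commit_lsn(
        leader_term: Term,
        prop_start_lsn: Lsn,
        quorum: &[AckedAcceptor],
    ) -> Option<Lsn> {
        // Every member of the quorum must already be on the leader's term.
        if !quorum.iter().all(|a| a.term == leader_term) {
            return None;
        }
        let quorum_lsn = quorum.iter().map(|a| a.flush_lsn).min()?;
        // Raft's "don't commit previous-term entries separately" rule: the quorum
        // must have caught up to the leader's own term start first.
        (quorum_lsn >= prop_start_lsn).then_some(quorum_lsn)
    }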
/\ acc_state[a].term = prop_state[p].term \* nextSendLsn existence means TruncateWal has happened, it ensures \* acceptor's WAL (and FlushLsn) are from proper proposer's history. \* Alternatively we could compare LastLogTerm here, but that's closer to \* what we do in the impl (we check flushLsn in AppendResponse, but \* AppendRequest is processed only if HandleElected handling was good). - /\ a \in DOMAIN prop_state[p].nextSendLsn - \* Now find the LSN present on all the quorum. - /\ LET quorum_lsn == Minimum({FlushLsn(a): a \in q}) IN - \* This is the basic Raft rule of not committing entries from previous - \* terms except along with current term entry (commit them only when - \* quorum recovers, i.e. last_log_term on it reaches leader's term). - /\ quorum_lsn >= PropStartLsn(p) - /\ committed' = committed \cup {[term |-> prop_state[p].wal[lsn], lsn |-> lsn]: lsn \in 1..(quorum_lsn - 1)} - /\ UNCHANGED <> + /\ prop_state[p].nextSendLsn[a] /= NULL + THEN + \* Now find the LSN present on all the quorum. + LET quorum_lsn == Minimum({FlushLsn(a): a \in q}) IN + \* This is the basic Raft rule of not committing entries from previous + \* terms except along with current term entry (commit them only when + \* quorum recovers, i.e. last_log_term on it reaches leader's term). + IF quorum_lsn >= PropStartLsn(p) THEN + quorum_lsn + ELSE + NULL + ELSE + NULL + +\* Commit all entries on proposer p with record lsn < commit_lsn. +DoCommitEntries(p, commit_lsn) == + /\ committed' = committed \cup {[term |-> prop_state[p].wal[lsn], lsn |-> lsn]: lsn \in 1..(commit_lsn - 1)} + /\ UNCHANGED <> + +\* Proposer p commits all entries it can using some quorum. Note that unlike +\* will62794/logless-reconfig this allows to commit entries from previous terms +\* (when conditions for that are met). +CommitEntries(p) == + /\ prop_state[p].state = "leader" + \* Using min quorums here is better because 1) QuorumCommitLsn for + \* simplicity checks min across all accs in q. 2) it probably makes + \* evaluation faster. + /\ \E q \in AllMinQuorums(acceptors): + LET commit_lsn == QuorumCommitLsn(p, q) IN + /\ commit_lsn /= NULL + /\ DoCommitEntries(p, commit_lsn) \******************************************************************************* \* Final spec \******************************************************************************* Next == - \/ \E q \in Quorums: \E p \in proposers: RestartProposer(p, q) + \/ \E p \in proposers: RestartProposer(p) \/ \E p \in proposers: \E a \in acceptors: Vote(p, a) \/ \E p \in proposers: BecomeLeader(p) \/ \E p \in proposers: \E a \in acceptors: UpdateTerm(p, a) \/ \E p \in proposers: \E a \in acceptors: TruncateWal(p, a) \/ \E p \in proposers: NewEntry(p) \/ \E p \in proposers: \E a \in acceptors: AppendEntry(p, a) - \/ \E q \in Quorums: \E p \in proposers: CommitEntries(p, q) + \/ \E p \in proposers: CommitEntries(p) Spec == Init /\ [][Next]_<> diff --git a/safekeeper/spec/modelcheck.sh b/safekeeper/spec/modelcheck.sh index 21ead7dad8..0084a8c638 100755 --- a/safekeeper/spec/modelcheck.sh +++ b/safekeeper/spec/modelcheck.sh @@ -2,6 +2,7 @@ # Usage: ./modelcheck.sh , e.g. 
# ./modelcheck.sh models/MCProposerAcceptorStatic_p2_a3_t3_l3.cfg MCProposerAcceptorStatic.tla +# ./modelcheck.sh models/MCProposerAcceptorReconfig_p2_a3_t3_l3_c3.cfg MCProposerAcceptorReconfig.tla CONFIG=$1 SPEC=$2 @@ -12,6 +13,7 @@ mkdir -p "tlc-results" CONFIG_FILE=$(basename -- "$CONFIG") outfilename="$SPEC-${CONFIG_FILE}-$(date --utc +%Y-%m-%d--%H-%M-%S)".log outfile="tlc-results/$outfilename" +echo "saving results to $outfile" touch $outfile # Save some info about the run. @@ -45,5 +47,6 @@ echo "" >> $outfile # https://docs.tlapl.us/codebase:architecture#fingerprint_sets_fpsets # # Add -simulate to run in infinite simulation mode. +# -coverage 1 is useful for profiling (check how many times actions are taken). java -Xmx$MEM -XX:MaxDirectMemorySize=$MEM -XX:+UseParallelGC -Dtlc2.tool.fp.FPSet.impl=tlc2.tool.fp.OffHeapDiskFPSet \ -cp "${TOOLSPATH}" tlc2.TLC $SPEC -config $CONFIG -workers auto -gzip | tee -a $outfile diff --git a/safekeeper/spec/models/MCProposerAcceptorReconfig_p2_a2_t2_l2_c3.cfg b/safekeeper/spec/models/MCProposerAcceptorReconfig_p2_a2_t2_l2_c3.cfg new file mode 100644 index 0000000000..8d34751083 --- /dev/null +++ b/safekeeper/spec/models/MCProposerAcceptorReconfig_p2_a2_t2_l2_c3.cfg @@ -0,0 +1,21 @@ +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2} +max_term = 2 +max_entries = 2 +max_generation = 3 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ConfigSafety +ElectionSafetyFull +LogIsMonotonic +LogSafety +\* As its comment explains generally it is not expected to hold, but +\* in such small model it is true. +CommittedNotTruncated +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias diff --git a/safekeeper/spec/models/MCProposerAcceptorReconfig_p2_a2_t2_l2_c5.cfg b/safekeeper/spec/models/MCProposerAcceptorReconfig_p2_a2_t2_l2_c5.cfg new file mode 100644 index 0000000000..eb7e0768ff --- /dev/null +++ b/safekeeper/spec/models/MCProposerAcceptorReconfig_p2_a2_t2_l2_c5.cfg @@ -0,0 +1,19 @@ +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2} +max_term = 2 +max_entries = 2 +max_generation = 5 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ConfigSafety +ElectionSafetyFull +LogIsMonotonic +LogSafety +CommittedNotTruncated +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias diff --git a/safekeeper/spec/models/MCProposerAcceptorReconfig_p2_a3_t2_l2_c3.cfg b/safekeeper/spec/models/MCProposerAcceptorReconfig_p2_a3_t2_l2_c3.cfg new file mode 100644 index 0000000000..b5fae13880 --- /dev/null +++ b/safekeeper/spec/models/MCProposerAcceptorReconfig_p2_a3_t2_l2_c3.cfg @@ -0,0 +1,20 @@ +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2, a3} +max_term = 2 +max_entries = 2 +max_generation = 3 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ConfigSafety +ElectionSafetyFull +LogIsMonotonic +LogSafety +CommittedNotTruncated +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias + diff --git a/safekeeper/spec/models/MCProposerAcceptorReconfig_p2_a4_t2_l2_c3.cfg b/safekeeper/spec/models/MCProposerAcceptorReconfig_p2_a4_t2_l2_c3.cfg new file mode 100644 index 0000000000..71af9fa367 --- /dev/null +++ b/safekeeper/spec/models/MCProposerAcceptorReconfig_p2_a4_t2_l2_c3.cfg @@ -0,0 +1,19 @@ +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2, a3, a4} +max_term = 2 +max_entries = 2 +max_generation = 3 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ElectionSafetyFull +LogIsMonotonic +LogSafety 
+CommittedNotTruncated +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias + diff --git a/safekeeper/spec/remove_interm_progress.awk b/safekeeper/spec/remove_interm_progress.awk new file mode 100644 index 0000000000..6203f6fa4f --- /dev/null +++ b/safekeeper/spec/remove_interm_progress.awk @@ -0,0 +1,25 @@ +# Print all lines, but thin out lines starting with Progress: +# leave only first and last 5 ones in the beginning, and only 1 of 1440 +# of others (once a day). +# Also remove checkpointing logs. +{ + lines[NR] = $0 +} +$0 ~ /^Progress/ { + ++pcount +} +END { + progress_idx = 0 + for (i = 1; i <= NR; i++) { + if (lines[i] ~ /^Progress/) { + if (progress_idx < 5 || progress_idx >= pcount - 5 || progress_idx % 1440 == 0) { + print lines[i] + } + progress_idx++ + } + else if (lines[i] ~ /^Checkpointing/) {} + else { + print lines[i] + } + } +} \ No newline at end of file diff --git a/safekeeper/spec/remove_interm_progress.sh b/safekeeper/spec/remove_interm_progress.sh new file mode 100755 index 0000000000..a8724a2b92 --- /dev/null +++ b/safekeeper/spec/remove_interm_progress.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +awk -f remove_interm_progress.awk $1 > $1.thin \ No newline at end of file diff --git a/safekeeper/spec/tlc-results/MCProposerAcceptorReconfig.tla-MCProposerAcceptorReconfig_p2_a2_t2_l2_c3.cfg-2024-12-11--04-24-12.log b/safekeeper/spec/tlc-results/MCProposerAcceptorReconfig.tla-MCProposerAcceptorReconfig_p2_a2_t2_l2_c3.cfg-2024-12-11--04-24-12.log new file mode 100644 index 0000000000..8aac9eb58c --- /dev/null +++ b/safekeeper/spec/tlc-results/MCProposerAcceptorReconfig.tla-MCProposerAcceptorReconfig_p2_a2_t2_l2_c3.cfg-2024-12-11--04-24-12.log @@ -0,0 +1,65 @@ +git revision: 9e386917a +Platform: Linux neon-dev-arm64-1 6.8.0-49-generic #49-Ubuntu SMP PREEMPT_DYNAMIC Sun Nov 3 21:21:58 UTC 2024 aarch64 aarch64 aarch64 GNU/Linux +CPU Info Linux: Neoverse-N1 +CPU Cores Linux: 80 +CPU Info Mac: +CPU Cores Mac: +Spec: MCProposerAcceptorReconfig.tla +Config: models/MCProposerAcceptorReconfig_p2_a2_t2_l2_c3.cfg +---- +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2} +max_term = 2 +max_entries = 2 +max_generation = 3 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ElectionSafetyFull +LogIsMonotonic +LogSafety +\* CommittedNotTruncated +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias + +---- + +TLC2 Version 2.20 of Day Month 20?? (rev: f68cb71) +Running breadth-first search Model-Checking with fp 99 and seed -9189733667206762985 with 35 workers on 80 cores with 27307MB heap and 30720MB offheap memory [pid: 391272] (Linux 6.8.0-49-generic aarch64, Ubuntu 21.0.5 x86_64, OffHeapDiskFPSet, DiskStateQueue). 
+Parsing file /home/arseny/neon2/safekeeper/spec/MCProposerAcceptorReconfig.tla +Parsing file /tmp/tlc-3211535543066978921/TLC.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLC.tla) +Parsing file /home/arseny/neon2/safekeeper/spec/ProposerAcceptorReconfig.tla +Parsing file /tmp/tlc-3211535543066978921/_TLCTrace.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/_TLCTrace.tla) +Parsing file /tmp/tlc-3211535543066978921/Integers.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Integers.tla) +Parsing file /tmp/tlc-3211535543066978921/Sequences.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Sequences.tla) +Parsing file /tmp/tlc-3211535543066978921/FiniteSets.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/FiniteSets.tla) +Parsing file /tmp/tlc-3211535543066978921/Naturals.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Naturals.tla) +Parsing file /home/arseny/neon2/safekeeper/spec/ProposerAcceptorStatic.tla +Parsing file /tmp/tlc-3211535543066978921/TLCExt.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLCExt.tla) +Semantic processing of module Naturals +Semantic processing of module Sequences +Semantic processing of module FiniteSets +Semantic processing of module TLC +Semantic processing of module Integers +Semantic processing of module ProposerAcceptorStatic +Semantic processing of module ProposerAcceptorReconfig +Semantic processing of module TLCExt +Semantic processing of module _TLCTrace +Semantic processing of module MCProposerAcceptorReconfig +Starting... (2024-12-11 04:24:13) +Computing initial states... +Finished computing initial states: 2 states generated, with 1 of them distinct at 2024-12-11 04:24:15. +Progress(16) at 2024-12-11 04:24:18: 1,427,589 states generated (1,427,589 s/min), 142,472 distinct states found (142,472 ds/min), 47,162 states left on queue. +Model checking completed. No error has been found. + Estimates of the probability that TLC did not check all reachable states + because two distinct states had the same fingerprint: + calculated (optimistic): val = 1.0E-6 + based on the actual fingerprints: val = 4.2E-8 +17746857 states generated, 1121659 distinct states found, 0 states left on queue. +The depth of the complete state graph search is 37. +The average outdegree of the complete state graph is 1 (minimum is 0, the maximum 9 and the 95th percentile is 3). 
+Finished in 33s at (2024-12-11 04:24:46) diff --git a/safekeeper/spec/tlc-results/MCProposerAcceptorReconfig.tla-MCProposerAcceptorReconfig_p2_a2_t2_l2_c5.cfg-2024-12-11--04-26-11.log b/safekeeper/spec/tlc-results/MCProposerAcceptorReconfig.tla-MCProposerAcceptorReconfig_p2_a2_t2_l2_c5.cfg-2024-12-11--04-26-11.log new file mode 100644 index 0000000000..40e7611ae3 --- /dev/null +++ b/safekeeper/spec/tlc-results/MCProposerAcceptorReconfig.tla-MCProposerAcceptorReconfig_p2_a2_t2_l2_c5.cfg-2024-12-11--04-26-11.log @@ -0,0 +1,64 @@ +git revision: 9e386917a +Platform: Linux neon-dev-arm64-1 6.8.0-49-generic #49-Ubuntu SMP PREEMPT_DYNAMIC Sun Nov 3 21:21:58 UTC 2024 aarch64 aarch64 aarch64 GNU/Linux +CPU Info Linux: Neoverse-N1 +CPU Cores Linux: 80 +CPU Info Mac: +CPU Cores Mac: +Spec: MCProposerAcceptorReconfig.tla +Config: models/MCProposerAcceptorReconfig_p2_a2_t2_l2_c5.cfg +---- +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2} +max_term = 2 +max_entries = 2 +max_generation = 5 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ElectionSafetyFull +LogIsMonotonic +LogSafety +\* CommittedNotTruncated +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias + +---- + +TLC2 Version 2.20 of Day Month 20?? (rev: f68cb71) +Running breadth-first search Model-Checking with fp 114 and seed -8099467489737745861 with 35 workers on 80 cores with 27307MB heap and 30720MB offheap memory [pid: 392020] (Linux 6.8.0-49-generic aarch64, Ubuntu 21.0.5 x86_64, OffHeapDiskFPSet, DiskStateQueue). +Parsing file /home/arseny/neon2/safekeeper/spec/MCProposerAcceptorReconfig.tla +Parsing file /tmp/tlc-11757875725969857497/TLC.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLC.tla) +Parsing file /home/arseny/neon2/safekeeper/spec/ProposerAcceptorReconfig.tla +Parsing file /tmp/tlc-11757875725969857497/_TLCTrace.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/_TLCTrace.tla) +Parsing file /tmp/tlc-11757875725969857497/Integers.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Integers.tla) +Parsing file /tmp/tlc-11757875725969857497/Sequences.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Sequences.tla) +Parsing file /tmp/tlc-11757875725969857497/FiniteSets.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/FiniteSets.tla) +Parsing file /tmp/tlc-11757875725969857497/Naturals.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Naturals.tla) +Parsing file /home/arseny/neon2/safekeeper/spec/ProposerAcceptorStatic.tla +Parsing file /tmp/tlc-11757875725969857497/TLCExt.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLCExt.tla) +Semantic processing of module Naturals +Semantic processing of module Sequences +Semantic processing of module FiniteSets +Semantic processing of module TLC +Semantic processing of module Integers +Semantic processing of module ProposerAcceptorStatic +Semantic processing of module ProposerAcceptorReconfig +Semantic processing of module TLCExt +Semantic processing of module _TLCTrace +Semantic processing of module MCProposerAcceptorReconfig +Starting... (2024-12-11 04:26:12) +Computing initial states... +Finished computing initial states: 2 states generated, with 1 of them distinct at 2024-12-11 04:26:14. +Progress(14) at 2024-12-11 04:26:17: 1,519,385 states generated (1,519,385 s/min), 231,263 distinct states found (231,263 ds/min), 121,410 states left on queue. 
+Progress(20) at 2024-12-11 04:27:17: 42,757,204 states generated (41,237,819 s/min), 4,198,386 distinct states found (3,967,123 ds/min), 1,308,109 states left on queue. +Progress(22) at 2024-12-11 04:28:17: 83,613,929 states generated (40,856,725 s/min), 7,499,873 distinct states found (3,301,487 ds/min), 1,929,464 states left on queue. +Progress(23) at 2024-12-11 04:29:17: 124,086,758 states generated (40,472,829 s/min), 10,569,712 distinct states found (3,069,839 ds/min), 2,386,988 states left on queue. +Progress(24) at 2024-12-11 04:30:17: 163,412,538 states generated (39,325,780 s/min), 13,314,303 distinct states found (2,744,591 ds/min), 2,610,637 states left on queue. +Progress(25) at 2024-12-11 04:31:17: 202,643,708 states generated (39,231,170 s/min), 15,960,583 distinct states found (2,646,280 ds/min), 2,759,681 states left on queue. +Progress(26) at 2024-12-11 04:32:17: 240,681,633 states generated (38,037,925 s/min), 18,443,440 distinct states found (2,482,857 ds/min), 2,852,177 states left on queue. +Progress(27) at 2024-12-11 04:33:17: 278,559,134 states generated (37,877,501 s/min), 20,878,067 distinct states found (2,434,627 ds/min), 2,904,400 states left on queue. +Progress(28) at 2024-12-11 04:34:17: 316,699,911 states generated (38,140,777 s/min), 23,212,229 distinct states found (2,334,162 ds/min), 2,864,969 states left on queue. From f4739d49e3f20fae6a88b7a050084ab73c6319e4 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 9 Jan 2025 12:31:02 +0000 Subject: [PATCH 132/583] pageserver: tweak interpreted ingest record metrics (#10291) ## Problem The filtered record metric doesn't make sense for interpreted ingest. ## Summary of changes While of dubious utility in the first place, this patch replaces them with records received and records observed metrics for interpreted ingest: * received records cause the pageserver to do _something_: write a key, value pair to storage, update some metadata or flush pending modifications * observed records are a shard 0 concept and contain only key metadata used in tracking relation sizes (received records include observed records) --- libs/wal_decoder/src/models.rs | 8 +++ libs/wal_decoder/src/serialized_batch.rs | 5 ++ pageserver/src/metrics.rs | 10 ++- .../walreceiver/walreceiver_connection.rs | 72 +++++++------------ test_runner/regress/test_sharding.py | 54 +++++++++----- 5 files changed, 83 insertions(+), 66 deletions(-) diff --git a/libs/wal_decoder/src/models.rs b/libs/wal_decoder/src/models.rs index af22de5d95..6576dd0eba 100644 --- a/libs/wal_decoder/src/models.rs +++ b/libs/wal_decoder/src/models.rs @@ -95,6 +95,14 @@ impl InterpretedWalRecord { && self.metadata_record.is_none() && matches!(self.flush_uncommitted, FlushUncommittedRecords::No) } + + /// Checks if the WAL record is observed (i.e. 
contains only metadata + /// for observed values) + pub fn is_observed(&self) -> bool { + self.batch.is_observed() + && self.metadata_record.is_none() + && matches!(self.flush_uncommitted, FlushUncommittedRecords::No) + } } /// The interpreted part of the Postgres WAL record which requires metadata diff --git a/libs/wal_decoder/src/serialized_batch.rs b/libs/wal_decoder/src/serialized_batch.rs index 41294da7a0..af2b179e05 100644 --- a/libs/wal_decoder/src/serialized_batch.rs +++ b/libs/wal_decoder/src/serialized_batch.rs @@ -501,6 +501,11 @@ impl SerializedValueBatch { !self.has_data() && self.metadata.is_empty() } + /// Checks if the batch contains only observed values + pub fn is_observed(&self) -> bool { + !self.has_data() && !self.metadata.is_empty() + } + /// Checks if the batch contains data /// /// Note that if this returns false, it may still contain observed values or diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index b4e20cb8b9..e825d538a2 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -2337,13 +2337,15 @@ macro_rules! redo_bytes_histogram_count_buckets { pub(crate) struct WalIngestMetrics { pub(crate) bytes_received: IntCounter, pub(crate) records_received: IntCounter, + pub(crate) records_observed: IntCounter, pub(crate) records_committed: IntCounter, pub(crate) records_filtered: IntCounter, pub(crate) gap_blocks_zeroed_on_rel_extend: IntCounter, pub(crate) clear_vm_bits_unknown: IntCounterVec, } -pub(crate) static WAL_INGEST: Lazy = Lazy::new(|| WalIngestMetrics { +pub(crate) static WAL_INGEST: Lazy = Lazy::new(|| { + WalIngestMetrics { bytes_received: register_int_counter!( "pageserver_wal_ingest_bytes_received", "Bytes of WAL ingested from safekeepers", @@ -2354,6 +2356,11 @@ pub(crate) static WAL_INGEST: Lazy = Lazy::new(|| WalIngestMet "Number of WAL records received from safekeepers" ) .expect("failed to define a metric"), + records_observed: register_int_counter!( + "pageserver_wal_ingest_records_observed", + "Number of WAL records observed from safekeepers. These are metadata only records for shard 0." 
+ ) + .expect("failed to define a metric"), records_committed: register_int_counter!( "pageserver_wal_ingest_records_committed", "Number of WAL records which resulted in writes to pageserver storage" @@ -2375,6 +2382,7 @@ pub(crate) static WAL_INGEST: Lazy = Lazy::new(|| WalIngestMet &["entity"], ) .expect("failed to define a metric"), +} }); pub(crate) static PAGESERVER_TIMELINE_WAL_RECORDS_RECEIVED: Lazy = Lazy::new(|| { diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 3f10eeda60..d74faa1af5 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -319,27 +319,11 @@ pub(super) async fn handle_walreceiver_connection( return Ok(()); } - async fn commit( - modification: &mut DatadirModification<'_>, - uncommitted: &mut u64, - filtered: &mut u64, - ctx: &RequestContext, - ) -> anyhow::Result<()> { - WAL_INGEST - .records_committed - .inc_by(*uncommitted - *filtered); - modification.commit(ctx).await?; - *uncommitted = 0; - *filtered = 0; - Ok(()) - } - let status_update = match replication_message { ReplicationMessage::RawInterpretedWalRecords(raw) => { WAL_INGEST.bytes_received.inc_by(raw.data().len() as u64); let mut uncommitted_records = 0; - let mut filtered_records = 0; // This is the end LSN of the raw WAL from which the records // were interpreted. @@ -380,31 +364,23 @@ pub(super) async fn handle_walreceiver_connection( if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes) && uncommitted_records > 0 { - commit( - &mut modification, - &mut uncommitted_records, - &mut filtered_records, - &ctx, - ) - .await?; + modification.commit(&ctx).await?; + uncommitted_records = 0; } let local_next_record_lsn = interpreted.next_record_lsn; - let ingested = walingest + + if interpreted.is_observed() { + WAL_INGEST.records_observed.inc(); + } + + walingest .ingest_record(interpreted, &mut modification, &ctx) .await .with_context(|| { format!("could not ingest record at {local_next_record_lsn}") })?; - if !ingested { - tracing::debug!( - "ingest: filtered out record @ LSN {local_next_record_lsn}" - ); - WAL_INGEST.records_filtered.inc(); - filtered_records += 1; - } - uncommitted_records += 1; // FIXME: this cannot be made pausable_failpoint without fixing the @@ -418,13 +394,8 @@ pub(super) async fn handle_walreceiver_connection( || modification.approx_pending_bytes() > DatadirModification::MAX_PENDING_BYTES { - commit( - &mut modification, - &mut uncommitted_records, - &mut filtered_records, - &ctx, - ) - .await?; + modification.commit(&ctx).await?; + uncommitted_records = 0; } } @@ -442,13 +413,7 @@ pub(super) async fn handle_walreceiver_connection( if uncommitted_records > 0 || needs_last_record_lsn_advance { // Commit any uncommitted records - commit( - &mut modification, - &mut uncommitted_records, - &mut filtered_records, - &ctx, - ) - .await?; + modification.commit(&ctx).await?; } if !caught_up && streaming_lsn >= end_of_wal { @@ -469,6 +434,21 @@ pub(super) async fn handle_walreceiver_connection( } ReplicationMessage::XLogData(xlog_data) => { + async fn commit( + modification: &mut DatadirModification<'_>, + uncommitted: &mut u64, + filtered: &mut u64, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + WAL_INGEST + .records_committed + .inc_by(*uncommitted - *filtered); + modification.commit(ctx).await?; + *uncommitted = 0; + *filtered = 0; + Ok(()) + } + // 
Pass the WAL data to the decoder, and see if we can decode // more records as a result. let data = xlog_data.data(); diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 4c381b563f..673904a1cd 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -561,11 +561,17 @@ def test_sharding_split_smoke( workload.write_rows(256) # Note which pageservers initially hold a shard after tenant creation - pre_split_pageserver_ids = [loc["node_id"] for loc in env.storage_controller.locate(tenant_id)] - log.info("Pre-split pageservers: {pre_split_pageserver_ids}") + pre_split_pageserver_ids = dict() + for loc in env.storage_controller.locate(tenant_id): + shard_no = TenantShardId.parse(loc["shard_id"]).shard_number + pre_split_pageserver_ids[loc["node_id"]] = shard_no + log.info(f"Pre-split pageservers: {pre_split_pageserver_ids}") # For pageservers holding a shard, validate their ingest statistics # reflect a proper splitting of the WAL. + + observed_on_shard_zero = 0 + received_on_non_zero_shard = 0 for pageserver in env.pageservers: if pageserver.id not in pre_split_pageserver_ids: continue @@ -573,28 +579,38 @@ def test_sharding_split_smoke( metrics = pageserver.http_client().get_metrics_values( [ "pageserver_wal_ingest_records_received_total", - "pageserver_wal_ingest_records_committed_total", - "pageserver_wal_ingest_records_filtered_total", + "pageserver_wal_ingest_records_observed_total", ] ) log.info(f"Pageserver {pageserver.id} metrics: {metrics}") - # Not everything received was committed - assert ( - metrics["pageserver_wal_ingest_records_received_total"] - > metrics["pageserver_wal_ingest_records_committed_total"] - ) + received = metrics["pageserver_wal_ingest_records_received_total"] + observed = metrics["pageserver_wal_ingest_records_observed_total"] - # Something was committed - assert metrics["pageserver_wal_ingest_records_committed_total"] > 0 + shard_number: int | None = pre_split_pageserver_ids.get(pageserver.id, None) + if shard_number is None: + assert received == 0 + assert observed == 0 + elif shard_number == 0: + # Shard 0 receives its own records and observes records of other shards + # for relation size tracking. + assert observed > 0 + assert received > 0 + observed_on_shard_zero = int(observed) + else: + # Non zero shards do not observe any records, but only receive their own. + assert observed == 0 + assert received > 0 + received_on_non_zero_shard += int(received) - # Counts are self consistent - assert ( - metrics["pageserver_wal_ingest_records_received_total"] - == metrics["pageserver_wal_ingest_records_committed_total"] - + metrics["pageserver_wal_ingest_records_filtered_total"] - ) + # Some records are sent to multiple shards and some shard 0 records include both value observations + # and other metadata. Hence, we do a sanity check below that shard 0 observes the majority of records + # received by other shards. + assert ( + observed_on_shard_zero <= received_on_non_zero_shard + and observed_on_shard_zero >= received_on_non_zero_shard // 2 + ) # TODO: validate that shards have different sizes @@ -633,7 +649,7 @@ def test_sharding_split_smoke( # We should have split into 8 shards, on the same 4 pageservers we started on. 
assert len(post_split_pageserver_ids) == split_shard_count assert len(set(post_split_pageserver_ids)) == shard_count - assert set(post_split_pageserver_ids) == set(pre_split_pageserver_ids) + assert set(post_split_pageserver_ids) == set(pre_split_pageserver_ids.keys()) # The old parent shards should no longer exist on disk assert not shards_on_disk(old_shard_ids) @@ -739,7 +755,7 @@ def test_sharding_split_smoke( # all the pageservers that originally held an attached shard should still hold one, otherwise # it would indicate that we had done some unnecessary migration. log.info(f"attached: {attached}") - for ps_id in pre_split_pageserver_ids: + for ps_id in pre_split_pageserver_ids.keys(): log.info("Pre-split pageserver {ps_id} should still hold an attached location") assert ps_id in attached From 20c40eb733f401903adfa1c1da47dbed54098cae Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 9 Jan 2025 15:12:04 +0200 Subject: [PATCH 133/583] Add response tag to getpage request in V3 protocol version (#8686) ## Problem We have several serious data corruption incidents caused by mismatch of get-age requests: https://neondb.slack.com/archives/C07FJS4QF7V/p1723032720164359 We hope that the problem is fixed now. But it is better to prevent such kind of problems in future. Part of https://github.com/neondatabase/cloud/issues/16472 ## Summary of changes This PR introduce new V3 version of compute<->pageserver protocol, adding tag to getpage response. So now compute is able to check if it really gets response to the requested page. ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik Co-authored-by: Heikki Linnakangas --- libs/pageserver_api/src/models.rs | 395 ++++++++++++---- pageserver/client/src/page_service.rs | 2 +- .../pagebench/src/cmd/getpage_latest_lsn.rs | 15 +- pageserver/src/metrics.rs | 1 + pageserver/src/page_service.rs | 181 ++++--- pgxn/neon/libpagestore.c | 9 +- pgxn/neon/pagestore_client.h | 51 +- pgxn/neon/pagestore_smgr.c | 445 ++++++++++++------ 8 files changed, 784 insertions(+), 315 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index f3fc9fad76..39390d7647 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -1460,75 +1460,91 @@ impl TryFrom for PagestreamBeMessageTag { // interface allows sending both LSNs, and let the pageserver do the right thing. There was no // difference in the responses between V1 and V2. // -#[derive(Clone, Copy)] +// V3 version of protocol adds request ID to all requests. This request ID is also included in response +// as well as other fields from requests, which allows to verify that we receive response for our request. +// We copy fields from request to response to make checking more reliable: request ID is formed from process ID +// and local counter, so in principle there can be duplicated requests IDs if process PID is reused. 
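A minimal, self-contained Rust sketch of the verification idea described in the comment above, using simplified stand-in types (Lsn reduced to a plain u64, page bytes omitted); the patch's real types, PagestreamRequest and the per-message request/response structs, follow below. It only illustrates how echoing the request header and page coordinates lets the compute side reject a response that does not answer its own request.

    // Hypothetical, simplified types for illustration only.
    #[derive(Clone, Copy, PartialEq, Eq, Debug)]
    struct ReqHeader {
        reqid: u64,              // (backend pid << 32) | per-backend counter
        request_lsn: u64,
        not_modified_since: u64,
    }

    #[derive(Clone, Copy, PartialEq, Eq, Debug)]
    struct PageCoords {
        spcnode: u32,
        dbnode: u32,
        relnode: u32,
        forknum: u8,
        blkno: u32,
    }

    struct GetPageRequest {
        hdr: ReqHeader,
        page: PageCoords,
    }

    struct GetPageResponse {
        echoed_hdr: ReqHeader,
        echoed_page: PageCoords,
        // ...page payload would follow here
    }

    // Accept a response only if every field the server echoed back matches
    // what was actually sent.
    fn response_matches(req: &GetPageRequest, resp: &GetPageResponse) -> bool {
        resp.echoed_hdr == req.hdr && resp.echoed_page == req.page
    }

    fn main() {
        let hdr = ReqHeader {
            reqid: (4242u64 << 32) | 1,
            request_lsn: 0x16B_9188,
            not_modified_since: 0x16B_9188,
        };
        let page = PageCoords { spcnode: 1663, dbnode: 5, relnode: 16384, forknum: 0, blkno: 7 };
        let req = GetPageRequest { hdr, page };
        // A well-behaved pageserver copies the header and coordinates verbatim.
        let resp = GetPageResponse { echoed_hdr: hdr, echoed_page: page };
        assert!(response_matches(&req, &resp));
    }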
+// +#[derive(Debug, PartialEq, Eq, Clone, Copy)] pub enum PagestreamProtocolVersion { V2, + V3, } -#[derive(Debug, PartialEq, Eq)] +pub type RequestId = u64; + +#[derive(Debug, PartialEq, Eq, Clone, Copy)] +pub struct PagestreamRequest { + pub reqid: RequestId, + pub request_lsn: Lsn, + pub not_modified_since: Lsn, +} + +#[derive(Debug, PartialEq, Eq, Clone, Copy)] pub struct PagestreamExistsRequest { - pub request_lsn: Lsn, - pub not_modified_since: Lsn, + pub hdr: PagestreamRequest, pub rel: RelTag, } -#[derive(Debug, PartialEq, Eq)] +#[derive(Debug, PartialEq, Eq, Clone, Copy)] pub struct PagestreamNblocksRequest { - pub request_lsn: Lsn, - pub not_modified_since: Lsn, + pub hdr: PagestreamRequest, pub rel: RelTag, } -#[derive(Debug, PartialEq, Eq)] +#[derive(Debug, PartialEq, Eq, Clone, Copy)] pub struct PagestreamGetPageRequest { - pub request_lsn: Lsn, - pub not_modified_since: Lsn, + pub hdr: PagestreamRequest, pub rel: RelTag, pub blkno: u32, } -#[derive(Debug, PartialEq, Eq)] +#[derive(Debug, PartialEq, Eq, Clone, Copy)] pub struct PagestreamDbSizeRequest { - pub request_lsn: Lsn, - pub not_modified_since: Lsn, + pub hdr: PagestreamRequest, pub dbnode: u32, } -#[derive(Debug, PartialEq, Eq)] +#[derive(Debug, PartialEq, Eq, Clone, Copy)] pub struct PagestreamGetSlruSegmentRequest { - pub request_lsn: Lsn, - pub not_modified_since: Lsn, + pub hdr: PagestreamRequest, pub kind: u8, pub segno: u32, } #[derive(Debug)] pub struct PagestreamExistsResponse { + pub req: PagestreamExistsRequest, pub exists: bool, } #[derive(Debug)] pub struct PagestreamNblocksResponse { + pub req: PagestreamNblocksRequest, pub n_blocks: u32, } #[derive(Debug)] pub struct PagestreamGetPageResponse { + pub req: PagestreamGetPageRequest, pub page: Bytes, } #[derive(Debug)] pub struct PagestreamGetSlruSegmentResponse { + pub req: PagestreamGetSlruSegmentRequest, pub segment: Bytes, } #[derive(Debug)] pub struct PagestreamErrorResponse { + pub req: PagestreamRequest, pub message: String, } #[derive(Debug)] pub struct PagestreamDbSizeResponse { + pub req: PagestreamDbSizeRequest, pub db_size: i64, } @@ -1545,15 +1561,16 @@ pub struct TenantHistorySize { impl PagestreamFeMessage { /// Serialize a compute -> pageserver message. This is currently only used in testing - /// tools. Always uses protocol version 2. + /// tools. Always uses protocol version 3. 
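// Wire framing implemented by serialize()/parse() below: every request starts with
// a one-byte message tag; in V3 the tag is followed by a u64 request id and then the
// two u64 LSNs (request_lsn, not_modified_since), while V2 goes straight to the LSNs.
// The message-specific fields (RelTag plus block number for GetPage, dbnode for DbSize,
// SLRU kind/segno for GetSlruSegment) come after this common header.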
pub fn serialize(&self) -> Bytes { let mut bytes = BytesMut::new(); match self { Self::Exists(req) => { bytes.put_u8(0); - bytes.put_u64(req.request_lsn.0); - bytes.put_u64(req.not_modified_since.0); + bytes.put_u64(req.hdr.reqid); + bytes.put_u64(req.hdr.request_lsn.0); + bytes.put_u64(req.hdr.not_modified_since.0); bytes.put_u32(req.rel.spcnode); bytes.put_u32(req.rel.dbnode); bytes.put_u32(req.rel.relnode); @@ -1562,8 +1579,9 @@ impl PagestreamFeMessage { Self::Nblocks(req) => { bytes.put_u8(1); - bytes.put_u64(req.request_lsn.0); - bytes.put_u64(req.not_modified_since.0); + bytes.put_u64(req.hdr.reqid); + bytes.put_u64(req.hdr.request_lsn.0); + bytes.put_u64(req.hdr.not_modified_since.0); bytes.put_u32(req.rel.spcnode); bytes.put_u32(req.rel.dbnode); bytes.put_u32(req.rel.relnode); @@ -1572,8 +1590,9 @@ impl PagestreamFeMessage { Self::GetPage(req) => { bytes.put_u8(2); - bytes.put_u64(req.request_lsn.0); - bytes.put_u64(req.not_modified_since.0); + bytes.put_u64(req.hdr.reqid); + bytes.put_u64(req.hdr.request_lsn.0); + bytes.put_u64(req.hdr.not_modified_since.0); bytes.put_u32(req.rel.spcnode); bytes.put_u32(req.rel.dbnode); bytes.put_u32(req.rel.relnode); @@ -1583,15 +1602,17 @@ impl PagestreamFeMessage { Self::DbSize(req) => { bytes.put_u8(3); - bytes.put_u64(req.request_lsn.0); - bytes.put_u64(req.not_modified_since.0); + bytes.put_u64(req.hdr.reqid); + bytes.put_u64(req.hdr.request_lsn.0); + bytes.put_u64(req.hdr.not_modified_since.0); bytes.put_u32(req.dbnode); } Self::GetSlruSegment(req) => { bytes.put_u8(4); - bytes.put_u64(req.request_lsn.0); - bytes.put_u64(req.not_modified_since.0); + bytes.put_u64(req.hdr.reqid); + bytes.put_u64(req.hdr.request_lsn.0); + bytes.put_u64(req.hdr.not_modified_since.0); bytes.put_u8(req.kind); bytes.put_u32(req.segno); } @@ -1600,21 +1621,35 @@ impl PagestreamFeMessage { bytes.into() } - pub fn parse(body: &mut R) -> anyhow::Result { + pub fn parse( + body: &mut R, + protocol_version: PagestreamProtocolVersion, + ) -> anyhow::Result { // these correspond to the NeonMessageTag enum in pagestore_client.h // // TODO: consider using protobuf or serde bincode for less error prone // serialization. 
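        // Both protocol versions share the tag + LSNs header. V3 requests carry an
        // explicit 8-byte request id before the two LSNs; V2 requests have none, so
        // the id defaults to 0 when parsing a V2 stream (see the match below).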
let msg_tag = body.read_u8()?; - - // these two fields are the same for every request type - let request_lsn = Lsn::from(body.read_u64::()?); - let not_modified_since = Lsn::from(body.read_u64::()?); + let (reqid, request_lsn, not_modified_since) = match protocol_version { + PagestreamProtocolVersion::V2 => ( + 0, + Lsn::from(body.read_u64::()?), + Lsn::from(body.read_u64::()?), + ), + PagestreamProtocolVersion::V3 => ( + body.read_u64::()?, + Lsn::from(body.read_u64::()?), + Lsn::from(body.read_u64::()?), + ), + }; match msg_tag { 0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest { - request_lsn, - not_modified_since, + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, rel: RelTag { spcnode: body.read_u32::()?, dbnode: body.read_u32::()?, @@ -1623,8 +1658,11 @@ impl PagestreamFeMessage { }, })), 1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest { - request_lsn, - not_modified_since, + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, rel: RelTag { spcnode: body.read_u32::()?, dbnode: body.read_u32::()?, @@ -1633,8 +1671,11 @@ impl PagestreamFeMessage { }, })), 2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest { - request_lsn, - not_modified_since, + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, rel: RelTag { spcnode: body.read_u32::()?, dbnode: body.read_u32::()?, @@ -1644,14 +1685,20 @@ impl PagestreamFeMessage { blkno: body.read_u32::()?, })), 3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest { - request_lsn, - not_modified_since, + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, dbnode: body.read_u32::()?, })), 4 => Ok(PagestreamFeMessage::GetSlruSegment( PagestreamGetSlruSegmentRequest { - request_lsn, - not_modified_since, + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, kind: body.read_u8()?, segno: body.read_u32::()?, }, @@ -1662,43 +1709,114 @@ impl PagestreamFeMessage { } impl PagestreamBeMessage { - pub fn serialize(&self) -> Bytes { + pub fn serialize(&self, protocol_version: PagestreamProtocolVersion) -> Bytes { let mut bytes = BytesMut::new(); use PagestreamBeMessageTag as Tag; - match self { - Self::Exists(resp) => { - bytes.put_u8(Tag::Exists as u8); - bytes.put_u8(resp.exists as u8); - } + match protocol_version { + PagestreamProtocolVersion::V2 => { + match self { + Self::Exists(resp) => { + bytes.put_u8(Tag::Exists as u8); + bytes.put_u8(resp.exists as u8); + } - Self::Nblocks(resp) => { - bytes.put_u8(Tag::Nblocks as u8); - bytes.put_u32(resp.n_blocks); - } + Self::Nblocks(resp) => { + bytes.put_u8(Tag::Nblocks as u8); + bytes.put_u32(resp.n_blocks); + } - Self::GetPage(resp) => { - bytes.put_u8(Tag::GetPage as u8); - bytes.put(&resp.page[..]); - } + Self::GetPage(resp) => { + bytes.put_u8(Tag::GetPage as u8); + bytes.put(&resp.page[..]) + } - Self::Error(resp) => { - bytes.put_u8(Tag::Error as u8); - bytes.put(resp.message.as_bytes()); - bytes.put_u8(0); // null terminator - } - Self::DbSize(resp) => { - bytes.put_u8(Tag::DbSize as u8); - bytes.put_i64(resp.db_size); - } + Self::Error(resp) => { + bytes.put_u8(Tag::Error as u8); + bytes.put(resp.message.as_bytes()); + bytes.put_u8(0); // null terminator + } + Self::DbSize(resp) => { + bytes.put_u8(Tag::DbSize as u8); + bytes.put_i64(resp.db_size); + } - Self::GetSlruSegment(resp) => { - bytes.put_u8(Tag::GetSlruSegment as u8); - bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32); - bytes.put(&resp.segment[..]); + 
Self::GetSlruSegment(resp) => { + bytes.put_u8(Tag::GetSlruSegment as u8); + bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32); + bytes.put(&resp.segment[..]); + } + } + } + PagestreamProtocolVersion::V3 => { + match self { + Self::Exists(resp) => { + bytes.put_u8(Tag::Exists as u8); + bytes.put_u64(resp.req.hdr.reqid); + bytes.put_u64(resp.req.hdr.request_lsn.0); + bytes.put_u64(resp.req.hdr.not_modified_since.0); + bytes.put_u32(resp.req.rel.spcnode); + bytes.put_u32(resp.req.rel.dbnode); + bytes.put_u32(resp.req.rel.relnode); + bytes.put_u8(resp.req.rel.forknum); + bytes.put_u8(resp.exists as u8); + } + + Self::Nblocks(resp) => { + bytes.put_u8(Tag::Nblocks as u8); + bytes.put_u64(resp.req.hdr.reqid); + bytes.put_u64(resp.req.hdr.request_lsn.0); + bytes.put_u64(resp.req.hdr.not_modified_since.0); + bytes.put_u32(resp.req.rel.spcnode); + bytes.put_u32(resp.req.rel.dbnode); + bytes.put_u32(resp.req.rel.relnode); + bytes.put_u8(resp.req.rel.forknum); + bytes.put_u32(resp.n_blocks); + } + + Self::GetPage(resp) => { + bytes.put_u8(Tag::GetPage as u8); + bytes.put_u64(resp.req.hdr.reqid); + bytes.put_u64(resp.req.hdr.request_lsn.0); + bytes.put_u64(resp.req.hdr.not_modified_since.0); + bytes.put_u32(resp.req.rel.spcnode); + bytes.put_u32(resp.req.rel.dbnode); + bytes.put_u32(resp.req.rel.relnode); + bytes.put_u8(resp.req.rel.forknum); + bytes.put_u32(resp.req.blkno); + bytes.put(&resp.page[..]) + } + + Self::Error(resp) => { + bytes.put_u8(Tag::Error as u8); + bytes.put_u64(resp.req.reqid); + bytes.put_u64(resp.req.request_lsn.0); + bytes.put_u64(resp.req.not_modified_since.0); + bytes.put(resp.message.as_bytes()); + bytes.put_u8(0); // null terminator + } + Self::DbSize(resp) => { + bytes.put_u8(Tag::DbSize as u8); + bytes.put_u64(resp.req.hdr.reqid); + bytes.put_u64(resp.req.hdr.request_lsn.0); + bytes.put_u64(resp.req.hdr.not_modified_since.0); + bytes.put_u32(resp.req.dbnode); + bytes.put_i64(resp.db_size); + } + + Self::GetSlruSegment(resp) => { + bytes.put_u8(Tag::GetSlruSegment as u8); + bytes.put_u64(resp.req.hdr.reqid); + bytes.put_u64(resp.req.hdr.request_lsn.0); + bytes.put_u64(resp.req.hdr.not_modified_since.0); + bytes.put_u8(resp.req.kind); + bytes.put_u32(resp.req.segno); + bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32); + bytes.put(&resp.segment[..]); + } + } } } - bytes.into() } @@ -1710,38 +1828,131 @@ impl PagestreamBeMessage { let ok = match Tag::try_from(msg_tag).map_err(|tag: u8| anyhow::anyhow!("invalid tag {tag}"))? { Tag::Exists => { - let exists = buf.read_u8()?; + let reqid = buf.read_u64::()?; + let request_lsn = Lsn(buf.read_u64::()?); + let not_modified_since = Lsn(buf.read_u64::()?); + let rel = RelTag { + spcnode: buf.read_u32::()?, + dbnode: buf.read_u32::()?, + relnode: buf.read_u32::()?, + forknum: buf.read_u8()?, + }; + let exists = buf.read_u8()? 
!= 0; Self::Exists(PagestreamExistsResponse { - exists: exists != 0, + req: PagestreamExistsRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, + rel, + }, + exists, }) } Tag::Nblocks => { + let reqid = buf.read_u64::()?; + let request_lsn = Lsn(buf.read_u64::()?); + let not_modified_since = Lsn(buf.read_u64::()?); + let rel = RelTag { + spcnode: buf.read_u32::()?, + dbnode: buf.read_u32::()?, + relnode: buf.read_u32::()?, + forknum: buf.read_u8()?, + }; let n_blocks = buf.read_u32::()?; - Self::Nblocks(PagestreamNblocksResponse { n_blocks }) + Self::Nblocks(PagestreamNblocksResponse { + req: PagestreamNblocksRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, + rel, + }, + n_blocks, + }) } Tag::GetPage => { + let reqid = buf.read_u64::()?; + let request_lsn = Lsn(buf.read_u64::()?); + let not_modified_since = Lsn(buf.read_u64::()?); + let rel = RelTag { + spcnode: buf.read_u32::()?, + dbnode: buf.read_u32::()?, + relnode: buf.read_u32::()?, + forknum: buf.read_u8()?, + }; + let blkno = buf.read_u32::()?; let mut page = vec![0; 8192]; // TODO: use MaybeUninit buf.read_exact(&mut page)?; - PagestreamBeMessage::GetPage(PagestreamGetPageResponse { page: page.into() }) + Self::GetPage(PagestreamGetPageResponse { + req: PagestreamGetPageRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, + rel, + blkno, + }, + page: page.into(), + }) } Tag::Error => { + let reqid = buf.read_u64::()?; + let request_lsn = Lsn(buf.read_u64::()?); + let not_modified_since = Lsn(buf.read_u64::()?); let mut msg = Vec::new(); buf.read_until(0, &mut msg)?; let cstring = std::ffi::CString::from_vec_with_nul(msg)?; let rust_str = cstring.to_str()?; - PagestreamBeMessage::Error(PagestreamErrorResponse { + Self::Error(PagestreamErrorResponse { + req: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, message: rust_str.to_owned(), }) } Tag::DbSize => { + let reqid = buf.read_u64::()?; + let request_lsn = Lsn(buf.read_u64::()?); + let not_modified_since = Lsn(buf.read_u64::()?); + let dbnode = buf.read_u32::()?; let db_size = buf.read_i64::()?; - Self::DbSize(PagestreamDbSizeResponse { db_size }) + Self::DbSize(PagestreamDbSizeResponse { + req: PagestreamDbSizeRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, + dbnode, + }, + db_size, + }) } Tag::GetSlruSegment => { + let reqid = buf.read_u64::()?; + let request_lsn = Lsn(buf.read_u64::()?); + let not_modified_since = Lsn(buf.read_u64::()?); + let kind = buf.read_u8()?; + let segno = buf.read_u32::()?; let n_blocks = buf.read_u32::()?; let mut segment = vec![0; n_blocks as usize * BLCKSZ as usize]; buf.read_exact(&mut segment)?; Self::GetSlruSegment(PagestreamGetSlruSegmentResponse { + req: PagestreamGetSlruSegmentRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, + kind, + segno, + }, segment: segment.into(), }) } @@ -1780,8 +1991,11 @@ mod tests { // Test serialization/deserialization of PagestreamFeMessage let messages = vec![ PagestreamFeMessage::Exists(PagestreamExistsRequest { - request_lsn: Lsn(4), - not_modified_since: Lsn(3), + hdr: PagestreamRequest { + reqid: 0, + request_lsn: Lsn(4), + not_modified_since: Lsn(3), + }, rel: RelTag { forknum: 1, spcnode: 2, @@ -1790,8 +2004,11 @@ mod tests { }, }), PagestreamFeMessage::Nblocks(PagestreamNblocksRequest { - request_lsn: Lsn(4), - not_modified_since: Lsn(4), + hdr: PagestreamRequest { + reqid: 0, + request_lsn: Lsn(4), 
+ not_modified_since: Lsn(4), + }, rel: RelTag { forknum: 1, spcnode: 2, @@ -1800,8 +2017,11 @@ mod tests { }, }), PagestreamFeMessage::GetPage(PagestreamGetPageRequest { - request_lsn: Lsn(4), - not_modified_since: Lsn(3), + hdr: PagestreamRequest { + reqid: 0, + request_lsn: Lsn(4), + not_modified_since: Lsn(3), + }, rel: RelTag { forknum: 1, spcnode: 2, @@ -1811,14 +2031,19 @@ mod tests { blkno: 7, }), PagestreamFeMessage::DbSize(PagestreamDbSizeRequest { - request_lsn: Lsn(4), - not_modified_since: Lsn(3), + hdr: PagestreamRequest { + reqid: 0, + request_lsn: Lsn(4), + not_modified_since: Lsn(3), + }, dbnode: 7, }), ]; for msg in messages { let bytes = msg.serialize(); - let reconstructed = PagestreamFeMessage::parse(&mut bytes.reader()).unwrap(); + let reconstructed = + PagestreamFeMessage::parse(&mut bytes.reader(), PagestreamProtocolVersion::V3) + .unwrap(); assert!(msg == reconstructed); } } diff --git a/pageserver/client/src/page_service.rs b/pageserver/client/src/page_service.rs index f9507fc47a..207ec4166c 100644 --- a/pageserver/client/src/page_service.rs +++ b/pageserver/client/src/page_service.rs @@ -60,7 +60,7 @@ impl Client { ) -> anyhow::Result { let copy_both: tokio_postgres::CopyBothDuplex = self .client - .copy_both_simple(&format!("pagestream_v2 {tenant_id} {timeline_id}")) + .copy_both_simple(&format!("pagestream_v3 {tenant_id} {timeline_id}")) .await?; let Client { cancel_on_client_drop, diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs index b2df01714d..9f3984f1bd 100644 --- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs +++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs @@ -2,7 +2,7 @@ use anyhow::Context; use camino::Utf8PathBuf; use pageserver_api::key::Key; use pageserver_api::keyspace::KeySpaceAccum; -use pageserver_api::models::PagestreamGetPageRequest; +use pageserver_api::models::{PagestreamGetPageRequest, PagestreamRequest}; use pageserver_api::shard::TenantShardId; use tokio_util::sync::CancellationToken; @@ -322,12 +322,15 @@ async fn main_impl( .to_rel_block() .expect("we filter non-rel-block keys out above"); PagestreamGetPageRequest { - request_lsn: if rng.gen_bool(args.req_latest_probability) { - Lsn::MAX - } else { - r.timeline_lsn + hdr: PagestreamRequest { + reqid: 0, + request_lsn: if rng.gen_bool(args.req_latest_probability) { + Lsn::MAX + } else { + r.timeline_lsn + }, + not_modified_since: r.timeline_lsn, }, - not_modified_since: r.timeline_lsn, rel: rel_tag, blkno: block_no, } diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index e825d538a2..a313a64080 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1854,6 +1854,7 @@ pub(crate) static LIVE_CONNECTIONS: Lazy = Lazy::new(|| { #[derive(Clone, Copy, enum_map::Enum, IntoStaticStr)] pub(crate) enum ComputeCommandKind { + PageStreamV3, PageStreamV2, Basebackup, Fullbackup, diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index d00ec11a76..0c4a1b18f5 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -17,7 +17,7 @@ use pageserver_api::models::{ PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse, PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetSlruSegmentRequest, PagestreamGetSlruSegmentResponse, PagestreamNblocksRequest, PagestreamNblocksResponse, - PagestreamProtocolVersion, + PagestreamProtocolVersion, PagestreamRequest, }; use pageserver_api::shard::TenantShardId; use 
postgres_backend::{ @@ -67,7 +67,7 @@ use crate::tenant::PageReconstructError; use crate::tenant::Timeline; use crate::{basebackup, timed_after_cancellation}; use pageserver_api::key::rel_block_to_key; -use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; +use pageserver_api::reltag::SlruKind; use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID; use postgres_ffi::BLCKSZ; @@ -537,6 +537,23 @@ impl From for QueryError { } } +#[derive(thiserror::Error, Debug)] +struct BatchedPageStreamError { + req: PagestreamRequest, + err: PageStreamError, +} + +impl std::fmt::Display for BatchedPageStreamError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.err.fmt(f) + } +} + +struct BatchedGetPageRequest { + req: PagestreamGetPageRequest, + timer: SmgrOpTimer, +} + enum BatchedFeMessage { Exists { span: Span, @@ -554,7 +571,7 @@ enum BatchedFeMessage { span: Span, shard: timeline::handle::Handle, effective_request_lsn: Lsn, - pages: smallvec::SmallVec<[(RelTag, BlockNumber, SmgrOpTimer); 1]>, + pages: smallvec::SmallVec<[BatchedGetPageRequest; 1]>, }, DbSize { span: Span, @@ -570,7 +587,7 @@ enum BatchedFeMessage { }, RespondError { span: Span, - error: PageStreamError, + error: BatchedPageStreamError, }, } @@ -595,7 +612,7 @@ impl BatchedFeMessage { BatchedFeMessage::GetPage { shard, pages, .. } => ( shard, pages.len(), - itertools::Either::Right(pages.iter_mut().map(|(_, _, timer)| timer)), + itertools::Either::Right(pages.iter_mut().map(|p| &mut p.timer)), ), BatchedFeMessage::RespondError { .. } => return Ok(()), }; @@ -654,6 +671,7 @@ impl PageServerHandler { ) } + #[allow(clippy::too_many_arguments)] async fn pagestream_read_message( pgb: &mut PostgresBackendReader, tenant_id: TenantId, @@ -661,6 +679,7 @@ impl PageServerHandler { timeline_handles: &mut TimelineHandles, cancel: &CancellationToken, ctx: &RequestContext, + protocol_version: PagestreamProtocolVersion, parent_span: Span, ) -> Result, QueryError> where @@ -695,11 +714,12 @@ impl PageServerHandler { fail::fail_point!("ps::handle-pagerequest-message"); // parse request - let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?; + let neon_fe_msg = + PagestreamFeMessage::parse(&mut copy_data_bytes.reader(), protocol_version)?; let batched_msg = match neon_fe_msg { PagestreamFeMessage::Exists(req) => { - let span = tracing::info_span!(parent: parent_span, "handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.request_lsn); + let span = tracing::info_span!(parent: parent_span, "handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn); let shard = timeline_handles .get(tenant_id, timeline_id, ShardSelector::Zero) .instrument(span.clone()) // sets `shard_id` field @@ -715,7 +735,7 @@ impl PageServerHandler { } } PagestreamFeMessage::Nblocks(req) => { - let span = tracing::info_span!(parent: parent_span, "handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.request_lsn); + let span = tracing::info_span!(parent: parent_span, "handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn); let shard = timeline_handles .get(tenant_id, timeline_id, ShardSelector::Zero) .instrument(span.clone()) // sets `shard_id` field @@ -731,7 +751,7 @@ impl PageServerHandler { } } PagestreamFeMessage::DbSize(req) => { - let span = tracing::info_span!(parent: parent_span, "handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.request_lsn); + let span = tracing::info_span!(parent: parent_span, "handle_db_size_request", dbnode = %req.dbnode, 
req_lsn = %req.hdr.request_lsn); let shard = timeline_handles .get(tenant_id, timeline_id, ShardSelector::Zero) .instrument(span.clone()) // sets `shard_id` field @@ -747,7 +767,7 @@ impl PageServerHandler { } } PagestreamFeMessage::GetSlruSegment(req) => { - let span = tracing::info_span!(parent: parent_span, "handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.request_lsn); + let span = tracing::info_span!(parent: parent_span, "handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.hdr.request_lsn); let shard = timeline_handles .get(tenant_id, timeline_id, ShardSelector::Zero) .instrument(span.clone()) // sets `shard_id` field @@ -762,25 +782,23 @@ impl PageServerHandler { req, } } - PagestreamFeMessage::GetPage(PagestreamGetPageRequest { - request_lsn, - not_modified_since, - rel, - blkno, - }) => { - let span = tracing::info_span!(parent: parent_span, "handle_get_page_at_lsn_request_batched", req_lsn = %request_lsn); + PagestreamFeMessage::GetPage(req) => { + let span = tracing::info_span!(parent: parent_span, "handle_get_page_at_lsn_request_batched", req_lsn = %req.hdr.request_lsn); macro_rules! respond_error { ($error:expr) => {{ let error = BatchedFeMessage::RespondError { span, - error: $error, + error: BatchedPageStreamError { + req: req.hdr, + err: $error, + }, }; Ok(Some(error)) }}; } - let key = rel_block_to_key(rel, blkno); + let key = rel_block_to_key(req.rel, req.blkno); let shard = match timeline_handles .get(tenant_id, timeline_id, ShardSelector::Page(key)) .instrument(span.clone()) // sets `shard_id` field @@ -814,8 +832,8 @@ impl PageServerHandler { let effective_request_lsn = match Self::wait_or_get_last_lsn( &shard, - request_lsn, - not_modified_since, + req.hdr.request_lsn, + req.hdr.not_modified_since, &shard.get_latest_gc_cutoff_lsn(), ctx, ) @@ -831,7 +849,7 @@ impl PageServerHandler { span, shard, effective_request_lsn, - pages: smallvec::smallvec![(rel, blkno, timer)], + pages: smallvec::smallvec![BatchedGetPageRequest { req, timer }], } } }; @@ -910,6 +928,7 @@ impl PageServerHandler { pgb_writer: &mut PostgresBackend, batch: BatchedFeMessage, cancel: &CancellationToken, + protocol_version: PagestreamProtocolVersion, ctx: &RequestContext, ) -> Result<(), QueryError> where @@ -917,7 +936,7 @@ impl PageServerHandler { { // invoke handler function let (handler_results, span): ( - Vec>, + Vec>, _, ) = match batch { BatchedFeMessage::Exists { @@ -932,7 +951,8 @@ impl PageServerHandler { .handle_get_rel_exists_request(&shard, &req, ctx) .instrument(span.clone()) .await - .map(|msg| (msg, timer))], + .map(|msg| (msg, timer)) + .map_err(|err| BatchedPageStreamError { err, req: req.hdr })], span, ) } @@ -948,7 +968,8 @@ impl PageServerHandler { .handle_get_nblocks_request(&shard, &req, ctx) .instrument(span.clone()) .await - .map(|msg| (msg, timer))], + .map(|msg| (msg, timer)) + .map_err(|err| BatchedPageStreamError { err, req: req.hdr })], span, ) } @@ -990,7 +1011,8 @@ impl PageServerHandler { .handle_db_size_request(&shard, &req, ctx) .instrument(span.clone()) .await - .map(|msg| (msg, timer))], + .map(|msg| (msg, timer)) + .map_err(|err| BatchedPageStreamError { err, req: req.hdr })], span, ) } @@ -1006,7 +1028,8 @@ impl PageServerHandler { .handle_get_slru_segment_request(&shard, &req, ctx) .instrument(span.clone()) .await - .map(|msg| (msg, timer))], + .map(|msg| (msg, timer)) + .map_err(|err| BatchedPageStreamError { err, req: req.hdr })], span, ) } @@ -1022,7 +1045,7 @@ impl PageServerHandler { 
// Other handler errors are sent back as an error message and we stay in pagestream protocol. for handler_result in handler_results { let (response_msg, timer) = match handler_result { - Err(e) => match &e { + Err(e) => match &e.err { PageStreamError::Shutdown => { // If we fail to fulfil a request during shutdown, which may be _because_ of // shutdown, then do not send the error to the client. Instead just drop the @@ -1041,13 +1064,14 @@ impl PageServerHandler { // print the all details to the log with {:#}, but for the client the // error message is enough. Do not log if shutting down, as the anyhow::Error // here includes cancellation which is not an error. - let full = utils::error::report_compact_sources(&e); + let full = utils::error::report_compact_sources(&e.err); span.in_scope(|| { error!("error reading relation or page version: {full:#}") }); ( PagestreamBeMessage::Error(PagestreamErrorResponse { - message: e.to_string(), + req: e.req, + message: e.err.to_string(), }), None, // TODO: measure errors ) @@ -1060,7 +1084,9 @@ impl PageServerHandler { // marshal & transmit response message // - pgb_writer.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?; + pgb_writer.write_message_noflush(&BeMessage::CopyData( + &response_msg.serialize(protocol_version), + ))?; // We purposefully don't count flush time into the timer. // @@ -1123,7 +1149,7 @@ impl PageServerHandler { pgb: &mut PostgresBackend, tenant_id: TenantId, timeline_id: TimelineId, - _protocol_version: PagestreamProtocolVersion, + protocol_version: PagestreamProtocolVersion, ctx: RequestContext, ) -> Result<(), QueryError> where @@ -1163,6 +1189,7 @@ impl PageServerHandler { timeline_handles, request_span, pipelining_config, + protocol_version, &ctx, ) .await @@ -1175,6 +1202,7 @@ impl PageServerHandler { timeline_id, timeline_handles, request_span, + protocol_version, &ctx, ) .await @@ -1201,6 +1229,7 @@ impl PageServerHandler { timeline_id: TimelineId, mut timeline_handles: TimelineHandles, request_span: Span, + protocol_version: PagestreamProtocolVersion, ctx: &RequestContext, ) -> ( (PostgresBackendReader, TimelineHandles), @@ -1218,6 +1247,7 @@ impl PageServerHandler { &mut timeline_handles, &cancel, ctx, + protocol_version, request_span.clone(), ) .await; @@ -1238,7 +1268,7 @@ impl PageServerHandler { } let err = self - .pagesteam_handle_batched_message(pgb_writer, msg, &cancel, ctx) + .pagesteam_handle_batched_message(pgb_writer, msg, &cancel, protocol_version, ctx) .await; match err { Ok(()) => {} @@ -1261,6 +1291,7 @@ impl PageServerHandler { mut timeline_handles: TimelineHandles, request_span: Span, pipelining_config: PageServicePipeliningConfigPipelined, + protocol_version: PagestreamProtocolVersion, ctx: &RequestContext, ) -> ( (PostgresBackendReader, TimelineHandles), @@ -1358,6 +1389,7 @@ impl PageServerHandler { &mut timeline_handles, &cancel_batcher, &ctx, + protocol_version, request_span.clone(), ) .await; @@ -1403,8 +1435,14 @@ impl PageServerHandler { batch .throttle_and_record_start_processing(&self.cancel) .await?; - self.pagesteam_handle_batched_message(pgb_writer, batch, &cancel, &ctx) - .await?; + self.pagesteam_handle_batched_message( + pgb_writer, + batch, + &cancel, + protocol_version, + &ctx, + ) + .await?; } } }); @@ -1578,8 +1616,8 @@ impl PageServerHandler { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( timeline, - req.request_lsn, - req.not_modified_since, + req.hdr.request_lsn, + req.hdr.not_modified_since, 
&latest_gc_cutoff_lsn, ctx, ) @@ -1590,6 +1628,7 @@ impl PageServerHandler { .await?; Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse { + req: *req, exists, })) } @@ -1604,8 +1643,8 @@ impl PageServerHandler { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( timeline, - req.request_lsn, - req.not_modified_since, + req.hdr.request_lsn, + req.hdr.not_modified_since, &latest_gc_cutoff_lsn, ctx, ) @@ -1616,6 +1655,7 @@ impl PageServerHandler { .await?; Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse { + req: *req, n_blocks, })) } @@ -1630,8 +1670,8 @@ impl PageServerHandler { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( timeline, - req.request_lsn, - req.not_modified_since, + req.hdr.request_lsn, + req.hdr.not_modified_since, &latest_gc_cutoff_lsn, ctx, ) @@ -1643,6 +1683,7 @@ impl PageServerHandler { let db_size = total_blocks as i64 * BLCKSZ as i64; Ok(PagestreamBeMessage::DbSize(PagestreamDbSizeResponse { + req: *req, db_size, })) } @@ -1652,9 +1693,9 @@ impl PageServerHandler { &mut self, timeline: &Timeline, effective_lsn: Lsn, - requests: smallvec::SmallVec<[(RelTag, BlockNumber, SmgrOpTimer); 1]>, + requests: smallvec::SmallVec<[BatchedGetPageRequest; 1]>, ctx: &RequestContext, - ) -> Vec> { + ) -> Vec> { debug_assert_current_span_has_tenant_and_timeline_id(); timeline @@ -1663,7 +1704,7 @@ impl PageServerHandler { let results = timeline .get_rel_page_at_lsn_batched( - requests.iter().map(|(reltag, blkno, _)| (reltag, blkno)), + requests.iter().map(|p| (&p.req.rel, &p.req.blkno)), effective_lsn, ctx, ) @@ -1675,16 +1716,20 @@ impl PageServerHandler { requests .into_iter() .zip(results.into_iter()) - .map(|((_, _, timer), res)| { + .map(|(req, res)| { res.map(|page| { ( PagestreamBeMessage::GetPage(models::PagestreamGetPageResponse { + req: req.req, page, }), - timer, + req.timer, ) }) - .map_err(PageStreamError::from) + .map_err(|e| BatchedPageStreamError { + err: PageStreamError::from(e), + req: req.req.hdr, + }) }), ) } @@ -1699,8 +1744,8 @@ impl PageServerHandler { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( timeline, - req.request_lsn, - req.not_modified_since, + req.hdr.request_lsn, + req.hdr.not_modified_since, &latest_gc_cutoff_lsn, ctx, ) @@ -1711,7 +1756,7 @@ impl PageServerHandler { let segment = timeline.get_slru_segment(kind, req.segno, lsn, ctx).await?; Ok(PagestreamBeMessage::GetSlruSegment( - PagestreamGetSlruSegmentResponse { segment }, + PagestreamGetSlruSegmentResponse { req: *req, segment }, )) } @@ -1906,6 +1951,7 @@ struct FullBackupCmd { struct PageStreamCmd { tenant_id: TenantId, timeline_id: TimelineId, + protocol_version: PagestreamProtocolVersion, } /// `lease lsn tenant timeline lsn` @@ -1926,7 +1972,7 @@ enum PageServiceCmd { } impl PageStreamCmd { - fn parse(query: &str) -> anyhow::Result { + fn parse(query: &str, protocol_version: PagestreamProtocolVersion) -> anyhow::Result { let parameters = query.split_whitespace().collect_vec(); if parameters.len() != 2 { bail!( @@ -1941,6 +1987,7 @@ impl PageStreamCmd { Ok(Self { tenant_id, timeline_id, + protocol_version, }) } } @@ -2078,7 +2125,14 @@ impl PageServiceCmd { bail!("cannot parse query: {query}") }; match cmd.to_ascii_lowercase().as_str() { - "pagestream_v2" => Ok(Self::PageStream(PageStreamCmd::parse(other)?)), + "pagestream_v2" => Ok(Self::PageStream(PageStreamCmd::parse( + other, + PagestreamProtocolVersion::V2, + )?)), 
+ "pagestream_v3" => Ok(Self::PageStream(PageStreamCmd::parse( + other, + PagestreamProtocolVersion::V3, + )?)), "basebackup" => Ok(Self::BaseBackup(BaseBackupCmd::parse(other)?)), "fullbackup" => Ok(Self::FullBackup(FullBackupCmd::parse(other)?)), "lease" => { @@ -2160,25 +2214,21 @@ where PageServiceCmd::PageStream(PageStreamCmd { tenant_id, timeline_id, + protocol_version, }) => { tracing::Span::current() .record("tenant_id", field::display(tenant_id)) .record("timeline_id", field::display(timeline_id)); self.check_permission(Some(tenant_id))?; + let command_kind = match protocol_version { + PagestreamProtocolVersion::V2 => ComputeCommandKind::PageStreamV2, + PagestreamProtocolVersion::V3 => ComputeCommandKind::PageStreamV3, + }; + COMPUTE_COMMANDS_COUNTERS.for_command(command_kind).inc(); - COMPUTE_COMMANDS_COUNTERS - .for_command(ComputeCommandKind::PageStreamV2) - .inc(); - - self.handle_pagerequests( - pgb, - tenant_id, - timeline_id, - PagestreamProtocolVersion::V2, - ctx, - ) - .await?; + self.handle_pagerequests(pgb, tenant_id, timeline_id, protocol_version, ctx) + .await?; } PageServiceCmd::BaseBackup(BaseBackupCmd { tenant_id, @@ -2357,7 +2407,8 @@ mod tests { cmd, PageServiceCmd::PageStream(PageStreamCmd { tenant_id, - timeline_id + timeline_id, + protocol_version: PagestreamProtocolVersion::V2, }) ); let cmd = PageServiceCmd::parse(&format!("basebackup {tenant_id} {timeline_id}")).unwrap(); diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index fa2a570ea8..769befb4e5 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -556,6 +556,9 @@ pageserver_connect(shardno_t shard_no, int elevel) switch (neon_protocol_version) { + case 3: + pagestream_query = psprintf("pagestream_v3 %s %s", neon_tenant, neon_timeline); + break; case 2: pagestream_query = psprintf("pagestream_v2 %s %s", neon_tenant, neon_timeline); break; @@ -1135,9 +1138,9 @@ pg_init_libpagestore(void) "Version of compute<->page server protocol", NULL, &neon_protocol_version, - 2, /* use protocol version 2 */ - 2, /* min */ - 2, /* max */ + 2, /* use protocol version 2 */ + 2, /* min */ + 3, /* max */ PGC_SU_BACKEND, 0, /* no flags required */ NULL, NULL, NULL); diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index f905e3b0fa..37bc4f7886 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -44,10 +44,15 @@ typedef enum T_NeonGetSlruSegmentResponse, } NeonMessageTag; +typedef uint64 NeonRequestId; + /* base struct for c-style inheritance */ typedef struct { NeonMessageTag tag; + NeonRequestId reqid; + XLogRecPtr lsn; + XLogRecPtr not_modified_since; } NeonMessage; #define messageTag(m) (((const NeonMessage *)(m))->tag) @@ -67,6 +72,7 @@ typedef enum { SLRU_MULTIXACT_OFFSETS } SlruKind; + /*-- * supertype of all the Neon*Request structs below. * @@ -87,37 +93,37 @@ typedef enum { * * These structs describe the V2 of these requests. (The old now-defunct V1 * protocol contained just one LSN and a boolean 'latest' flag.) + * + * V3 version of protocol adds request ID to all requests. This request ID is also included in response + * as well as other fields from requests, which allows to verify that we receive response for our request. + * We copy fields from request to response to make checking more reliable: request ID is formed from process ID + * and local counter, so in principle there can be duplicated requests IDs if process PID is reused. 
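 * As a concrete illustration (values made up): a backend with pid 4242 (0x1092) whose
 * local counter stands at 1 issues reqid (0x1092UL << 32) | 1 = 0x0000109200000001; see
 * GENERATE_REQUEST_ID() in pagestore_smgr.c, which puts MyProcPid in the high 32 bits
 * and an incrementing per-backend counter in the low 32 bits.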
*/ -typedef struct -{ - NeonMessageTag tag; - XLogRecPtr lsn; - XLogRecPtr not_modified_since; -} NeonRequest; +typedef NeonMessage NeonRequest; typedef struct { - NeonRequest req; + NeonRequest hdr; NRelFileInfo rinfo; ForkNumber forknum; } NeonExistsRequest; typedef struct { - NeonRequest req; + NeonRequest hdr; NRelFileInfo rinfo; ForkNumber forknum; } NeonNblocksRequest; typedef struct { - NeonRequest req; + NeonRequest hdr; Oid dbNode; } NeonDbSizeRequest; typedef struct { - NeonRequest req; + NeonRequest hdr; NRelFileInfo rinfo; ForkNumber forknum; BlockNumber blkno; @@ -125,32 +131,29 @@ typedef struct typedef struct { - NeonRequest req; - SlruKind kind; - int segno; + NeonRequest hdr; + SlruKind kind; + int segno; } NeonGetSlruSegmentRequest; /* supertype of all the Neon*Response structs below */ -typedef struct -{ - NeonMessageTag tag; -} NeonResponse; +typedef NeonMessage NeonResponse; typedef struct { - NeonMessageTag tag; + NeonExistsRequest req; bool exists; } NeonExistsResponse; typedef struct { - NeonMessageTag tag; + NeonNblocksRequest req; uint32 n_blocks; } NeonNblocksResponse; typedef struct { - NeonMessageTag tag; + NeonGetPageRequest req; char page[FLEXIBLE_ARRAY_MEMBER]; } NeonGetPageResponse; @@ -158,21 +161,21 @@ typedef struct typedef struct { - NeonMessageTag tag; + NeonDbSizeRequest req; int64 db_size; } NeonDbSizeResponse; typedef struct { - NeonMessageTag tag; + NeonResponse req; char message[FLEXIBLE_ARRAY_MEMBER]; /* null-terminated error * message */ } NeonErrorResponse; typedef struct { - NeonMessageTag tag; - int n_blocks; + NeonGetSlruSegmentRequest req; + int n_blocks; char data[BLCKSZ * SLRU_PAGES_PER_SEGMENT]; } NeonGetSlruSegmentResponse; diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index b733807026..7a4c0ef487 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -120,6 +120,9 @@ static bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block static BlockNumber neon_nblocks(SMgrRelation reln, ForkNumber forknum); +static uint32 local_request_counter; +#define GENERATE_REQUEST_ID() (((NeonRequestId)MyProcPid << 32) | ++local_request_counter) + /* * Prefetch implementation: * @@ -188,15 +191,11 @@ typedef struct PrefetchRequest uint8 status; /* see PrefetchStatus for valid values */ uint8 flags; /* see PrefetchRequestFlags */ neon_request_lsns request_lsns; + NeonRequestId reqid; NeonResponse *response; /* may be null */ uint64 my_ring_index; } PrefetchRequest; -StaticAssertDecl(sizeof(PrefetchRequest) == 64, - "We prefer to have a power-of-2 size for this struct. 
Please" - " try to find an alternative solution before reaching to" - " increase the expected size here"); - /* prefetch buffer lookup hash table */ typedef struct PrfHashEntry @@ -365,6 +364,7 @@ compact_prefetch_buffers(void) target_slot->shard_no = source_slot->shard_no; target_slot->status = source_slot->status; target_slot->response = source_slot->response; + target_slot->reqid = source_slot->reqid; target_slot->request_lsns = source_slot->request_lsns; target_slot->my_ring_index = empty_ring_index; @@ -798,7 +798,8 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns uint64 mySlotNo PG_USED_FOR_ASSERTS_ONLY = slot->my_ring_index; NeonGetPageRequest request = { - .req.tag = T_NeonGetPageRequest, + .hdr.tag = T_NeonGetPageRequest, + .hdr.reqid = GENERATE_REQUEST_ID(), /* lsn and not_modified_since are filled in below */ .rinfo = BufTagGetNRelFileInfo(slot->buftag), .forknum = slot->buftag.forkNum, @@ -807,14 +808,16 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns Assert(mySlotNo == MyPState->ring_unused); + slot->reqid = request.hdr.reqid; + if (force_request_lsns) slot->request_lsns = *force_request_lsns; else neon_get_request_lsns(BufTagGetNRelFileInfo(slot->buftag), slot->buftag.forkNum, slot->buftag.blockNum, &slot->request_lsns, 1, NULL); - request.req.lsn = slot->request_lsns.request_lsn; - request.req.not_modified_since = slot->request_lsns.not_modified_since; + request.hdr.lsn = slot->request_lsns.request_lsn; + request.hdr.not_modified_since = slot->request_lsns.not_modified_since; Assert(slot->response == NULL); Assert(slot->my_ring_index == MyPState->ring_unused); @@ -1102,6 +1105,12 @@ Retry: return min_ring_index; } +static bool +equal_requests(NeonRequest* a, NeonRequest* b) +{ + return a->reqid == b->reqid && a->lsn == b->lsn && a->not_modified_since == b->not_modified_since; +} + /* * Note: this function can get canceled and use a long jump to the next catch @@ -1184,6 +1193,10 @@ nm_pack_request(NeonRequest *msg) initStringInfo(&s); pq_sendbyte(&s, msg->tag); + if (neon_protocol_version >= 3) + { + pq_sendint64(&s, msg->reqid); + } pq_sendint64(&s, msg->lsn); pq_sendint64(&s, msg->not_modified_since); @@ -1261,8 +1274,16 @@ NeonResponse * nm_unpack_response(StringInfo s) { NeonMessageTag tag = pq_getmsgbyte(s); + NeonResponse resp_hdr = {0}; /* make valgrind happy */ NeonResponse *resp = NULL; + resp_hdr.tag = tag; + if (neon_protocol_version >= 3) + { + resp_hdr.reqid = pq_getmsgint64(s); + resp_hdr.lsn = pq_getmsgint64(s); + resp_hdr.not_modified_since = pq_getmsgint64(s); + } switch (tag) { /* pagestore -> pagestore_client */ @@ -1270,7 +1291,14 @@ nm_unpack_response(StringInfo s) { NeonExistsResponse *msg_resp = palloc0(sizeof(NeonExistsResponse)); - msg_resp->tag = tag; + if (neon_protocol_version >= 3) + { + NInfoGetSpcOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4); + NInfoGetDbOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4); + NInfoGetRelNumber(msg_resp->req.rinfo) = pq_getmsgint(s, 4); + msg_resp->req.forknum = pq_getmsgbyte(s); + } + msg_resp->req.hdr = resp_hdr; msg_resp->exists = pq_getmsgbyte(s); pq_getmsgend(s); @@ -1282,7 +1310,14 @@ nm_unpack_response(StringInfo s) { NeonNblocksResponse *msg_resp = palloc0(sizeof(NeonNblocksResponse)); - msg_resp->tag = tag; + if (neon_protocol_version >= 3) + { + NInfoGetSpcOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4); + NInfoGetDbOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4); + NInfoGetRelNumber(msg_resp->req.rinfo) = pq_getmsgint(s, 4); + 
msg_resp->req.forknum = pq_getmsgbyte(s); + } + msg_resp->req.hdr = resp_hdr; msg_resp->n_blocks = pq_getmsgint(s, 4); pq_getmsgend(s); @@ -1295,12 +1330,20 @@ nm_unpack_response(StringInfo s) NeonGetPageResponse *msg_resp; msg_resp = MemoryContextAllocZero(MyPState->bufctx, PS_GETPAGERESPONSE_SIZE); - msg_resp->tag = tag; + if (neon_protocol_version >= 3) + { + NInfoGetSpcOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4); + NInfoGetDbOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4); + NInfoGetRelNumber(msg_resp->req.rinfo) = pq_getmsgint(s, 4); + msg_resp->req.forknum = pq_getmsgbyte(s); + msg_resp->req.blkno = pq_getmsgint(s, 4); + } + msg_resp->req.hdr = resp_hdr; /* XXX: should be varlena */ memcpy(msg_resp->page, pq_getmsgbytes(s, BLCKSZ), BLCKSZ); pq_getmsgend(s); - Assert(msg_resp->tag == T_NeonGetPageResponse); + Assert(msg_resp->req.hdr.tag == T_NeonGetPageResponse); resp = (NeonResponse *) msg_resp; break; @@ -1310,7 +1353,11 @@ nm_unpack_response(StringInfo s) { NeonDbSizeResponse *msg_resp = palloc0(sizeof(NeonDbSizeResponse)); - msg_resp->tag = tag; + if (neon_protocol_version >= 3) + { + msg_resp->req.dbNode = pq_getmsgint(s, 4); + } + msg_resp->req.hdr = resp_hdr; msg_resp->db_size = pq_getmsgint64(s); pq_getmsgend(s); @@ -1328,7 +1375,7 @@ nm_unpack_response(StringInfo s) msglen = strlen(msgtext); msg_resp = palloc0(sizeof(NeonErrorResponse) + msglen + 1); - msg_resp->tag = tag; + msg_resp->req = resp_hdr; memcpy(msg_resp->message, msgtext, msglen + 1); pq_getmsgend(s); @@ -1339,9 +1386,17 @@ nm_unpack_response(StringInfo s) case T_NeonGetSlruSegmentResponse: { NeonGetSlruSegmentResponse *msg_resp; - int n_blocks = pq_getmsgint(s, 4); - msg_resp = palloc(sizeof(NeonGetSlruSegmentResponse)); - msg_resp->tag = tag; + int n_blocks; + msg_resp = palloc0(sizeof(NeonGetSlruSegmentResponse)); + + if (neon_protocol_version >= 3) + { + msg_resp->req.kind = pq_getmsgbyte(s); + msg_resp->req.segno = pq_getmsgint(s, 4); + } + msg_resp->req.hdr = resp_hdr; + + n_blocks = pq_getmsgint(s, 4); msg_resp->n_blocks = n_blocks; memcpy(msg_resp->data, pq_getmsgbytes(s, n_blocks * BLCKSZ), n_blocks * BLCKSZ); pq_getmsgend(s); @@ -1386,8 +1441,8 @@ nm_to_string(NeonMessage *msg) appendStringInfoString(&s, "{\"type\": \"NeonExistsRequest\""); appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo)); appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); - appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); - appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since)); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn)); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since)); appendStringInfoChar(&s, '}'); break; } @@ -1399,8 +1454,8 @@ nm_to_string(NeonMessage *msg) appendStringInfoString(&s, "{\"type\": \"NeonNblocksRequest\""); appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo)); appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); - appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); - appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since)); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn)); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since)); appendStringInfoChar(&s, '}'); break; } @@ -1413,8 
+1468,8 @@ nm_to_string(NeonMessage *msg) appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo)); appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); appendStringInfo(&s, ", \"blkno\": %u", msg_req->blkno); - appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); - appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since)); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn)); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since)); appendStringInfoChar(&s, '}'); break; } @@ -1424,8 +1479,8 @@ nm_to_string(NeonMessage *msg) appendStringInfoString(&s, "{\"type\": \"NeonDbSizeRequest\""); appendStringInfo(&s, ", \"dbnode\": \"%u\"", msg_req->dbNode); - appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); - appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since)); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn)); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since)); appendStringInfoChar(&s, '}'); break; } @@ -1436,8 +1491,8 @@ nm_to_string(NeonMessage *msg) appendStringInfoString(&s, "{\"type\": \"NeonGetSlruSegmentRequest\""); appendStringInfo(&s, ", \"kind\": %u", msg_req->kind); appendStringInfo(&s, ", \"segno\": %u", msg_req->segno); - appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); - appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since)); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn)); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since)); appendStringInfoChar(&s, '}'); break; } @@ -2312,39 +2367,64 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum) REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1, NULL); { NeonExistsRequest request = { - .req.tag = T_NeonExistsRequest, - .req.lsn = request_lsns.request_lsn, - .req.not_modified_since = request_lsns.not_modified_since, + .hdr.tag = T_NeonExistsRequest, + .hdr.reqid = GENERATE_REQUEST_ID(), + .hdr.lsn = request_lsns.request_lsn, + .hdr.not_modified_since = request_lsns.not_modified_since, .rinfo = InfoFromSMgrRel(reln), .forknum = forkNum }; resp = page_server_request(&request); + + switch (resp->tag) + { + case T_NeonExistsResponse: + { + NeonExistsResponse* exists_resp = (NeonExistsResponse *) resp; + if (neon_protocol_version >= 3) + { + if (!equal_requests(resp, &request.hdr) || + !RelFileInfoEquals(exists_resp->req.rinfo, request.rinfo) || + exists_resp->req.forknum != request.forknum) + { + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to exits request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(exists_resp->req.rinfo), exists_resp->req.forknum, + request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), RelFileInfoFmt(request.rinfo), request.forknum); + } + } + exists = exists_resp->exists; + break; + } + case T_NeonErrorResponse: + if (neon_protocol_version >= 3) + { + if (!equal_requests(resp, &request.hdr)) + { + elog(WARNING, NEON_TAG "Error message 
{reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match exists request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), + request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since)); + } + } + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg(NEON_TAG "[reqid %lx] could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X", + resp->reqid, + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forkNum, + LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)), + errdetail("page server returned error: %s", + ((NeonErrorResponse *) resp)->message))); + break; + + default: + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Expected Exists (0x%02x) or Error (0x%02x) response to ExistsRequest, but got 0x%02x", + T_NeonExistsResponse, T_NeonErrorResponse, resp->tag); + } + pfree(resp); } - - switch (resp->tag) - { - case T_NeonExistsResponse: - exists = ((NeonExistsResponse *) resp)->exists; - break; - - case T_NeonErrorResponse: - ereport(ERROR, - (errcode(ERRCODE_IO_ERROR), - errmsg(NEON_TAG "could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X", - RelFileInfoFmt(InfoFromSMgrRel(reln)), - forkNum, - LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)), - errdetail("page server returned error: %s", - ((NeonErrorResponse *) resp)->message))); - break; - - default: - NEON_PANIC_CONNECTION_STATE(-1, PANIC, - "Expected Exists (0x%02x) or Error (0x%02x) response to ExistsRequest, but got 0x%02x", - T_NeonExistsResponse, T_NeonErrorResponse, resp->tag); - } - pfree(resp); return exists; } @@ -2952,15 +3032,43 @@ Retry: switch (resp->tag) { case T_NeonGetPageResponse: - memcpy(buffer, ((NeonGetPageResponse *) resp)->page, BLCKSZ); + { + NeonGetPageResponse* getpage_resp = (NeonGetPageResponse *) resp; + if (neon_protocol_version >= 3) + { + if (resp->reqid != slot->reqid || + resp->lsn != slot->request_lsns.request_lsn || + resp->not_modified_since != slot->request_lsns.not_modified_since || + !RelFileInfoEquals(getpage_resp->req.rinfo, rinfo) || + getpage_resp->req.forknum != forkNum || + getpage_resp->req.blkno != base_blockno + i) + { + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u} to get page request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(getpage_resp->req.rinfo), getpage_resp->req.forknum, getpage_resp->req.blkno, + slot->reqid, LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since), RelFileInfoFmt(rinfo), forkNum, base_blockno + i); + } + } + memcpy(buffer, getpage_resp->page, BLCKSZ); lfc_write(rinfo, forkNum, blockno, buffer); break; - + } case T_NeonErrorResponse: + if (neon_protocol_version >= 3) + { + if (resp->reqid != slot->reqid || + resp->lsn != slot->request_lsns.request_lsn || + resp->not_modified_since != slot->request_lsns.not_modified_since) + { + elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), + slot->reqid, LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since)); + } + } ereport(ERROR, (errcode(ERRCODE_IO_ERROR), - errmsg(NEON_TAG "[shard 
%d] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X", - slot->shard_no, blockno, RelFileInfoFmt(rinfo), + errmsg(NEON_TAG "[shard %d, reqid %lx] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X", + slot->shard_no, resp->reqid, blockno, RelFileInfoFmt(rinfo), forkNum, LSN_FORMAT_ARGS(reqlsns->effective_request_lsn)), errdetail("page server returned error: %s", ((NeonErrorResponse *) resp)->message))); @@ -3443,47 +3551,72 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) { NeonNblocksRequest request = { - .req.tag = T_NeonNblocksRequest, - .req.lsn = request_lsns.request_lsn, - .req.not_modified_since = request_lsns.not_modified_since, + .hdr.tag = T_NeonNblocksRequest, + .hdr.reqid = GENERATE_REQUEST_ID(), + .hdr.lsn = request_lsns.request_lsn, + .hdr.not_modified_since = request_lsns.not_modified_since, .rinfo = InfoFromSMgrRel(reln), .forknum = forknum, }; resp = page_server_request(&request); + + switch (resp->tag) + { + case T_NeonNblocksResponse: + { + NeonNblocksResponse * relsize_resp = (NeonNblocksResponse *) resp; + if (neon_protocol_version >= 3) + { + if (!equal_requests(resp, &request.hdr) || + !RelFileInfoEquals(relsize_resp->req.rinfo, request.rinfo) || + relsize_resp->req.forknum != forknum) + { + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(relsize_resp->req.rinfo), relsize_resp->req.forknum, + request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), RelFileInfoFmt(request.rinfo), forknum); + } + } + n_blocks = relsize_resp->n_blocks; + break; + } + case T_NeonErrorResponse: + if (neon_protocol_version >= 3) + { + if (!equal_requests(resp, &request.hdr)) + { + elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), + request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since)); + } + } + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg(NEON_TAG "[reqid %lx] could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X", + resp->reqid, + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forknum, + LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)), + errdetail("page server returned error: %s", + ((NeonErrorResponse *) resp)->message))); + break; + + default: + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Expected Nblocks (0x%02x) or Error (0x%02x) response to NblocksRequest, but got 0x%02x", + T_NeonNblocksResponse, T_NeonErrorResponse, resp->tag); + } + update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks); + + neon_log(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks", + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forknum, + LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), + n_blocks); + + pfree(resp); } - - switch (resp->tag) - { - case T_NeonNblocksResponse: - n_blocks = ((NeonNblocksResponse *) resp)->n_blocks; - break; - - case T_NeonErrorResponse: - ereport(ERROR, - (errcode(ERRCODE_IO_ERROR), - errmsg(NEON_TAG "could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X", - RelFileInfoFmt(InfoFromSMgrRel(reln)), - forknum, - 
LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)), - errdetail("page server returned error: %s", - ((NeonErrorResponse *) resp)->message))); - break; - - default: - NEON_PANIC_CONNECTION_STATE(-1, PANIC, - "Expected Nblocks (0x%02x) or Error (0x%02x) response to NblocksRequest, but got 0x%02x", - T_NeonNblocksResponse, T_NeonErrorResponse, resp->tag); - } - update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks); - - neon_log(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks", - RelFileInfoFmt(InfoFromSMgrRel(reln)), - forknum, - LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), - n_blocks); - - pfree(resp); return n_blocks; } @@ -3503,40 +3636,64 @@ neon_dbsize(Oid dbNode) { NeonDbSizeRequest request = { - .req.tag = T_NeonDbSizeRequest, - .req.lsn = request_lsns.request_lsn, - .req.not_modified_since = request_lsns.not_modified_since, + .hdr.tag = T_NeonDbSizeRequest, + .hdr.reqid = GENERATE_REQUEST_ID(), + .hdr.lsn = request_lsns.request_lsn, + .hdr.not_modified_since = request_lsns.not_modified_since, .dbNode = dbNode, }; resp = page_server_request(&request); + + switch (resp->tag) + { + case T_NeonDbSizeResponse: + { + NeonDbSizeResponse* dbsize_resp = (NeonDbSizeResponse *) resp; + if (neon_protocol_version >= 3) + { + if (!equal_requests(resp, &request.hdr) || + dbsize_resp->req.dbNode != dbNode) + { + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, dbNode=%u} to get DB size request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, dbNode=%u}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), dbsize_resp->req.dbNode, + request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), dbNode); + } + } + db_size = dbsize_resp->db_size; + break; + } + case T_NeonErrorResponse: + if (neon_protocol_version >= 3) + { + if (!equal_requests(resp, &request.hdr)) + { + elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get DB size request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), + request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since)); + } + } + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg(NEON_TAG "[reqid %lx] could not read db size of db %u from page server at lsn %X/%08X", + resp->reqid, + dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)), + errdetail("page server returned error: %s", + ((NeonErrorResponse *) resp)->message))); + break; + + default: + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Expected DbSize (0x%02x) or Error (0x%02x) response to DbSizeRequest, but got 0x%02x", + T_NeonDbSizeResponse, T_NeonErrorResponse, resp->tag); + } + + neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes", + dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), db_size); + + pfree(resp); } - - switch (resp->tag) - { - case T_NeonDbSizeResponse: - db_size = ((NeonDbSizeResponse *) resp)->db_size; - break; - - case T_NeonErrorResponse: - ereport(ERROR, - (errcode(ERRCODE_IO_ERROR), - errmsg(NEON_TAG "could not read db size of db %u from page server at lsn %X/%08X", - dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)), - errdetail("page server returned error: %s", - ((NeonErrorResponse *) resp)->message))); - break; - - default: - NEON_PANIC_CONNECTION_STATE(-1, PANIC, - "Expected DbSize (0x%02x) or Error (0x%02x) response 
to DbSizeRequest, but got 0x%02x", - T_NeonDbSizeResponse, T_NeonErrorResponse, resp->tag); - } - - neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes", - dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), db_size); - - pfree(resp); return db_size; } @@ -3868,16 +4025,17 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf return -1; request = (NeonGetSlruSegmentRequest) { - .req.tag = T_NeonGetSlruSegmentRequest, - .req.lsn = request_lsn, - .req.not_modified_since = not_modified_since, + .hdr.tag = T_NeonGetSlruSegmentRequest, + .hdr.reqid = GENERATE_REQUEST_ID(), + .hdr.lsn = request_lsn, + .hdr.not_modified_since = not_modified_since, .kind = kind, .segno = segno }; do { - while (!page_server->send(shard_no, &request.req) || !page_server->flush(shard_no)); + while (!page_server->send(shard_no, &request.hdr) || !page_server->flush(shard_no)); consume_prefetch_responses(); @@ -3887,14 +4045,38 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf switch (resp->tag) { case T_NeonGetSlruSegmentResponse: - n_blocks = ((NeonGetSlruSegmentResponse *) resp)->n_blocks; - memcpy(buffer, ((NeonGetSlruSegmentResponse *) resp)->data, n_blocks*BLCKSZ); + { + NeonGetSlruSegmentResponse* slru_resp = (NeonGetSlruSegmentResponse *) resp; + if (neon_protocol_version >= 3) + { + if (!equal_requests(resp, &request.hdr) || + slru_resp->req.kind != kind || + slru_resp->req.segno != segno) + { + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%u} to get SLRU segment request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%u}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), slru_resp->req.kind, slru_resp->req.segno, + request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), kind, segno); + } + } + n_blocks = slru_resp->n_blocks; + memcpy(buffer, slru_resp->data, n_blocks*BLCKSZ); break; - + } case T_NeonErrorResponse: + if (neon_protocol_version >= 3) + { + if (!equal_requests(resp, &request.hdr)) + { + elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get SLRU segment request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), + request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since)); + } + } ereport(ERROR, (errcode(ERRCODE_IO_ERROR), - errmsg(NEON_TAG "could not read SLRU %d segment %d at lsn %X/%08X", + errmsg(NEON_TAG "[reqid %lx] could not read SLRU %d segment %d at lsn %X/%08X", + resp->reqid, kind, segno, LSN_FORMAT_ARGS(request_lsn)), @@ -4033,8 +4215,9 @@ neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, NeonResponse *response; NeonNblocksResponse *nbresponse; NeonNblocksRequest request = { - .req = (NeonRequest) { + .hdr = (NeonRequest) { .tag = T_NeonNblocksRequest, + .reqid = GENERATE_REQUEST_ID(), .lsn = end_recptr, .not_modified_since = end_recptr, }, From 640ac4fc9efcdadad442f7dfafe15e7f9d816906 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." 
<4198311+skyzh@users.noreply.github.com> Date: Thu, 9 Jan 2025 09:43:20 -0500 Subject: [PATCH 134/583] fix(pageserver): report timestamp is in the past if the key is missing (#10210) ## Problem If for some reasons we already garbage-collected the data under an LSN but the caller uses a past LSN for the find_time_cutoff function, now we will report a missing key error and GC will never proceed. Note that missing key error can also happen if the key is really missing (i.e., during the past offload incidents) ## Summary of changes Make sure GC proceeds by bumping the LSN. When time_cutoff=None, we will not increase the time_cutoff (it will be set to latest_gc_cutoff). If we really need to bump the GC LSN for maintenance purpose, we need a separate API to do that. Signed-off-by: Alex Chi Z --- pageserver/src/pgdatadir_mapping.rs | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 14c7e0d2f8..b65fe6cf7c 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -627,7 +627,7 @@ impl Timeline { // cannot overflow, high and low are both smaller than u64::MAX / 2 let mid = (high + low) / 2; - let cmp = self + let cmp = match self .is_latest_commit_timestamp_ge_than( search_timestamp, Lsn(mid * 8), @@ -635,7 +635,16 @@ impl Timeline { &mut found_larger, ctx, ) - .await?; + .await + { + Ok(res) => res, + Err(PageReconstructError::MissingKey(e)) => { + warn!("Missing key while find_lsn_for_timestamp. Either we might have already garbage-collected that data or the key is really missing. Last error: {:#}", e); + // Return that we didn't find any requests smaller than the LSN, and logging the error. + return Ok(LsnForTimestamp::Past(min_lsn)); + } + Err(e) => return Err(e), + }; if cmp { high = mid; @@ -643,6 +652,7 @@ impl Timeline { low = mid + 1; } } + // If `found_smaller == true`, `low = t + 1` where `t` is the target LSN, // so the LSN of the last commit record before or at `search_timestamp`. // Remove one from `low` to get `t`. From ac6cca17acae975ab261c9e82c8fb1bc8a06bcce Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 9 Jan 2025 15:33:44 +0000 Subject: [PATCH 135/583] storcon: don't log a heartbeat error during shutdown (#10325) ## Problem Occasionally we see an unexpected error like: ``` ERROR spawn_heartbeat_driver: Failed to update node state 1 after heartbeat round: Shutting down\n') Hint: use scripts/check_allowed_errors.sh to test any new allowed_error you add ``` https://neon-github-public-dev.s3.amazonaws.com/reports/pr-10324/12690404952/index.html#/testresult/63406a0687bf6eca ## Summary of changes - Explicitly handle ApiError::ShuttingDown as a no-op when mutating node status --- storage_controller/src/service.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index fd4ee7fd10..cf2d1ef97b 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -1047,6 +1047,9 @@ impl Service { // on a snapshot of the nodes. tracing::info!("Node {} was not found after heartbeat round", node_id); } + Err(ApiError::ShuttingDown) => { + // No-op: we're shutting down, no need to try and update any nodes' statuses + } Err(err) => { // Transition to active involves reconciling: if a node responds to a heartbeat then // becomes unavailable again, we may get an error here. 
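
For readers skimming the patch above: the fix is just one extra match arm that swallows the shutdown error instead of logging it. A minimal, self-contained sketch of the same pattern (hypothetical `ApiError` and handler types, simplified stand-ins rather than the storage controller's real ones):

```rust
// Sketch only: treat a shutdown error during a heartbeat round as a benign
// no-op instead of logging it as a failure.
#[derive(Debug)]
enum ApiError {
    NotFound,
    ShuttingDown,
    Other(String),
}

fn apply_heartbeat_result(node_id: u64, result: Result<(), ApiError>) {
    match result {
        Ok(()) => {}
        Err(ApiError::NotFound) => {
            // The node may have been deregistered since the heartbeat round started.
            println!("Node {node_id} was not found after heartbeat round");
        }
        Err(ApiError::ShuttingDown) => {
            // No-op: we're shutting down, no need to try and update any nodes' statuses.
        }
        Err(err) => {
            eprintln!("Failed to update node state {node_id} after heartbeat round: {err:?}");
        }
    }
}

fn main() {
    // During shutdown this produces no log line at all, which is what the
    // allowed-errors check in the test harness expects.
    apply_heartbeat_result(1, Err(ApiError::ShuttingDown));
    apply_heartbeat_result(1, Err(ApiError::NotFound));
    apply_heartbeat_result(2, Err(ApiError::Other("connection reset".to_string())));
}
```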
From ad51622568a24b6be53ff165f02d32e004476a73 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 9 Jan 2025 15:34:06 +0000 Subject: [PATCH 136/583] remote_storage: enable Azure connection pooling by default (#10324) ## Problem Initially we defaulted this to zero to reduce risk. We have now been using pooling in staging for some time without issues, so let's make it the default for anyone using this software without setting the config explicitly. Closes: https://github.com/neondatabase/cloud/issues/20971 ## Summary of changes - Set Azure blob storage connection pool size to 8 by default --- libs/remote_storage/src/config.rs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/libs/remote_storage/src/config.rs b/libs/remote_storage/src/config.rs index dd49d4d5e7..49b1d9dc87 100644 --- a/libs/remote_storage/src/config.rs +++ b/libs/remote_storage/src/config.rs @@ -115,13 +115,15 @@ fn default_max_keys_per_list_response() -> Option { } fn default_azure_conn_pool_size() -> usize { - // Conservative default: no connection pooling. At time of writing this is the Azure - // SDK's default as well, due to historic reports of hard-to-reproduce issues + // By default, the Azure SDK does no connection pooling, due to historic reports of hard-to-reproduce issues // (https://github.com/hyperium/hyper/issues/2312) // // However, using connection pooling is important to avoid exhausting client ports when // doing huge numbers of requests (https://github.com/neondatabase/cloud/issues/20971) - 0 + // + // We therefore enable a modest pool size by default: this may be configured to zero if + // issues like the alleged upstream hyper issue appear. + 8 } impl Debug for S3Config { From bebc46e71336104a3282e9f2497c2889583caffa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 9 Jan 2025 16:55:02 +0100 Subject: [PATCH 137/583] Add scheduling_policy column to safekeepers table (#10205) Add a `scheduling_policy` column to the safekeepers table of the storage controller. 
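To make the intended round-trip behaviour of the new type concrete, here is a minimal usage sketch (illustrative only, not part of this patch's diff; it assumes the in-repo `pageserver_api` and `anyhow` crates are available as dependencies):

```rust
use std::str::FromStr;

use pageserver_api::controller_api::SkSchedulingPolicy;

fn main() -> anyhow::Result<()> {
    // New safekeeper rows default to 'disabled' (see the up.sql migration below).
    let policy = SkSchedulingPolicy::from_str("disabled")?;
    assert_eq!(policy, SkSchedulingPolicy::Disabled);

    // The lower-case string form is what gets persisted in the safekeepers table
    // and what `storcon_cli` prints in its "Scheduling" column.
    assert_eq!(String::from(policy), "disabled");

    // Unknown values are rejected rather than silently coerced.
    assert!(SkSchedulingPolicy::from_str("paused").is_err());

    Ok(())
}
```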
Part of #9981 --- control_plane/storcon_cli/src/main.rs | 13 +++- libs/pageserver_api/src/controller_api.rs | 33 +++++++++ .../down.sql | 1 + .../up.sql | 1 + storage_controller/src/http.rs | 4 +- storage_controller/src/persistence.rs | 69 ++++++++++++++----- storage_controller/src/schema.rs | 1 + storage_controller/src/service.rs | 9 ++- .../regress/test_storage_controller.py | 2 +- 9 files changed, 106 insertions(+), 27 deletions(-) create mode 100644 storage_controller/migrations/2024-12-12-212515_safekeepers_scheduling_policy/down.sql create mode 100644 storage_controller/migrations/2024-12-12-212515_safekeepers_scheduling_policy/up.sql diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 6ee1044c18..617b2cd1ba 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -1035,7 +1035,15 @@ async fn main() -> anyhow::Result<()> { resp.sort_by(|a, b| a.id.cmp(&b.id)); let mut table = comfy_table::Table::new(); - table.set_header(["Id", "Version", "Host", "Port", "Http Port", "AZ Id"]); + table.set_header([ + "Id", + "Version", + "Host", + "Port", + "Http Port", + "AZ Id", + "Scheduling", + ]); for sk in resp { table.add_row([ format!("{}", sk.id), @@ -1043,7 +1051,8 @@ async fn main() -> anyhow::Result<()> { sk.host, format!("{}", sk.port), format!("{}", sk.http_port), - sk.availability_zone_id.to_string(), + sk.availability_zone_id.clone(), + String::from(sk.scheduling_policy), ]); } println!("{table}"); diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index faf11e487c..7eb3547183 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -320,6 +320,38 @@ impl From for String { } } +#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)] +pub enum SkSchedulingPolicy { + Active, + Disabled, + Decomissioned, +} + +impl FromStr for SkSchedulingPolicy { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + Ok(match s { + "active" => Self::Active, + "disabled" => Self::Disabled, + "decomissioned" => Self::Decomissioned, + _ => return Err(anyhow::anyhow!("Unknown scheduling state '{s}'")), + }) + } +} + +impl From for String { + fn from(value: SkSchedulingPolicy) -> String { + use SkSchedulingPolicy::*; + match value { + Active => "active", + Disabled => "disabled", + Decomissioned => "decomissioned", + } + .to_string() + } +} + /// Controls how tenant shards are mapped to locations on pageservers, e.g. whether /// to create secondary locations. 
#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq)] @@ -387,6 +419,7 @@ pub struct SafekeeperDescribeResponse { pub port: i32, pub http_port: i32, pub availability_zone_id: String, + pub scheduling_policy: SkSchedulingPolicy, } #[cfg(test)] diff --git a/storage_controller/migrations/2024-12-12-212515_safekeepers_scheduling_policy/down.sql b/storage_controller/migrations/2024-12-12-212515_safekeepers_scheduling_policy/down.sql new file mode 100644 index 0000000000..e26bff798f --- /dev/null +++ b/storage_controller/migrations/2024-12-12-212515_safekeepers_scheduling_policy/down.sql @@ -0,0 +1 @@ +ALTER TABLE safekeepers DROP scheduling_policy; diff --git a/storage_controller/migrations/2024-12-12-212515_safekeepers_scheduling_policy/up.sql b/storage_controller/migrations/2024-12-12-212515_safekeepers_scheduling_policy/up.sql new file mode 100644 index 0000000000..d83cc6cc46 --- /dev/null +++ b/storage_controller/migrations/2024-12-12-212515_safekeepers_scheduling_policy/up.sql @@ -0,0 +1 @@ +ALTER TABLE safekeepers ADD scheduling_policy VARCHAR NOT NULL DEFAULT 'disabled'; diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 24fd4c341a..5385e4ee0b 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -3,7 +3,7 @@ use crate::metrics::{ HttpRequestLatencyLabelGroup, HttpRequestStatusLabelGroup, PageserverRequestLabelGroup, METRICS_REGISTRY, }; -use crate::persistence::SafekeeperPersistence; +use crate::persistence::SafekeeperUpsert; use crate::reconciler::ReconcileError; use crate::service::{LeadershipStatus, Service, RECONCILE_TIMEOUT, STARTUP_RECONCILE_TIMEOUT}; use anyhow::Context; @@ -1249,7 +1249,7 @@ async fn handle_get_safekeeper(req: Request) -> Result, Api async fn handle_upsert_safekeeper(mut req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Infra)?; - let body = json_request::(&mut req).await?; + let body = json_request::(&mut req).await?; let id = parse_request_param::(&req, "id")?; if id != body.id { diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index c5eb106f24..cebf3e9594 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -13,6 +13,7 @@ use pageserver_api::controller_api::AvailabilityZone; use pageserver_api::controller_api::MetadataHealthRecord; use pageserver_api::controller_api::SafekeeperDescribeResponse; use pageserver_api::controller_api::ShardSchedulingPolicy; +use pageserver_api::controller_api::SkSchedulingPolicy; use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy}; use pageserver_api::models::TenantConfig; use pageserver_api::shard::ShardConfigError; @@ -1075,12 +1076,14 @@ impl Persistence { pub(crate) async fn safekeeper_upsert( &self, - record: SafekeeperPersistence, + record: SafekeeperUpsert, ) -> Result<(), DatabaseError> { use crate::schema::safekeepers::dsl::*; self.with_conn(move |conn| -> DatabaseResult<()> { - let bind = record.as_insert_or_update(); + let bind = record + .as_insert_or_update() + .map_err(|e| DatabaseError::Logical(format!("{e}")))?; let inserted_updated = diesel::insert_into(safekeepers) .values(&bind) @@ -1243,6 +1246,7 @@ pub(crate) struct ControllerPersistence { pub(crate) started_at: chrono::DateTime, } +// What we store in the database #[derive(Serialize, Deserialize, Queryable, Selectable, Eq, PartialEq, Debug, Clone)] #[diesel(table_name = crate::schema::safekeepers)] pub(crate) struct SafekeeperPersistence { @@ -1257,11 +1261,51 @@ 
pub(crate) struct SafekeeperPersistence { pub(crate) active: bool, pub(crate) http_port: i32, pub(crate) availability_zone_id: String, + pub(crate) scheduling_policy: String, } impl SafekeeperPersistence { - fn as_insert_or_update(&self) -> InsertUpdateSafekeeper<'_> { - InsertUpdateSafekeeper { + pub(crate) fn as_describe_response(&self) -> Result { + let scheduling_policy = + SkSchedulingPolicy::from_str(&self.scheduling_policy).map_err(|e| { + DatabaseError::Logical(format!("can't construct SkSchedulingPolicy: {e:?}")) + })?; + // omit the `active` flag on purpose: it is deprecated. + Ok(SafekeeperDescribeResponse { + id: NodeId(self.id as u64), + region_id: self.region_id.clone(), + version: self.version, + host: self.host.clone(), + port: self.port, + http_port: self.http_port, + availability_zone_id: self.availability_zone_id.clone(), + scheduling_policy, + }) + } +} + +/// What we expect from the upsert http api +#[derive(Serialize, Deserialize, Eq, PartialEq, Debug, Clone)] +pub(crate) struct SafekeeperUpsert { + pub(crate) id: i64, + pub(crate) region_id: String, + /// 1 is special, it means just created (not currently posted to storcon). + /// Zero or negative is not really expected. + /// Otherwise the number from `release-$(number_of_commits_on_branch)` tag. + pub(crate) version: i64, + pub(crate) host: String, + pub(crate) port: i32, + pub(crate) active: bool, + pub(crate) http_port: i32, + pub(crate) availability_zone_id: String, +} + +impl SafekeeperUpsert { + fn as_insert_or_update(&self) -> anyhow::Result> { + if self.version < 0 { + anyhow::bail!("negative version: {}", self.version); + } + Ok(InsertUpdateSafekeeper { id: self.id, region_id: &self.region_id, version: self.version, @@ -1270,19 +1314,9 @@ impl SafekeeperPersistence { active: self.active, http_port: self.http_port, availability_zone_id: &self.availability_zone_id, - } - } - pub(crate) fn as_describe_response(&self) -> SafekeeperDescribeResponse { - // omit the `active` flag on purpose: it is deprecated. - SafekeeperDescribeResponse { - id: NodeId(self.id as u64), - region_id: self.region_id.clone(), - version: self.version, - host: self.host.clone(), - port: self.port, - http_port: self.http_port, - availability_zone_id: self.availability_zone_id.clone(), - } + // None means a wish to not update this column. We expose abilities to update it via other means. + scheduling_policy: None, + }) } } @@ -1297,4 +1331,5 @@ struct InsertUpdateSafekeeper<'a> { active: bool, http_port: i32, availability_zone_id: &'a str, + scheduling_policy: Option<&'a str>, } diff --git a/storage_controller/src/schema.rs b/storage_controller/src/schema.rs index 9e005ab932..44c91619ab 100644 --- a/storage_controller/src/schema.rs +++ b/storage_controller/src/schema.rs @@ -39,6 +39,7 @@ diesel::table! { active -> Bool, http_port -> Int4, availability_zone_id -> Text, + scheduling_policy -> Varchar, } } diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index cf2d1ef97b..265b2798d2 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -7350,13 +7350,12 @@ impl Service { pub(crate) async fn safekeepers_list( &self, ) -> Result, DatabaseError> { - Ok(self - .persistence + self.persistence .list_safekeepers() .await? 
.into_iter() .map(|v| v.as_describe_response()) - .collect::>()) + .collect::, _>>() } pub(crate) async fn get_safekeeper( @@ -7366,12 +7365,12 @@ impl Service { self.persistence .safekeeper_get(id) .await - .map(|v| v.as_describe_response()) + .and_then(|v| v.as_describe_response()) } pub(crate) async fn upsert_safekeeper( &self, - record: crate::persistence::SafekeeperPersistence, + record: crate::persistence::SafekeeperUpsert, ) -> Result<(), DatabaseError> { self.persistence.safekeeper_upsert(record).await } diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 207f55a214..da6d5b8622 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -3019,7 +3019,7 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder): def eq_safekeeper_records(a: dict[str, Any], b: dict[str, Any]) -> bool: compared = [dict(a), dict(b)] - masked_keys = ["created_at", "updated_at", "active"] + masked_keys = ["created_at", "updated_at", "active", "scheduling_policy"] for d in compared: # keep deleting these in case we are comparing the body as it will be uploaded by real scripts From f37eeb56addb3c7946ac6299cd7e415f4dc47a9d Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Thu, 9 Jan 2025 17:39:53 +0100 Subject: [PATCH 138/583] fix(compute_ctl): Resolve issues with dropping roles having dangling permissions (#10299) ## Problem In Postgres, one cannot drop a role if it has any dependent objects in the DB. In `compute_ctl`, we automatically reassign all dependent objects in every DB to the corresponding DB owner. Yet, it seems that it doesn't help with some implicit permissions. The issue is reproduced by installing a `postgis` extension because it creates some views and tables in the public schema. ## Summary of changes Added a repro test without using a `postgis`: i) create a role via `compute_ctl` (with `neon_superuser` grant); ii) create a test role, a table in schema public, and grant permissions via the role in `neon_superuser`. To fix the issue, I added a new `compute_ctl` code that removes such dangling permissions before dropping the role. It's done in the least invasive way, i.e., only touches the schema public, because i) that's the problem we had with PostGIS; ii) it creates a smaller chance of messing anything up and getting a stuck operation again, just for a different reason. Properly, any API-based catalog operations should fail gracefully and provide an actionable error and status code to the control plane, allowing the latter to unwind the operation and propagate an error message and hint to the user. In this sense, it's aligned with another feature request https://github.com/neondatabase/cloud/issues/21611 Resolve neondatabase/cloud#13582 --- compute_tools/src/spec_apply.rs | 14 ++- .../sql/pre_drop_role_revoke_privileges.sql | 28 +++++ test_runner/regress/test_compute_catalog.py | 115 ++++++++++++++++++ 3 files changed, 156 insertions(+), 1 deletion(-) create mode 100644 compute_tools/src/sql/pre_drop_role_revoke_privileges.sql diff --git a/compute_tools/src/spec_apply.rs b/compute_tools/src/spec_apply.rs index 695a722d6d..7401de2e60 100644 --- a/compute_tools/src/spec_apply.rs +++ b/compute_tools/src/spec_apply.rs @@ -75,7 +75,7 @@ pub struct MutableApplyContext { pub dbs: HashMap, } -/// Appply the operations that belong to the given spec apply phase. +/// Apply the operations that belong to the given spec apply phase. 
/// /// Commands within a single phase are executed in order of Iterator yield. /// Commands of ApplySpecPhase::RunInEachDatabase will execute in the database @@ -498,7 +498,19 @@ async fn get_operations<'a>( ), comment: None, }, + // Revoke some potentially blocking privileges (Neon-specific currently) + Operation { + query: format!( + include_str!("sql/pre_drop_role_revoke_privileges.sql"), + role_name = quoted, + ), + comment: None, + }, // This now will only drop privileges of the role + // TODO: this is obviously not 100% true because of the above case, + // there could be still some privileges that are not revoked. Maybe this + // only drops privileges that were granted *by this* role, not *to this* role, + // but this has to be checked. Operation { query: format!("DROP OWNED BY {}", quoted), comment: None, diff --git a/compute_tools/src/sql/pre_drop_role_revoke_privileges.sql b/compute_tools/src/sql/pre_drop_role_revoke_privileges.sql new file mode 100644 index 0000000000..cdaa7071d3 --- /dev/null +++ b/compute_tools/src/sql/pre_drop_role_revoke_privileges.sql @@ -0,0 +1,28 @@ +SET SESSION ROLE neon_superuser; + +DO $$ +DECLARE + schema TEXT; + revoke_query TEXT; +BEGIN + FOR schema IN + SELECT schema_name + FROM information_schema.schemata + -- So far, we only had issues with 'public' schema. Probably, because we do some additional grants, + -- e.g., make DB owner the owner of 'public' schema automatically (when created via API). + -- See https://github.com/neondatabase/cloud/issues/13582 for the context. + -- Still, keep the loop because i) it efficiently handles the case when there is no 'public' schema, + -- ii) it's easy to add more schemas to the list if needed. + WHERE schema_name IN ('public') + LOOP + revoke_query := format( + 'REVOKE ALL PRIVILEGES ON ALL TABLES IN SCHEMA %I FROM {role_name} GRANTED BY neon_superuser;', + schema + ); + + EXECUTE revoke_query; + END LOOP; +END; +$$; + +RESET ROLE; diff --git a/test_runner/regress/test_compute_catalog.py b/test_runner/regress/test_compute_catalog.py index e411aad97d..f0878b2631 100644 --- a/test_runner/regress/test_compute_catalog.py +++ b/test_runner/regress/test_compute_catalog.py @@ -250,3 +250,118 @@ def test_dropdb_with_subscription(neon_simple_env: NeonEnv): assert curr_db is not None assert len(curr_db) == 1 assert curr_db[0] == "publisher_db" + + +def test_compute_drop_role(neon_simple_env: NeonEnv): + """ + Test that compute_ctl can drop a role even if it has some depending objects + like permissions in one of the databases. + Reproduction test for https://github.com/neondatabase/cloud/issues/13582 + """ + env = neon_simple_env + TEST_DB_NAME = "db_with_permissions" + + endpoint = env.endpoints.create_start("main") + + endpoint.respec_deep( + **{ + "skip_pg_catalog_updates": False, + "cluster": { + "roles": [ + { + # We need to create role via compute_ctl, because in this case it will receive + # additional grants equivalent to our real environment, so we can repro some + # issues. + "name": "neon", + # Some autocomplete-suggested hash, no specific meaning. + "encrypted_password": "SCRAM-SHA-256$4096:hBT22QjqpydQWqEulorfXA==$miBogcoj68JWYdsNB5PW1X6PjSLBEcNuctuhtGkb4PY=:hxk2gxkwxGo6P7GCtfpMlhA9zwHvPMsCz+NQf2HfvWk=", + "options": [], + }, + ], + "databases": [ + { + "name": TEST_DB_NAME, + "owner": "neon", + }, + ], + }, + } + ) + endpoint.reconfigure() + + with endpoint.cursor(dbname=TEST_DB_NAME) as cursor: + # Create table and view as `cloud_admin`. 
This is the case when, for example, + # PostGIS extensions creates tables in `public` schema. + cursor.execute("create table test_table (id int)") + cursor.execute("create view test_view as select * from test_table") + + with endpoint.cursor(dbname=TEST_DB_NAME, user="neon") as cursor: + cursor.execute("create role readonly") + # We (`compute_ctl`) make 'neon' the owner of schema 'public' in the owned database. + # Postgres has all sorts of permissions and grants that we may not handle well, + # but this is the shortest repro grant for the issue + # https://github.com/neondatabase/cloud/issues/13582 + cursor.execute("grant select on all tables in schema public to readonly") + + # Check that role was created + with endpoint.cursor() as cursor: + cursor.execute("SELECT rolname FROM pg_roles WHERE rolname = 'readonly'") + role = cursor.fetchone() + assert role is not None + + # Confirm that we actually have some permissions for 'readonly' role + # that may block our ability to drop the role. + with endpoint.cursor(dbname=TEST_DB_NAME) as cursor: + cursor.execute( + "select grantor from information_schema.role_table_grants where grantee = 'readonly'" + ) + res = cursor.fetchall() + assert len(res) == 2, f"Expected 2 table grants, got {len(res)}" + for row in res: + assert row[0] == "neon_superuser" + + # Drop role via compute_ctl + endpoint.respec_deep( + **{ + "skip_pg_catalog_updates": False, + "delta_operations": [ + { + "action": "delete_role", + "name": "readonly", + }, + ], + } + ) + endpoint.reconfigure() + + # Check that role is dropped + with endpoint.cursor() as cursor: + cursor.execute("SELECT rolname FROM pg_roles WHERE rolname = 'readonly'") + role = cursor.fetchone() + assert role is None + + # + # Drop schema 'public' and check that we can still drop the role + # + with endpoint.cursor(dbname=TEST_DB_NAME) as cursor: + cursor.execute("create role readonly2") + cursor.execute("grant select on all tables in schema public to readonly2") + cursor.execute("drop schema public cascade") + + endpoint.respec_deep( + **{ + "skip_pg_catalog_updates": False, + "delta_operations": [ + { + "action": "delete_role", + "name": "readonly2", + }, + ], + } + ) + endpoint.reconfigure() + + with endpoint.cursor() as cursor: + cursor.execute("SELECT rolname FROM pg_roles WHERE rolname = 'readonly2'") + role = cursor.fetchone() + assert role is None From 99b5a6705f1dfa41a6cc7ae60b74b2f769d0eb85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 9 Jan 2025 19:29:09 +0100 Subject: [PATCH 139/583] Update rust to 1.84.0 (#10328) We keep the practice of keeping the compiler up to date, pointing to the latest release. This is done by many other projects in the Rust ecosystem as well. [Release notes](https://releases.rs/docs/1.84.0/). Prior update was in #9926. 
--- build-tools.Dockerfile | 2 +- rust-toolchain.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile index fa84e467ad..79210a2e1b 100644 --- a/build-tools.Dockerfile +++ b/build-tools.Dockerfile @@ -258,7 +258,7 @@ WORKDIR /home/nonroot # Rust # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`) -ENV RUSTC_VERSION=1.83.0 +ENV RUSTC_VERSION=1.84.0 ENV RUSTUP_HOME="/home/nonroot/.rustup" ENV PATH="/home/nonroot/.cargo/bin:${PATH}" ARG RUSTFILT_VERSION=0.2.1 diff --git a/rust-toolchain.toml b/rust-toolchain.toml index f0661a32e0..06746d3e1d 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] -channel = "1.83.0" +channel = "1.84.0" profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. # https://rust-lang.github.io/rustup/concepts/profiles.html From 49756a0d01530c6df00b6095da15ad05027c149d Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Thu, 9 Jan 2025 14:08:26 -0600 Subject: [PATCH 140/583] Implement compute_ctl management API in Axum (#10099) This is a refactor to create better abstractions related to our management server. It cleans up the code, and prepares everything for authorized communication to and from the control plane. Signed-off-by: Tristan Partin --- Cargo.lock | 105 ++- Cargo.toml | 6 +- compute_tools/Cargo.toml | 5 +- compute_tools/src/bin/compute_ctl.rs | 9 +- compute_tools/src/catalog.rs | 6 +- compute_tools/src/http/api.rs | 606 ------------------ compute_tools/src/http/extract/json.rs | 48 ++ compute_tools/src/http/extract/mod.rs | 7 + compute_tools/src/http/extract/path.rs | 48 ++ compute_tools/src/http/extract/query.rs | 48 ++ compute_tools/src/http/mod.rs | 57 +- compute_tools/src/http/openapi_spec.yaml | 2 +- .../src/http/routes/check_writability.rs | 20 + compute_tools/src/http/routes/configure.rs | 91 +++ .../src/http/routes/database_schema.rs | 34 + .../src/http/routes/dbs_and_roles.rs | 16 + .../src/http/routes/extension_server.rs | 67 ++ compute_tools/src/http/routes/extensions.rs | 45 ++ compute_tools/src/http/routes/failpoints.rs | 35 + compute_tools/src/http/routes/grants.rs | 48 ++ compute_tools/src/http/routes/info.rs | 11 + compute_tools/src/http/routes/insights.rs | 18 + .../src/http/routes/installed_extensions.rs | 33 + compute_tools/src/http/routes/metrics.rs | 32 + compute_tools/src/http/routes/metrics_json.rs | 12 + compute_tools/src/http/routes/mod.rs | 38 ++ compute_tools/src/http/routes/status.rs | 14 + compute_tools/src/http/routes/terminate.rs | 58 ++ compute_tools/src/http/server.rs | 165 +++++ compute_tools/src/lib.rs | 2 - control_plane/src/endpoint.rs | 4 +- libs/compute_api/src/responses.rs | 23 +- workspace_hack/Cargo.toml | 3 +- 33 files changed, 1065 insertions(+), 651 deletions(-) delete mode 100644 compute_tools/src/http/api.rs create mode 100644 compute_tools/src/http/extract/json.rs create mode 100644 compute_tools/src/http/extract/mod.rs create mode 100644 compute_tools/src/http/extract/path.rs create mode 100644 compute_tools/src/http/extract/query.rs create mode 100644 compute_tools/src/http/routes/check_writability.rs create mode 100644 compute_tools/src/http/routes/configure.rs create mode 100644 compute_tools/src/http/routes/database_schema.rs create mode 100644 compute_tools/src/http/routes/dbs_and_roles.rs create mode 100644 compute_tools/src/http/routes/extension_server.rs create mode 100644 
compute_tools/src/http/routes/extensions.rs create mode 100644 compute_tools/src/http/routes/failpoints.rs create mode 100644 compute_tools/src/http/routes/grants.rs create mode 100644 compute_tools/src/http/routes/info.rs create mode 100644 compute_tools/src/http/routes/insights.rs create mode 100644 compute_tools/src/http/routes/installed_extensions.rs create mode 100644 compute_tools/src/http/routes/metrics.rs create mode 100644 compute_tools/src/http/routes/metrics_json.rs create mode 100644 compute_tools/src/http/routes/mod.rs create mode 100644 compute_tools/src/http/routes/status.rs create mode 100644 compute_tools/src/http/routes/terminate.rs create mode 100644 compute_tools/src/http/server.rs diff --git a/Cargo.lock b/Cargo.lock index 9e0e343996..44143fa0da 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -718,13 +718,13 @@ dependencies = [ [[package]] name = "axum" -version = "0.7.5" +version = "0.7.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a6c9af12842a67734c9a2e355436e5d03b22383ed60cf13cd0c18fbfe3dcbcf" +checksum = "edca88bc138befd0323b20752846e6587272d3b03b0343c8ea28a6f819e6e71f" dependencies = [ "async-trait", "axum-core", - "base64 0.21.1", + "base64 0.22.1", "bytes", "futures-util", "http 1.1.0", @@ -746,8 +746,8 @@ dependencies = [ "sha1", "sync_wrapper 1.0.1", "tokio", - "tokio-tungstenite", - "tower", + "tokio-tungstenite 0.24.0", + "tower 0.5.2", "tower-layer", "tower-service", "tracing", @@ -1267,6 +1267,7 @@ dependencies = [ "aws-config", "aws-sdk-kms", "aws-sdk-s3", + "axum", "base64 0.13.1", "bytes", "camino", @@ -1277,7 +1278,7 @@ dependencies = [ "fail", "flate2", "futures", - "hyper 0.14.30", + "http 1.1.0", "metrics", "nix 0.27.1", "notify", @@ -1303,6 +1304,8 @@ dependencies = [ "tokio-postgres", "tokio-stream", "tokio-util", + "tower 0.5.2", + "tower-http", "tracing", "tracing-opentelemetry", "tracing-subscriber", @@ -2720,7 +2723,7 @@ dependencies = [ "pin-project-lite", "socket2", "tokio", - "tower", + "tower 0.4.13", "tower-service", "tracing", ] @@ -3260,9 +3263,9 @@ checksum = "b87248edafb776e59e6ee64a79086f65890d3510f2c656c000bf2a7e8a0aea40" [[package]] name = "matchit" -version = "0.8.2" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "540f1c43aed89909c0cc0cc604e3bb2f7e7a341a3728a9e6cfe760e733cd11ed" +checksum = "47e1ffaa40ddd1f3ed91f717a33c8c0ee23fff369e3aa8772b9605cc1d22f4c3" [[package]] name = "md-5" @@ -4758,7 +4761,7 @@ dependencies = [ "tokio-postgres", "tokio-postgres2", "tokio-rustls 0.26.0", - "tokio-tungstenite", + "tokio-tungstenite 0.21.0", "tokio-util", "tracing", "tracing-subscriber", @@ -5186,7 +5189,7 @@ dependencies = [ "async-trait", "getrandom 0.2.11", "http 1.1.0", - "matchit 0.8.2", + "matchit 0.8.4", "opentelemetry", "reqwest", "reqwest-middleware", @@ -6800,7 +6803,19 @@ dependencies = [ "futures-util", "log", "tokio", - "tungstenite", + "tungstenite 0.21.0", +] + +[[package]] +name = "tokio-tungstenite" +version = "0.24.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edc5f74e248dc973e0dbb7b74c7e0d6fcc301c694ff50049504004ef4d0cdcd9" +dependencies = [ + "futures-util", + "log", + "tokio", + "tungstenite 0.24.0", ] [[package]] @@ -6881,7 +6896,7 @@ dependencies = [ "tokio", "tokio-rustls 0.26.0", "tokio-stream", - "tower", + "tower 0.4.13", "tower-layer", "tower-service", "tracing", @@ -6922,16 +6937,49 @@ dependencies = [ ] [[package]] -name = "tower-layer" -version = "0.3.2" +name = "tower" +version = "0.5.2" source 
= "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c20c8dbed6283a09604c3e69b4b7eeb54e298b8a600d4d5ecb5ad39de609f1d0" +checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9" +dependencies = [ + "futures-core", + "futures-util", + "pin-project-lite", + "sync_wrapper 1.0.1", + "tokio", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tower-http" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "403fa3b783d4b626a8ad51d766ab03cb6d2dbfc46b1c5d4448395e6628dc9697" +dependencies = [ + "bitflags 2.4.1", + "bytes", + "http 1.1.0", + "http-body 1.0.0", + "pin-project-lite", + "tower-layer", + "tower-service", + "tracing", + "uuid", +] + +[[package]] +name = "tower-layer" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e" [[package]] name = "tower-service" -version = "0.3.2" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" +checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3" [[package]] name = "tracing" @@ -7086,6 +7134,24 @@ dependencies = [ "utf-8", ] +[[package]] +name = "tungstenite" +version = "0.24.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "18e5b8366ee7a95b16d32197d0b2604b43a0be89dc5fac9f8e96ccafbaedda8a" +dependencies = [ + "byteorder", + "bytes", + "data-encoding", + "http 1.1.0", + "httparse", + "log", + "rand 0.8.5", + "sha1", + "thiserror", + "utf-8", +] + [[package]] name = "twox-hash" version = "1.6.3" @@ -7867,7 +7933,8 @@ dependencies = [ "tokio-util", "toml_edit", "tonic", - "tower", + "tower 0.4.13", + "tower 0.5.2", "tracing", "tracing-core", "url", diff --git a/Cargo.toml b/Cargo.toml index 197808d5ae..39898e1c8d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -65,7 +65,7 @@ aws-smithy-types = "1.2" aws-credential-types = "1.2.0" aws-sigv4 = { version = "1.2", features = ["sign-http"] } aws-types = "1.3" -axum = { version = "0.7.5", features = ["ws"] } +axum = { version = "0.7.9", features = ["ws"] } base64 = "0.13.0" bincode = "1.3" bindgen = "0.70" @@ -187,7 +187,9 @@ tokio-util = { version = "0.7.10", features = ["io", "rt"] } toml = "0.8" toml_edit = "0.22" tonic = {version = "0.12.3", features = ["tls", "tls-roots"]} -tower-service = "0.3.2" +tower = { version = "0.5.2", default-features = false } +tower-http = { version = "0.6.2", features = ["request-id", "trace"] } +tower-service = "0.3.3" tracing = "0.1" tracing-error = "0.2" tracing-opentelemetry = "0.27" diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 9525b27818..33892813c4 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -15,6 +15,7 @@ aws-config.workspace = true aws-sdk-s3.workspace = true aws-sdk-kms.workspace = true anyhow.workspace = true +axum = { workspace = true, features = [] } camino.workspace = true chrono.workspace = true cfg-if.workspace = true @@ -22,7 +23,7 @@ clap.workspace = true fail.workspace = true flate2.workspace = true futures.workspace = true -hyper0 = { workspace = true, features = ["full"] } +http.workspace = true metrics.workspace = true nix.workspace = true notify.workspace = true @@ -37,6 +38,8 @@ serde_with.workspace = true serde_json.workspace = true signal-hook.workspace = true tar.workspace = true +tower.workspace = true +tower-http.workspace = 
true reqwest = { workspace = true, features = ["json"] } tokio = { workspace = true, features = ["rt", "rt-multi-thread"] } tokio-postgres.workspace = true diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 6ede5fdceb..04432ad0f3 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -60,7 +60,7 @@ use compute_tools::compute::{ }; use compute_tools::configurator::launch_configurator; use compute_tools::extension_server::get_pg_version_string; -use compute_tools::http::api::launch_http_server; +use compute_tools::http::launch_http_server; use compute_tools::logger::*; use compute_tools::monitor::launch_monitor; use compute_tools::params::*; @@ -493,7 +493,10 @@ fn start_postgres( let mut pg = None; if !prestartup_failed { pg = match compute.start_compute() { - Ok(pg) => Some(pg), + Ok(pg) => { + info!(postmaster_pid = %pg.0.id(), "Postgres was started"); + Some(pg) + } Err(err) => { error!("could not start the compute node: {:#}", err); compute.set_failed_status(err); @@ -591,6 +594,8 @@ fn wait_postgres(pg: Option) -> Result { // propagate to Postgres and it will be shut down as well. let mut exit_code = None; if let Some((mut pg, logs_handle)) = pg { + info!(postmaster_pid = %pg.id(), "Waiting for Postgres to exit"); + let ecode = pg .wait() .expect("failed to start waiting on Postgres process"); diff --git a/compute_tools/src/catalog.rs b/compute_tools/src/catalog.rs index 72198a9479..4a297cfacf 100644 --- a/compute_tools/src/catalog.rs +++ b/compute_tools/src/catalog.rs @@ -36,11 +36,11 @@ pub async fn get_dbs_and_roles(compute: &Arc) -> anyhow::Result ComputeStatusResponse { - ComputeStatusResponse { - start_time: state.start_time, - tenant: state - .pspec - .as_ref() - .map(|pspec| pspec.tenant_id.to_string()), - timeline: state - .pspec - .as_ref() - .map(|pspec| pspec.timeline_id.to_string()), - status: state.status, - last_active: state.last_active, - error: state.error.clone(), - } -} - -// Service function to handle all available routes. -async fn routes(req: Request, compute: &Arc) -> Response { - // - // NOTE: The URI path is currently included in traces. That's OK because - // it doesn't contain any variable parts or sensitive information. But - // please keep that in mind if you change the routing here. - // - match (req.method(), req.uri().path()) { - // Serialized compute state. - (&Method::GET, "/status") => { - debug!("serving /status GET request"); - let state = compute.state.lock().unwrap(); - let status_response = status_response_from_state(&state); - Response::new(Body::from(serde_json::to_string(&status_response).unwrap())) - } - - // Startup metrics in JSON format. Keep /metrics reserved for a possible - // future use for Prometheus metrics format. - (&Method::GET, "/metrics.json") => { - info!("serving /metrics.json GET request"); - let metrics = compute.state.lock().unwrap().metrics.clone(); - Response::new(Body::from(serde_json::to_string(&metrics).unwrap())) - } - - // Prometheus metrics - (&Method::GET, "/metrics") => { - debug!("serving /metrics GET request"); - - // When we call TextEncoder::encode() below, it will immediately - // return an error if a metric family has no metrics, so we need to - // preemptively filter out metric families with no metrics. 
- let metrics = installed_extensions::collect() - .into_iter() - .filter(|m| !m.get_metric().is_empty()) - .collect::>(); - - let encoder = TextEncoder::new(); - let mut buffer = vec![]; - - if let Err(err) = encoder.encode(&metrics, &mut buffer) { - let msg = format!("error handling /metrics request: {err}"); - error!(msg); - return render_json_error(&msg, StatusCode::INTERNAL_SERVER_ERROR); - } - - match Response::builder() - .status(StatusCode::OK) - .header(CONTENT_TYPE, encoder.format_type()) - .body(Body::from(buffer)) - { - Ok(response) => response, - Err(err) => { - let msg = format!("error handling /metrics request: {err}"); - error!(msg); - render_json_error(&msg, StatusCode::INTERNAL_SERVER_ERROR) - } - } - } - // Collect Postgres current usage insights - (&Method::GET, "/insights") => { - info!("serving /insights GET request"); - let status = compute.get_status(); - if status != ComputeStatus::Running { - let msg = format!("compute is not running, current status: {:?}", status); - error!(msg); - return Response::new(Body::from(msg)); - } - - let insights = compute.collect_insights().await; - Response::new(Body::from(insights)) - } - - (&Method::POST, "/check_writability") => { - info!("serving /check_writability POST request"); - let status = compute.get_status(); - if status != ComputeStatus::Running { - let msg = format!( - "invalid compute status for check_writability request: {:?}", - status - ); - error!(msg); - return Response::new(Body::from(msg)); - } - - let res = crate::checker::check_writability(compute).await; - match res { - Ok(_) => Response::new(Body::from("true")), - Err(e) => { - error!("check_writability failed: {}", e); - Response::new(Body::from(e.to_string())) - } - } - } - - (&Method::POST, "/extensions") => { - info!("serving /extensions POST request"); - let status = compute.get_status(); - if status != ComputeStatus::Running { - let msg = format!( - "invalid compute status for extensions request: {:?}", - status - ); - error!(msg); - return render_json_error(&msg, StatusCode::PRECONDITION_FAILED); - } - - let request = hyper::body::to_bytes(req.into_body()).await.unwrap(); - let request = serde_json::from_slice::(&request).unwrap(); - let res = compute - .install_extension(&request.extension, &request.database, request.version) - .await; - match res { - Ok(version) => render_json(Body::from( - serde_json::to_string(&ExtensionInstallResult { - extension: request.extension, - version, - }) - .unwrap(), - )), - Err(e) => { - error!("install_extension failed: {}", e); - render_json_error(&e.to_string(), StatusCode::INTERNAL_SERVER_ERROR) - } - } - } - - (&Method::GET, "/info") => { - let num_cpus = num_cpus::get_physical(); - info!("serving /info GET request. num_cpus: {}", num_cpus); - Response::new(Body::from( - serde_json::json!({ - "num_cpus": num_cpus, - }) - .to_string(), - )) - } - - // Accept spec in JSON format and request compute configuration. If - // anything goes wrong after we set the compute status to `ConfigurationPending` - // and update compute state with new spec, we basically leave compute - // in the potentially wrong state. That said, it's control-plane's - // responsibility to watch compute state after reconfiguration request - // and to clean restart in case of errors. 
- (&Method::POST, "/configure") => { - info!("serving /configure POST request"); - match handle_configure_request(req, compute).await { - Ok(msg) => Response::new(Body::from(msg)), - Err((msg, code)) => { - error!("error handling /configure request: {msg}"); - render_json_error(&msg, code) - } - } - } - - (&Method::POST, "/terminate") => { - info!("serving /terminate POST request"); - match handle_terminate_request(compute).await { - Ok(()) => Response::new(Body::empty()), - Err((msg, code)) => { - error!("error handling /terminate request: {msg}"); - render_json_error(&msg, code) - } - } - } - - (&Method::GET, "/dbs_and_roles") => { - info!("serving /dbs_and_roles GET request",); - match get_dbs_and_roles(compute).await { - Ok(res) => render_json(Body::from(serde_json::to_string(&res).unwrap())), - Err(_) => { - render_json_error("can't get dbs and roles", StatusCode::INTERNAL_SERVER_ERROR) - } - } - } - - (&Method::GET, "/database_schema") => { - let database = match must_get_query_param(&req, "database") { - Err(e) => return e.into_response(), - Ok(database) => database, - }; - info!("serving /database_schema GET request with database: {database}",); - match get_database_schema(compute, &database).await { - Ok(res) => render_plain(Body::wrap_stream(res)), - Err(SchemaDumpError::DatabaseDoesNotExist) => { - render_json_error("database does not exist", StatusCode::NOT_FOUND) - } - Err(e) => { - error!("can't get schema dump: {}", e); - render_json_error("can't get schema dump", StatusCode::INTERNAL_SERVER_ERROR) - } - } - } - - (&Method::POST, "/grants") => { - info!("serving /grants POST request"); - let status = compute.get_status(); - if status != ComputeStatus::Running { - let msg = format!( - "invalid compute status for set_role_grants request: {:?}", - status - ); - error!(msg); - return render_json_error(&msg, StatusCode::PRECONDITION_FAILED); - } - - let request = hyper::body::to_bytes(req.into_body()).await.unwrap(); - let request = serde_json::from_slice::(&request).unwrap(); - - let res = compute - .set_role_grants( - &request.database, - &request.schema, - &request.privileges, - &request.role, - ) - .await; - match res { - Ok(()) => render_json(Body::from( - serde_json::to_string(&SetRoleGrantsResponse { - database: request.database, - schema: request.schema, - role: request.role, - privileges: request.privileges, - }) - .unwrap(), - )), - Err(e) => render_json_error( - &format!("could not grant role privileges to the schema: {e}"), - // TODO: can we filter on role/schema not found errors - // and return appropriate error code? 
- StatusCode::INTERNAL_SERVER_ERROR, - ), - } - } - - // get the list of installed extensions - // currently only used in python tests - // TODO: call it from cplane - (&Method::GET, "/installed_extensions") => { - info!("serving /installed_extensions GET request"); - let status = compute.get_status(); - if status != ComputeStatus::Running { - let msg = format!( - "invalid compute status for extensions request: {:?}", - status - ); - error!(msg); - return Response::new(Body::from(msg)); - } - - let conf = compute.get_conn_conf(None); - let res = - task::spawn_blocking(move || installed_extensions::get_installed_extensions(conf)) - .await - .unwrap(); - - match res { - Ok(res) => render_json(Body::from(serde_json::to_string(&res).unwrap())), - Err(e) => render_json_error( - &format!("could not get list of installed extensions: {}", e), - StatusCode::INTERNAL_SERVER_ERROR, - ), - } - } - - (&Method::POST, "/failpoints") if cfg!(feature = "testing") => { - match failpoints_handler(req, CancellationToken::new()).await { - Ok(r) => r, - Err(ApiError::BadRequest(e)) => { - render_json_error(&e.to_string(), StatusCode::BAD_REQUEST) - } - Err(_) => { - render_json_error("Internal server error", StatusCode::INTERNAL_SERVER_ERROR) - } - } - } - - // download extension files from remote extension storage on demand - (&Method::POST, route) if route.starts_with("/extension_server/") => { - info!("serving {:?} POST request", route); - info!("req.uri {:?}", req.uri()); - - // don't even try to download extensions - // if no remote storage is configured - if compute.ext_remote_storage.is_none() { - info!("no extensions remote storage configured"); - let mut resp = Response::new(Body::from("no remote storage configured")); - *resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR; - return resp; - } - - let mut is_library = false; - if let Some(params) = req.uri().query() { - info!("serving {:?} POST request with params: {}", route, params); - if params == "is_library=true" { - is_library = true; - } else { - let mut resp = Response::new(Body::from("Wrong request parameters")); - *resp.status_mut() = StatusCode::BAD_REQUEST; - return resp; - } - } - let filename = route.split('/').last().unwrap().to_string(); - info!("serving /extension_server POST request, filename: {filename:?} is_library: {is_library}"); - - // get ext_name and path from spec - // don't lock compute_state for too long - let ext = { - let compute_state = compute.state.lock().unwrap(); - let pspec = compute_state.pspec.as_ref().expect("spec must be set"); - let spec = &pspec.spec; - - // debug only - info!("spec: {:?}", spec); - - let remote_extensions = match spec.remote_extensions.as_ref() { - Some(r) => r, - None => { - info!("no remote extensions spec was provided"); - let mut resp = Response::new(Body::from("no remote storage configured")); - *resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR; - return resp; - } - }; - - remote_extensions.get_ext( - &filename, - is_library, - &compute.build_tag, - &compute.pgversion, - ) - }; - - match ext { - Ok((ext_name, ext_path)) => { - match compute.download_extension(ext_name, ext_path).await { - Ok(_) => Response::new(Body::from("OK")), - Err(e) => { - error!("extension download failed: {}", e); - let mut resp = Response::new(Body::from(e.to_string())); - *resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR; - resp - } - } - } - Err(e) => { - warn!("extension download failed to find extension: {}", e); - let mut resp = Response::new(Body::from("failed to find file")); - 
*resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR; - resp - } - } - } - - // Return the `404 Not Found` for any other routes. - _ => { - let mut not_found = Response::new(Body::from("404 Not Found")); - *not_found.status_mut() = StatusCode::NOT_FOUND; - not_found - } - } -} - -async fn handle_configure_request( - req: Request, - compute: &Arc, -) -> Result { - if !compute.live_config_allowed { - return Err(( - "live configuration is not allowed for this compute node".to_string(), - StatusCode::PRECONDITION_FAILED, - )); - } - - let body_bytes = hyper::body::to_bytes(req.into_body()).await.unwrap(); - let spec_raw = String::from_utf8(body_bytes.to_vec()).unwrap(); - if let Ok(request) = serde_json::from_str::(&spec_raw) { - let spec = request.spec; - - let parsed_spec = match ParsedSpec::try_from(spec) { - Ok(ps) => ps, - Err(msg) => return Err((msg, StatusCode::BAD_REQUEST)), - }; - - // XXX: wrap state update under lock in code blocks. Otherwise, - // we will try to `Send` `mut state` into the spawned thread - // bellow, which will cause error: - // ``` - // error: future cannot be sent between threads safely - // ``` - { - let mut state = compute.state.lock().unwrap(); - if state.status != ComputeStatus::Empty && state.status != ComputeStatus::Running { - let msg = format!( - "invalid compute status for configuration request: {:?}", - state.status.clone() - ); - return Err((msg, StatusCode::PRECONDITION_FAILED)); - } - state.pspec = Some(parsed_spec); - state.set_status(ComputeStatus::ConfigurationPending, &compute.state_changed); - drop(state); - info!("set new spec and notified waiters"); - } - - // Spawn a blocking thread to wait for compute to become Running. - // This is needed to do not block the main pool of workers and - // be able to serve other requests while some particular request - // is waiting for compute to finish configuration. - let c = compute.clone(); - task::spawn_blocking(move || { - let mut state = c.state.lock().unwrap(); - while state.status != ComputeStatus::Running { - state = c.state_changed.wait(state).unwrap(); - info!( - "waiting for compute to become Running, current status: {:?}", - state.status - ); - - if state.status == ComputeStatus::Failed { - let err = state.error.as_ref().map_or("unknown error", |x| x); - let msg = format!("compute configuration failed: {:?}", err); - return Err((msg, StatusCode::INTERNAL_SERVER_ERROR)); - } - } - - Ok(()) - }) - .await - .unwrap()?; - - // Return current compute state if everything went well. 
- let state = compute.state.lock().unwrap().clone(); - let status_response = status_response_from_state(&state); - Ok(serde_json::to_string(&status_response).unwrap()) - } else { - Err(("invalid spec".to_string(), StatusCode::BAD_REQUEST)) - } -} - -fn render_json_error(e: &str, status: StatusCode) -> Response { - let error = GenericAPIError { - error: e.to_string(), - }; - Response::builder() - .status(status) - .header(CONTENT_TYPE, "application/json") - .body(Body::from(serde_json::to_string(&error).unwrap())) - .unwrap() -} - -fn render_json(body: Body) -> Response { - Response::builder() - .header(CONTENT_TYPE, "application/json") - .body(body) - .unwrap() -} - -fn render_plain(body: Body) -> Response { - Response::builder() - .header(CONTENT_TYPE, "text/plain") - .body(body) - .unwrap() -} - -async fn handle_terminate_request(compute: &Arc) -> Result<(), (String, StatusCode)> { - { - let mut state = compute.state.lock().unwrap(); - if state.status == ComputeStatus::Terminated { - return Ok(()); - } - if state.status != ComputeStatus::Empty && state.status != ComputeStatus::Running { - let msg = format!( - "invalid compute status for termination request: {}", - state.status - ); - return Err((msg, StatusCode::PRECONDITION_FAILED)); - } - state.set_status(ComputeStatus::TerminationPending, &compute.state_changed); - drop(state); - } - - forward_termination_signal(); - info!("sent signal and notified waiters"); - - // Spawn a blocking thread to wait for compute to become Terminated. - // This is needed to do not block the main pool of workers and - // be able to serve other requests while some particular request - // is waiting for compute to finish configuration. - let c = compute.clone(); - task::spawn_blocking(move || { - let mut state = c.state.lock().unwrap(); - while state.status != ComputeStatus::Terminated { - state = c.state_changed.wait(state).unwrap(); - info!( - "waiting for compute to become {}, current status: {:?}", - ComputeStatus::Terminated, - state.status - ); - } - - Ok(()) - }) - .await - .unwrap()?; - info!("terminated Postgres"); - Ok(()) -} - -// Main Hyper HTTP server function that runs it and blocks waiting on it forever. -#[tokio::main] -async fn serve(port: u16, state: Arc) { - // this usually binds to both IPv4 and IPv6 on linux - // see e.g. https://github.com/rust-lang/rust/pull/34440 - let addr = SocketAddr::new(IpAddr::from(Ipv6Addr::UNSPECIFIED), port); - - let make_service = make_service_fn(move |_conn| { - let state = state.clone(); - async move { - Ok::<_, Infallible>(service_fn(move |req: Request| { - let state = state.clone(); - async move { - Ok::<_, Infallible>( - // NOTE: We include the URI path in the string. It - // doesn't contain any variable parts or sensitive - // information in this API. - tracing_utils::http::tracing_handler( - req, - |req| routes(req, &state), - OtelName::UriPath, - ) - .await, - ) - } - })) - } - }); - - info!("starting HTTP server on {}", addr); - - let server = Server::bind(&addr).serve(make_service); - - // Run this server forever - if let Err(e) = server.await { - error!("server error: {}", e); - } -} - -/// Launch a separate Hyper HTTP API server thread and return its `JoinHandle`. -pub fn launch_http_server(port: u16, state: &Arc) -> Result> { - let state = Arc::clone(state); - - Ok(thread::Builder::new() - .name("http-endpoint".into()) - .spawn(move || serve(port, state))?) 
-} diff --git a/compute_tools/src/http/extract/json.rs b/compute_tools/src/http/extract/json.rs new file mode 100644 index 0000000000..41f13625ad --- /dev/null +++ b/compute_tools/src/http/extract/json.rs @@ -0,0 +1,48 @@ +use std::ops::{Deref, DerefMut}; + +use axum::{ + async_trait, + extract::{rejection::JsonRejection, FromRequest, Request}, +}; +use compute_api::responses::GenericAPIError; +use http::StatusCode; + +/// Custom `Json` extractor, so that we can format errors into +/// `JsonResponse`. +#[derive(Debug, Clone, Copy, Default)] +pub(crate) struct Json(pub T); + +#[async_trait] +impl FromRequest for Json +where + axum::Json: FromRequest, + S: Send + Sync, +{ + type Rejection = (StatusCode, axum::Json); + + async fn from_request(req: Request, state: &S) -> Result { + match axum::Json::::from_request(req, state).await { + Ok(value) => Ok(Self(value.0)), + Err(rejection) => Err(( + rejection.status(), + axum::Json(GenericAPIError { + error: rejection.body_text().to_lowercase(), + }), + )), + } + } +} + +impl Deref for Json { + type Target = T; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut for Json { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} diff --git a/compute_tools/src/http/extract/mod.rs b/compute_tools/src/http/extract/mod.rs new file mode 100644 index 0000000000..1b690e444d --- /dev/null +++ b/compute_tools/src/http/extract/mod.rs @@ -0,0 +1,7 @@ +pub(crate) mod json; +pub(crate) mod path; +pub(crate) mod query; + +pub(crate) use json::Json; +pub(crate) use path::Path; +pub(crate) use query::Query; diff --git a/compute_tools/src/http/extract/path.rs b/compute_tools/src/http/extract/path.rs new file mode 100644 index 0000000000..95edc657f2 --- /dev/null +++ b/compute_tools/src/http/extract/path.rs @@ -0,0 +1,48 @@ +use std::ops::{Deref, DerefMut}; + +use axum::{ + async_trait, + extract::{rejection::PathRejection, FromRequestParts}, +}; +use compute_api::responses::GenericAPIError; +use http::{request::Parts, StatusCode}; + +/// Custom `Path` extractor, so that we can format errors into +/// `JsonResponse`. +#[derive(Debug, Clone, Copy, Default)] +pub(crate) struct Path(pub T); + +#[async_trait] +impl FromRequestParts for Path +where + axum::extract::Path: FromRequestParts, + S: Send + Sync, +{ + type Rejection = (StatusCode, axum::Json); + + async fn from_request_parts(parts: &mut Parts, state: &S) -> Result { + match axum::extract::Path::::from_request_parts(parts, state).await { + Ok(value) => Ok(Self(value.0)), + Err(rejection) => Err(( + rejection.status(), + axum::Json(GenericAPIError { + error: rejection.body_text().to_ascii_lowercase(), + }), + )), + } + } +} + +impl Deref for Path { + type Target = T; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut for Path { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} diff --git a/compute_tools/src/http/extract/query.rs b/compute_tools/src/http/extract/query.rs new file mode 100644 index 0000000000..a1f1b0cef0 --- /dev/null +++ b/compute_tools/src/http/extract/query.rs @@ -0,0 +1,48 @@ +use std::ops::{Deref, DerefMut}; + +use axum::{ + async_trait, + extract::{rejection::QueryRejection, FromRequestParts}, +}; +use compute_api::responses::GenericAPIError; +use http::{request::Parts, StatusCode}; + +/// Custom `Query` extractor, so that we can format errors into +/// `JsonResponse`. 
+#[derive(Debug, Clone, Copy, Default)] +pub(crate) struct Query(pub T); + +#[async_trait] +impl FromRequestParts for Query +where + axum::extract::Query: FromRequestParts, + S: Send + Sync, +{ + type Rejection = (StatusCode, axum::Json); + + async fn from_request_parts(parts: &mut Parts, state: &S) -> Result { + match axum::extract::Query::::from_request_parts(parts, state).await { + Ok(value) => Ok(Self(value.0)), + Err(rejection) => Err(( + rejection.status(), + axum::Json(GenericAPIError { + error: rejection.body_text().to_ascii_lowercase(), + }), + )), + } + } +} + +impl Deref for Query { + type Target = T; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut for Query { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} diff --git a/compute_tools/src/http/mod.rs b/compute_tools/src/http/mod.rs index e5fdf85eed..a596bea504 100644 --- a/compute_tools/src/http/mod.rs +++ b/compute_tools/src/http/mod.rs @@ -1 +1,56 @@ -pub mod api; +use axum::{body::Body, response::Response}; +use compute_api::responses::{ComputeStatus, GenericAPIError}; +use http::{header::CONTENT_TYPE, StatusCode}; +use serde::Serialize; +use tracing::error; + +pub use server::launch_http_server; + +mod extract; +mod routes; +mod server; + +/// Convenience response builder for JSON responses +struct JsonResponse; + +impl JsonResponse { + /// Helper for actually creating a response + fn create_response(code: StatusCode, body: impl Serialize) -> Response { + Response::builder() + .status(code) + .header(CONTENT_TYPE.as_str(), "application/json") + .body(Body::from(serde_json::to_string(&body).unwrap())) + .unwrap() + } + + /// Create a successful error response + pub(self) fn success(code: StatusCode, body: impl Serialize) -> Response { + assert!({ + let code = code.as_u16(); + + (200..300).contains(&code) + }); + + Self::create_response(code, body) + } + + /// Create an error response + pub(self) fn error(code: StatusCode, error: impl ToString) -> Response { + assert!(code.as_u16() >= 400); + + let message = error.to_string(); + error!(message); + + Self::create_response(code, &GenericAPIError { error: message }) + } + + /// Create an error response related to the compute being in an invalid state + pub(self) fn invalid_status(status: ComputeStatus) -> Response { + Self::create_response( + StatusCode::PRECONDITION_FAILED, + &GenericAPIError { + error: format!("invalid compute status: {status}"), + }, + ) + } +} diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml index 24a67cac71..50319cdd85 100644 --- a/compute_tools/src/http/openapi_spec.yaml +++ b/compute_tools/src/http/openapi_spec.yaml @@ -37,7 +37,7 @@ paths: schema: $ref: "#/components/schemas/ComputeMetrics" - /metrics + /metrics: get: tags: - Info diff --git a/compute_tools/src/http/routes/check_writability.rs b/compute_tools/src/http/routes/check_writability.rs new file mode 100644 index 0000000000..d7feb055e9 --- /dev/null +++ b/compute_tools/src/http/routes/check_writability.rs @@ -0,0 +1,20 @@ +use std::sync::Arc; + +use axum::{extract::State, response::Response}; +use compute_api::responses::ComputeStatus; +use http::StatusCode; + +use crate::{checker::check_writability, compute::ComputeNode, http::JsonResponse}; + +/// Check that the compute is currently running. 
+pub(in crate::http) async fn is_writable(State(compute): State>) -> Response { + let status = compute.get_status(); + if status != ComputeStatus::Running { + return JsonResponse::invalid_status(status); + } + + match check_writability(&compute).await { + Ok(_) => JsonResponse::success(StatusCode::OK, true), + Err(e) => JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e), + } +} diff --git a/compute_tools/src/http/routes/configure.rs b/compute_tools/src/http/routes/configure.rs new file mode 100644 index 0000000000..2546cbc344 --- /dev/null +++ b/compute_tools/src/http/routes/configure.rs @@ -0,0 +1,91 @@ +use std::sync::Arc; + +use axum::{extract::State, response::Response}; +use compute_api::{ + requests::ConfigurationRequest, + responses::{ComputeStatus, ComputeStatusResponse}, +}; +use http::StatusCode; +use tokio::task; +use tracing::info; + +use crate::{ + compute::{ComputeNode, ParsedSpec}, + http::{extract::Json, JsonResponse}, +}; + +// Accept spec in JSON format and request compute configuration. If anything +// goes wrong after we set the compute status to `ConfigurationPending` and +// update compute state with new spec, we basically leave compute in the +// potentially wrong state. That said, it's control-plane's responsibility to +// watch compute state after reconfiguration request and to clean restart in +// case of errors. +pub(in crate::http) async fn configure( + State(compute): State>, + request: Json, +) -> Response { + if !compute.live_config_allowed { + return JsonResponse::error( + StatusCode::PRECONDITION_FAILED, + "live configuration is not allowed for this compute node".to_string(), + ); + } + + let pspec = match ParsedSpec::try_from(request.spec.clone()) { + Ok(p) => p, + Err(e) => return JsonResponse::error(StatusCode::BAD_REQUEST, e), + }; + + // XXX: wrap state update under lock in a code block. Otherwise, we will try + // to `Send` `mut state` into the spawned thread bellow, which will cause + // the following rustc error: + // + // error: future cannot be sent between threads safely + { + let mut state = compute.state.lock().unwrap(); + if !matches!(state.status, ComputeStatus::Empty | ComputeStatus::Running) { + return JsonResponse::invalid_status(state.status); + } + + state.pspec = Some(pspec); + state.set_status(ComputeStatus::ConfigurationPending, &compute.state_changed); + drop(state); + } + + // Spawn a blocking thread to wait for compute to become Running. This is + // needed to do not block the main pool of workers and be able to serve + // other requests while some particular request is waiting for compute to + // finish configuration. + let c = compute.clone(); + let completed = task::spawn_blocking(move || { + let mut state = c.state.lock().unwrap(); + while state.status != ComputeStatus::Running { + state = c.state_changed.wait(state).unwrap(); + info!( + "waiting for compute to become {}, current status: {}", + ComputeStatus::Running, + state.status + ); + + if state.status == ComputeStatus::Failed { + let err = state.error.as_ref().map_or("unknown error", |x| x); + let msg = format!("compute configuration failed: {:?}", err); + return Err(msg); + } + } + + Ok(()) + }) + .await + .unwrap(); + + if let Err(e) = completed { + return JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e); + } + + // Return current compute state if everything went well. 
+ let state = compute.state.lock().unwrap().clone(); + let body = ComputeStatusResponse::from(&state); + + JsonResponse::success(StatusCode::OK, body) +} diff --git a/compute_tools/src/http/routes/database_schema.rs b/compute_tools/src/http/routes/database_schema.rs new file mode 100644 index 0000000000..fd716272dc --- /dev/null +++ b/compute_tools/src/http/routes/database_schema.rs @@ -0,0 +1,34 @@ +use std::sync::Arc; + +use axum::{body::Body, extract::State, response::Response}; +use http::{header::CONTENT_TYPE, StatusCode}; +use serde::Deserialize; + +use crate::{ + catalog::{get_database_schema, SchemaDumpError}, + compute::ComputeNode, + http::{extract::Query, JsonResponse}, +}; + +#[derive(Debug, Clone, Deserialize)] +pub(in crate::http) struct DatabaseSchemaParams { + database: String, +} + +/// Get a schema dump of the requested database. +pub(in crate::http) async fn get_schema_dump( + params: Query, + State(compute): State>, +) -> Response { + match get_database_schema(&compute, ¶ms.database).await { + Ok(schema) => Response::builder() + .status(StatusCode::OK) + .header(CONTENT_TYPE.as_str(), "application/json") + .body(Body::from_stream(schema)) + .unwrap(), + Err(SchemaDumpError::DatabaseDoesNotExist) => { + JsonResponse::error(StatusCode::NOT_FOUND, SchemaDumpError::DatabaseDoesNotExist) + } + Err(e) => JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e), + } +} diff --git a/compute_tools/src/http/routes/dbs_and_roles.rs b/compute_tools/src/http/routes/dbs_and_roles.rs new file mode 100644 index 0000000000..4843c3fab4 --- /dev/null +++ b/compute_tools/src/http/routes/dbs_and_roles.rs @@ -0,0 +1,16 @@ +use std::sync::Arc; + +use axum::{extract::State, response::Response}; +use http::StatusCode; + +use crate::{catalog::get_dbs_and_roles, compute::ComputeNode, http::JsonResponse}; + +/// Get the databases and roles from the compute. +pub(in crate::http) async fn get_catalog_objects( + State(compute): State>, +) -> Response { + match get_dbs_and_roles(&compute).await { + Ok(catalog_objects) => JsonResponse::success(StatusCode::OK, catalog_objects), + Err(e) => JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e), + } +} diff --git a/compute_tools/src/http/routes/extension_server.rs b/compute_tools/src/http/routes/extension_server.rs new file mode 100644 index 0000000000..ee5bc675ba --- /dev/null +++ b/compute_tools/src/http/routes/extension_server.rs @@ -0,0 +1,67 @@ +use std::sync::Arc; + +use axum::{ + extract::State, + response::{IntoResponse, Response}, +}; +use http::StatusCode; +use serde::Deserialize; + +use crate::{ + compute::ComputeNode, + http::{ + extract::{Path, Query}, + JsonResponse, + }, +}; + +#[derive(Debug, Clone, Deserialize)] +pub(in crate::http) struct ExtensionServerParams { + is_library: Option, +} + +/// Download a remote extension. 
+pub(in crate::http) async fn download_extension( + Path(filename): Path, + params: Query, + State(compute): State>, +) -> Response { + // Don't even try to download extensions if no remote storage is configured + if compute.ext_remote_storage.is_none() { + return JsonResponse::error( + StatusCode::PRECONDITION_FAILED, + "remote storage is not configured", + ); + } + + let ext = { + let state = compute.state.lock().unwrap(); + let pspec = state.pspec.as_ref().unwrap(); + let spec = &pspec.spec; + + let remote_extensions = match spec.remote_extensions.as_ref() { + Some(r) => r, + None => { + return JsonResponse::error( + StatusCode::CONFLICT, + "information about remote extensions is unavailable", + ); + } + }; + + remote_extensions.get_ext( + &filename, + params.is_library.unwrap_or(false), + &compute.build_tag, + &compute.pgversion, + ) + }; + + match ext { + Ok((ext_name, ext_path)) => match compute.download_extension(ext_name, ext_path).await { + Ok(_) => StatusCode::OK.into_response(), + Err(e) => JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e), + }, + Err(e) => JsonResponse::error(StatusCode::NOT_FOUND, e), + } +} diff --git a/compute_tools/src/http/routes/extensions.rs b/compute_tools/src/http/routes/extensions.rs new file mode 100644 index 0000000000..1fc03b9109 --- /dev/null +++ b/compute_tools/src/http/routes/extensions.rs @@ -0,0 +1,45 @@ +use std::sync::Arc; + +use axum::{extract::State, response::Response}; +use compute_api::{ + requests::ExtensionInstallRequest, + responses::{ComputeStatus, ExtensionInstallResponse}, +}; +use http::StatusCode; + +use crate::{ + compute::ComputeNode, + http::{extract::Json, JsonResponse}, +}; + +/// Install a extension. +pub(in crate::http) async fn install_extension( + State(compute): State>, + request: Json, +) -> Response { + let status = compute.get_status(); + if status != ComputeStatus::Running { + return JsonResponse::invalid_status(status); + } + + match compute + .install_extension( + &request.extension, + &request.database, + request.version.to_string(), + ) + .await + { + Ok(version) => JsonResponse::success( + StatusCode::CREATED, + Some(ExtensionInstallResponse { + extension: request.extension.clone(), + version, + }), + ), + Err(e) => JsonResponse::error( + StatusCode::INTERNAL_SERVER_ERROR, + format!("failed to install extension: {e}"), + ), + } +} diff --git a/compute_tools/src/http/routes/failpoints.rs b/compute_tools/src/http/routes/failpoints.rs new file mode 100644 index 0000000000..2ec4511676 --- /dev/null +++ b/compute_tools/src/http/routes/failpoints.rs @@ -0,0 +1,35 @@ +use axum::response::{IntoResponse, Response}; +use http::StatusCode; +use tracing::info; +use utils::failpoint_support::{apply_failpoint, ConfigureFailpointsRequest}; + +use crate::http::{extract::Json, JsonResponse}; + +/// Configure failpoints for testing purposes. 
+pub(in crate::http) async fn configure_failpoints( + failpoints: Json, +) -> Response { + if !fail::has_failpoints() { + return JsonResponse::error( + StatusCode::PRECONDITION_FAILED, + "Cannot manage failpoints because neon was compiled without failpoints support", + ); + } + + for fp in &*failpoints { + info!("cfg failpoint: {} {}", fp.name, fp.actions); + + // We recognize one extra "action" that's not natively recognized + // by the failpoints crate: exit, to immediately kill the process + let cfg_result = apply_failpoint(&fp.name, &fp.actions); + + if let Err(e) = cfg_result { + return JsonResponse::error( + StatusCode::BAD_REQUEST, + format!("failed to configure failpoints: {e}"), + ); + } + } + + StatusCode::OK.into_response() +} diff --git a/compute_tools/src/http/routes/grants.rs b/compute_tools/src/http/routes/grants.rs new file mode 100644 index 0000000000..3f67f011e5 --- /dev/null +++ b/compute_tools/src/http/routes/grants.rs @@ -0,0 +1,48 @@ +use std::sync::Arc; + +use axum::{extract::State, response::Response}; +use compute_api::{ + requests::SetRoleGrantsRequest, + responses::{ComputeStatus, SetRoleGrantsResponse}, +}; +use http::StatusCode; + +use crate::{ + compute::ComputeNode, + http::{extract::Json, JsonResponse}, +}; + +/// Add grants for a role. +pub(in crate::http) async fn add_grant( + State(compute): State>, + request: Json, +) -> Response { + let status = compute.get_status(); + if status != ComputeStatus::Running { + return JsonResponse::invalid_status(status); + } + + match compute + .set_role_grants( + &request.database, + &request.schema, + &request.privileges, + &request.role, + ) + .await + { + Ok(()) => JsonResponse::success( + StatusCode::CREATED, + Some(SetRoleGrantsResponse { + database: request.database.clone(), + schema: request.schema.clone(), + role: request.role.clone(), + privileges: request.privileges.clone(), + }), + ), + Err(e) => JsonResponse::error( + StatusCode::INTERNAL_SERVER_ERROR, + format!("failed to grant role privileges to the schema: {e}"), + ), + } +} diff --git a/compute_tools/src/http/routes/info.rs b/compute_tools/src/http/routes/info.rs new file mode 100644 index 0000000000..32d6fea74c --- /dev/null +++ b/compute_tools/src/http/routes/info.rs @@ -0,0 +1,11 @@ +use axum::response::Response; +use compute_api::responses::InfoResponse; +use http::StatusCode; + +use crate::http::JsonResponse; + +/// Get information about the physical characteristics about the compute. +pub(in crate::http) async fn get_info() -> Response { + let num_cpus = num_cpus::get_physical(); + JsonResponse::success(StatusCode::OK, &InfoResponse { num_cpus }) +} diff --git a/compute_tools/src/http/routes/insights.rs b/compute_tools/src/http/routes/insights.rs new file mode 100644 index 0000000000..6b03a461c3 --- /dev/null +++ b/compute_tools/src/http/routes/insights.rs @@ -0,0 +1,18 @@ +use std::sync::Arc; + +use axum::{extract::State, response::Response}; +use compute_api::responses::ComputeStatus; +use http::StatusCode; + +use crate::{compute::ComputeNode, http::JsonResponse}; + +/// Collect current Postgres usage insights. 
+pub(in crate::http) async fn get_insights(State(compute): State>) -> Response { + let status = compute.get_status(); + if status != ComputeStatus::Running { + return JsonResponse::invalid_status(status); + } + + let insights = compute.collect_insights().await; + JsonResponse::success(StatusCode::OK, insights) +} diff --git a/compute_tools/src/http/routes/installed_extensions.rs b/compute_tools/src/http/routes/installed_extensions.rs new file mode 100644 index 0000000000..db74a6b195 --- /dev/null +++ b/compute_tools/src/http/routes/installed_extensions.rs @@ -0,0 +1,33 @@ +use std::sync::Arc; + +use axum::{extract::State, response::Response}; +use compute_api::responses::ComputeStatus; +use http::StatusCode; +use tokio::task; + +use crate::{compute::ComputeNode, http::JsonResponse, installed_extensions}; + +/// Get a list of installed extensions. +pub(in crate::http) async fn get_installed_extensions( + State(compute): State>, +) -> Response { + let status = compute.get_status(); + if status != ComputeStatus::Running { + return JsonResponse::invalid_status(status); + } + + let conf = compute.get_conn_conf(None); + let res = task::spawn_blocking(move || installed_extensions::get_installed_extensions(conf)) + .await + .unwrap(); + + match res { + Ok(installed_extensions) => { + JsonResponse::success(StatusCode::OK, Some(installed_extensions)) + } + Err(e) => JsonResponse::error( + StatusCode::INTERNAL_SERVER_ERROR, + format!("failed to get list of installed extensions: {e}"), + ), + } +} diff --git a/compute_tools/src/http/routes/metrics.rs b/compute_tools/src/http/routes/metrics.rs new file mode 100644 index 0000000000..40d71b5de7 --- /dev/null +++ b/compute_tools/src/http/routes/metrics.rs @@ -0,0 +1,32 @@ +use axum::{body::Body, response::Response}; +use http::header::CONTENT_TYPE; +use http::StatusCode; +use metrics::proto::MetricFamily; +use metrics::Encoder; +use metrics::TextEncoder; + +use crate::{http::JsonResponse, installed_extensions}; + +/// Expose Prometheus metrics. +pub(in crate::http) async fn get_metrics() -> Response { + // When we call TextEncoder::encode() below, it will immediately return an + // error if a metric family has no metrics, so we need to preemptively + // filter out metric families with no metrics. + let metrics = installed_extensions::collect() + .into_iter() + .filter(|m| !m.get_metric().is_empty()) + .collect::>(); + + let encoder = TextEncoder::new(); + let mut buffer = vec![]; + + if let Err(e) = encoder.encode(&metrics, &mut buffer) { + return JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e); + } + + Response::builder() + .status(StatusCode::OK) + .header(CONTENT_TYPE, encoder.format_type()) + .body(Body::from(buffer)) + .unwrap() +} diff --git a/compute_tools/src/http/routes/metrics_json.rs b/compute_tools/src/http/routes/metrics_json.rs new file mode 100644 index 0000000000..0709db5011 --- /dev/null +++ b/compute_tools/src/http/routes/metrics_json.rs @@ -0,0 +1,12 @@ +use std::sync::Arc; + +use axum::{extract::State, response::Response}; +use http::StatusCode; + +use crate::{compute::ComputeNode, http::JsonResponse}; + +/// Get startup metrics. 
+pub(in crate::http) async fn get_metrics(State(compute): State>) -> Response { + let metrics = compute.state.lock().unwrap().metrics.clone(); + JsonResponse::success(StatusCode::OK, metrics) +} diff --git a/compute_tools/src/http/routes/mod.rs b/compute_tools/src/http/routes/mod.rs new file mode 100644 index 0000000000..3efa1153ad --- /dev/null +++ b/compute_tools/src/http/routes/mod.rs @@ -0,0 +1,38 @@ +use compute_api::responses::ComputeStatusResponse; + +use crate::compute::ComputeState; + +pub(in crate::http) mod check_writability; +pub(in crate::http) mod configure; +pub(in crate::http) mod database_schema; +pub(in crate::http) mod dbs_and_roles; +pub(in crate::http) mod extension_server; +pub(in crate::http) mod extensions; +pub(in crate::http) mod failpoints; +pub(in crate::http) mod grants; +pub(in crate::http) mod info; +pub(in crate::http) mod insights; +pub(in crate::http) mod installed_extensions; +pub(in crate::http) mod metrics; +pub(in crate::http) mod metrics_json; +pub(in crate::http) mod status; +pub(in crate::http) mod terminate; + +impl From<&ComputeState> for ComputeStatusResponse { + fn from(state: &ComputeState) -> Self { + ComputeStatusResponse { + start_time: state.start_time, + tenant: state + .pspec + .as_ref() + .map(|pspec| pspec.tenant_id.to_string()), + timeline: state + .pspec + .as_ref() + .map(|pspec| pspec.timeline_id.to_string()), + status: state.status, + last_active: state.last_active, + error: state.error.clone(), + } + } +} diff --git a/compute_tools/src/http/routes/status.rs b/compute_tools/src/http/routes/status.rs new file mode 100644 index 0000000000..d64d53a58f --- /dev/null +++ b/compute_tools/src/http/routes/status.rs @@ -0,0 +1,14 @@ +use std::{ops::Deref, sync::Arc}; + +use axum::{extract::State, http::StatusCode, response::Response}; +use compute_api::responses::ComputeStatusResponse; + +use crate::{compute::ComputeNode, http::JsonResponse}; + +/// Retrieve the state of the comute. +pub(in crate::http) async fn get_status(State(compute): State>) -> Response { + let state = compute.state.lock().unwrap(); + let body = ComputeStatusResponse::from(state.deref()); + + JsonResponse::success(StatusCode::OK, body) +} diff --git a/compute_tools/src/http/routes/terminate.rs b/compute_tools/src/http/routes/terminate.rs new file mode 100644 index 0000000000..7acd84f236 --- /dev/null +++ b/compute_tools/src/http/routes/terminate.rs @@ -0,0 +1,58 @@ +use std::sync::Arc; + +use axum::{ + extract::State, + response::{IntoResponse, Response}, +}; +use compute_api::responses::ComputeStatus; +use http::StatusCode; +use tokio::task; +use tracing::info; + +use crate::{ + compute::{forward_termination_signal, ComputeNode}, + http::JsonResponse, +}; + +/// Terminate the compute. +pub(in crate::http) async fn terminate(State(compute): State>) -> Response { + { + let mut state = compute.state.lock().unwrap(); + if state.status == ComputeStatus::Terminated { + return StatusCode::CREATED.into_response(); + } + + if !matches!(state.status, ComputeStatus::Empty | ComputeStatus::Running) { + return JsonResponse::invalid_status(state.status); + } + + state.set_status(ComputeStatus::TerminationPending, &compute.state_changed); + drop(state); + } + + forward_termination_signal(); + info!("sent signal and notified waiters"); + + // Spawn a blocking thread to wait for compute to become Terminated. 
+ // This is needed to do not block the main pool of workers and + // be able to serve other requests while some particular request + // is waiting for compute to finish configuration. + let c = compute.clone(); + task::spawn_blocking(move || { + let mut state = c.state.lock().unwrap(); + while state.status != ComputeStatus::Terminated { + state = c.state_changed.wait(state).unwrap(); + info!( + "waiting for compute to become {}, current status: {:?}", + ComputeStatus::Terminated, + state.status + ); + } + }) + .await + .unwrap(); + + info!("terminated Postgres"); + + StatusCode::OK.into_response() +} diff --git a/compute_tools/src/http/server.rs b/compute_tools/src/http/server.rs new file mode 100644 index 0000000000..33d4b489a0 --- /dev/null +++ b/compute_tools/src/http/server.rs @@ -0,0 +1,165 @@ +use std::{ + net::{IpAddr, Ipv6Addr, SocketAddr}, + sync::{ + atomic::{AtomicU64, Ordering}, + Arc, + }, + thread, + time::Duration, +}; + +use anyhow::Result; +use axum::{ + response::{IntoResponse, Response}, + routing::{get, post}, + Router, +}; +use http::StatusCode; +use tokio::net::TcpListener; +use tower::ServiceBuilder; +use tower_http::{ + request_id::{MakeRequestId, PropagateRequestIdLayer, RequestId, SetRequestIdLayer}, + trace::TraceLayer, +}; +use tracing::{debug, error, info, Span}; + +use super::routes::{ + check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions, + grants, info as info_route, insights, installed_extensions, metrics, metrics_json, status, + terminate, +}; +use crate::compute::ComputeNode; + +async fn handle_404() -> Response { + StatusCode::NOT_FOUND.into_response() +} + +#[derive(Clone, Default)] +struct ComputeMakeRequestId(Arc); + +impl MakeRequestId for ComputeMakeRequestId { + fn make_request_id( + &mut self, + _request: &http::Request, + ) -> Option { + let request_id = self + .0 + .fetch_add(1, Ordering::SeqCst) + .to_string() + .parse() + .unwrap(); + + Some(RequestId::new(request_id)) + } +} + +/// Run the HTTP server and wait on it forever. 
+#[tokio::main] +async fn serve(port: u16, compute: Arc) { + const X_REQUEST_ID: &str = "x-request-id"; + + let mut app = Router::new() + .route("/check_writability", post(check_writability::is_writable)) + .route("/configure", post(configure::configure)) + .route("/database_schema", get(database_schema::get_schema_dump)) + .route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects)) + .route( + "/extension_server/*filename", + post(extension_server::download_extension), + ) + .route("/extensions", post(extensions::install_extension)) + .route("/grants", post(grants::add_grant)) + .route("/info", get(info_route::get_info)) + .route("/insights", get(insights::get_insights)) + .route( + "/installed_extensions", + get(installed_extensions::get_installed_extensions), + ) + .route("/metrics", get(metrics::get_metrics)) + .route("/metrics.json", get(metrics_json::get_metrics)) + .route("/status", get(status::get_status)) + .route("/terminate", post(terminate::terminate)) + .fallback(handle_404) + .layer( + ServiceBuilder::new() + .layer(SetRequestIdLayer::x_request_id( + ComputeMakeRequestId::default(), + )) + .layer( + TraceLayer::new_for_http() + .on_request(|request: &http::Request<_>, _span: &Span| { + let request_id = request + .headers() + .get(X_REQUEST_ID) + .unwrap() + .to_str() + .unwrap(); + + match request.uri().path() { + "/metrics" => { + debug!(%request_id, "{} {}", request.method(), request.uri()) + } + _ => info!(%request_id, "{} {}", request.method(), request.uri()), + }; + }) + .on_response( + |response: &http::Response<_>, latency: Duration, _span: &Span| { + let request_id = response + .headers() + .get(X_REQUEST_ID) + .unwrap() + .to_str() + .unwrap(); + + info!( + %request_id, + code = response.status().as_u16(), + latency = latency.as_millis() + ) + }, + ), + ) + .layer(PropagateRequestIdLayer::x_request_id()), + ) + .with_state(compute); + + // Add in any testing support + if cfg!(feature = "testing") { + use super::routes::failpoints; + + app = app.route("/failpoints", post(failpoints::configure_failpoints)) + } + + // This usually binds to both IPv4 and IPv6 on Linux, see + // https://github.com/rust-lang/rust/pull/34440 for more information + let addr = SocketAddr::new(IpAddr::from(Ipv6Addr::UNSPECIFIED), port); + let listener = match TcpListener::bind(&addr).await { + Ok(listener) => listener, + Err(e) => { + error!( + "failed to bind the compute_ctl HTTP server to port {}: {}", + port, e + ); + return; + } + }; + + if let Ok(local_addr) = listener.local_addr() { + info!("compute_ctl HTTP server listening on {}", local_addr); + } else { + info!("compute_ctl HTTP server listening on port {}", port); + } + + if let Err(e) = axum::serve(listener, app).await { + error!("compute_ctl HTTP server error: {}", e); + } +} + +/// Launch a separate HTTP server thread and return its `JoinHandle`. +pub fn launch_http_server(port: u16, state: &Arc) -> Result> { + let state = Arc::clone(state); + + Ok(thread::Builder::new() + .name("http-server".into()) + .spawn(move || serve(port, state))?) 
+} diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs index ee4cf2dfa5..12fea4e61a 100644 --- a/compute_tools/src/lib.rs +++ b/compute_tools/src/lib.rs @@ -3,8 +3,6 @@ #![deny(unsafe_code)] #![deny(clippy::undocumented_unsafe_blocks)] -extern crate hyper0 as hyper; - pub mod checker; pub mod config; pub mod configurator; diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 5e47ec4811..b8027abf7c 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -62,7 +62,7 @@ use crate::local_env::LocalEnv; use crate::postgresql_conf::PostgresConf; use crate::storage_controller::StorageController; -use compute_api::responses::{ComputeState, ComputeStatus}; +use compute_api::responses::{ComputeStatus, ComputeStatusResponse}; use compute_api::spec::{Cluster, ComputeFeature, ComputeMode, ComputeSpec}; // contents of a endpoint.json file @@ -739,7 +739,7 @@ impl Endpoint { } // Call the /status HTTP API - pub async fn get_status(&self) -> Result { + pub async fn get_status(&self) -> Result { let client = reqwest::Client::new(); let response = client diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index 0d65f6a38d..9ce605089b 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -15,6 +15,17 @@ pub struct GenericAPIError { pub error: String, } +#[derive(Debug, Clone, Serialize)] +pub struct InfoResponse { + pub num_cpus: usize, +} + +#[derive(Debug, Clone, Serialize)] +pub struct ExtensionInstallResponse { + pub extension: PgIdent, + pub version: ExtVersion, +} + /// Response of the /status API #[derive(Serialize, Debug, Deserialize)] #[serde(rename_all = "snake_case")] @@ -28,16 +39,6 @@ pub struct ComputeStatusResponse { pub error: Option, } -#[derive(Deserialize, Serialize)] -#[serde(rename_all = "snake_case")] -pub struct ComputeState { - pub status: ComputeStatus, - /// Timestamp of the last Postgres activity - #[serde(serialize_with = "rfc3339_serialize")] - pub last_active: Option>, - pub error: Option, -} - #[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq)] #[serde(rename_all = "snake_case")] pub enum ComputeStatus { @@ -78,7 +79,7 @@ impl Display for ComputeStatus { } } -fn rfc3339_serialize(x: &Option>, s: S) -> Result +pub fn rfc3339_serialize(x: &Option>, s: S) -> Result where S: Serializer, { diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 33bdc25785..0ffeeead18 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -91,7 +91,8 @@ tokio-stream = { version = "0.1", features = ["net"] } tokio-util = { version = "0.7", features = ["codec", "compat", "io", "rt"] } toml_edit = { version = "0.22", features = ["serde"] } tonic = { version = "0.12", features = ["tls-roots"] } -tower = { version = "0.4", default-features = false, features = ["balance", "buffer", "limit", "log", "util"] } +tower-9fbad63c4bcf4a8f = { package = "tower", version = "0.4", default-features = false, features = ["balance", "buffer", "limit", "util"] } +tower-d8f496e17d97b5cb = { package = "tower", version = "0.5", default-features = false, features = ["log", "make", "util"] } tracing = { version = "0.1", features = ["log"] } tracing-core = { version = "0.1" } url = { version = "2", features = ["serde"] } From 6149ac88343c455fc99b6fb8e49c5d8e92e7dec2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 9 Jan 2025 21:41:49 +0100 Subject: [PATCH 141/583] Handle race between auto-offload and unarchival (#10305) 
## Problem

Auto-offloading as requested by the compaction task is racy with
unarchival: the compaction task might attempt to offload a timeline that
has just been unarchived. By that point it will already have set the
timeline to the `Stopping` state, which makes the timeline unusable for
any purpose. For example:

1. compaction task decides to offload the timeline
2. timeline gets unarchived
3. `offload_timeline` gets called by the compaction task
   * sets the timeline's state to `Stopping`
   * realizes that the timeline is no longer archived, errors out
4. endpoint can't be started, as the timeline is `Stopping` and thus
   'can't be found'.

A future iteration of the compaction task can't "heal" this state either,
as the timeline will still not be archived; the same goes for other
automatic mechanisms. The only way to heal it is a tenant detach+attach,
or alternatively a pageserver restart.

Furthermore, the compaction task is especially prone to such races: it
first stores `can_offload` in a variable, figures out whether compaction
is needed (which takes some time), and only then attempts the offload
operation, so the window between "check" and "use" is not trivially small.
To make it even worse, we start the compaction task right after attaching
a tenant, and it is a common pattern for pageserver users to attach a
tenant and then immediately unarchive a timeline, so that an endpoint can
be started.

## Solutions not adopted

The simplest solution would be to move the `can_offload` check to right
before attempting the offload. But this is not a good solution, as no lock
is held between that check and timeline shutdown, so races would still be
possible, just less likely.

I also explored using the timeline state for this, i.e. adding an
additional enum variant. But `Timeline::set_state` is racy (#10297).

## Adopted solution

We use the lock on the timeline's upload queue as an arbiter: either
unarchival gets to it first and sours the state for auto-offloading, or
auto-offloading shuts it down, which stops any parallel unarchival in its
tracks. The key part is not releasing the upload queue's lock between
checking whether the timeline is archived and shutting the queue down (the
actual implementation only sets `shutting_down`, but that has the same
effect on `initialized_mut()` as a full shutdown). The rest of the patch
follows from this.

We also move the part where we set the state to `Stopping` to after the
arbiter has decided the fate of the timeline. For deletions, we do keep it
inside `DeleteTimelineFlow::prepare`, however, so that it is called with
all of the locks held that the function acquires (most importantly the
timelines lock). This is only a precautionary measure, as I didn't want to
analyze deletion-related code for possible races.

## Future changes

It might make sense to move the `can_offload` check to right before the
offload attempt; maybe some other properties might have changed by then as
well. This will not be perfect either, as no lock is held, and I want to
keep it out of this change to emphasize that this move wasn't the main
reason we are race-free now.
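Editorial illustration (not part of the patch): a minimal, self-contained sketch of the check-and-mark-under-one-lock idea described under "Adopted solution". All names here (`UploadQueue`, `TimelineClient`, `OffloadDecision`) are invented simplifications; the real `shutdown_if_archived` also distinguishes the dirty/clean index parts and queues a proper shutdown operation, as the diff below shows.

```rust
use std::sync::Mutex;

#[derive(Debug, PartialEq)]
enum OffloadDecision {
    ShutDown,
    NotArchived,
}

struct UploadQueue {
    archived_at: Option<u64>, // Some(_) while the timeline is archived
    shutting_down: bool,
}

struct TimelineClient {
    upload_queue: Mutex<UploadQueue>,
}

impl TimelineClient {
    /// Offload side: check "is archived" and set `shutting_down` while holding
    /// the same lock, so unarchival cannot sneak in between check and use.
    fn shutdown_if_archived(&self) -> OffloadDecision {
        let mut q = self.upload_queue.lock().unwrap();
        if q.archived_at.is_none() || q.shutting_down {
            return OffloadDecision::NotArchived;
        }
        q.shutting_down = true; // from here on, unarchival sees a dead queue
        OffloadDecision::ShutDown
    }

    /// Unarchival side: only succeeds while the queue is still usable,
    /// i.e. before offloading set `shutting_down` under the same lock.
    fn try_unarchive(&self) -> Result<(), &'static str> {
        let mut q = self.upload_queue.lock().unwrap();
        if q.shutting_down {
            return Err("upload queue stopped: offload won the race");
        }
        q.archived_at = None;
        Ok(())
    }
}

fn main() {
    let client = TimelineClient {
        upload_queue: Mutex::new(UploadQueue {
            archived_at: Some(42),
            shutting_down: false,
        }),
    };
    assert_eq!(client.shutdown_if_archived(), OffloadDecision::ShutDown);
    // Unarchival can no longer race past the check once offload decided.
    assert!(client.try_unarchive().is_err());
}
```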
Fixes #10220 --- pageserver/src/tenant.rs | 15 ++- .../src/tenant/remote_timeline_client.rs | 58 ++++++++++ pageserver/src/tenant/timeline/delete.rs | 11 +- pageserver/src/tenant/timeline/offload.rs | 32 +++--- test_runner/regress/test_timeline_archive.py | 100 ++++++++++++++++++ 5 files changed, 195 insertions(+), 21 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index e3dab2fc1d..8e61d09de7 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -48,6 +48,7 @@ use timeline::compaction::GcCompactJob; use timeline::compaction::ScheduledCompactionTask; use timeline::import_pgdata; use timeline::offload::offload_timeline; +use timeline::offload::OffloadError; use timeline::CompactFlags; use timeline::CompactOptions; use timeline::CompactionError; @@ -2039,7 +2040,7 @@ impl Tenant { ) -> Result, TimelineArchivalError> { info!("unoffloading timeline"); - // We activate the timeline below manually, so this must be called on an active timeline. + // We activate the timeline below manually, so this must be called on an active tenant. // We expect callers of this function to ensure this. match self.current_state() { TenantState::Activating { .. } @@ -3100,9 +3101,17 @@ impl Tenant { }; has_pending_task |= pending_task_left.unwrap_or(false); if pending_task_left == Some(false) && *can_offload { - offload_timeline(self, timeline) + pausable_failpoint!("before-timeline-auto-offload"); + match offload_timeline(self, timeline) .instrument(info_span!("offload_timeline", %timeline_id)) - .await?; + .await + { + Err(OffloadError::NotArchived) => { + // Ignore this, we likely raced with unarchival + Ok(()) + } + other => other, + }?; } } diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index b27ac3e933..813111245d 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -304,6 +304,15 @@ pub enum WaitCompletionError { #[derive(Debug, thiserror::Error)] #[error("Upload queue either in unexpected state or hasn't downloaded manifest yet")] pub struct UploadQueueNotReadyError; + +#[derive(Debug, thiserror::Error)] +pub enum ShutdownIfArchivedError { + #[error(transparent)] + NotInitialized(NotInitialized), + #[error("timeline is not archived")] + NotArchived, +} + /// Behavioral modes that enable seamless live migration. /// /// See docs/rfcs/028-pageserver-migration.md to understand how these fit in. @@ -816,6 +825,55 @@ impl RemoteTimelineClient { Ok(need_wait) } + /// Shuts the timeline client down, but only if the timeline is archived. + /// + /// This function and [`Self::schedule_index_upload_for_timeline_archival_state`] use the + /// same lock to prevent races between unarchival and offloading: unarchival requires the + /// upload queue to be initialized, and leaves behind an upload queue where either dirty + /// or clean has archived_at of `None`. offloading leaves behind an uninitialized upload + /// queue. 
+ pub(crate) async fn shutdown_if_archived( + self: &Arc, + ) -> Result<(), ShutdownIfArchivedError> { + { + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = guard + .initialized_mut() + .map_err(ShutdownIfArchivedError::NotInitialized)?; + + match ( + upload_queue.dirty.archived_at.is_none(), + upload_queue.clean.0.archived_at.is_none(), + ) { + // The expected case: the timeline is archived and we don't want to unarchive + (false, false) => {} + (true, false) => { + tracing::info!("can't shut down timeline: timeline slated for unarchival"); + return Err(ShutdownIfArchivedError::NotArchived); + } + (dirty_archived, true) => { + tracing::info!(%dirty_archived, "can't shut down timeline: timeline not archived in remote storage"); + return Err(ShutdownIfArchivedError::NotArchived); + } + } + + // Set the shutting_down flag while the guard from the archival check is held. + // This prevents a race with unarchival, as initialized_mut will not return + // an upload queue from this point. + // Also launch the queued tasks like shutdown() does. + if !upload_queue.shutting_down { + upload_queue.shutting_down = true; + upload_queue.queued_operations.push_back(UploadOp::Shutdown); + // this operation is not counted similar to Barrier + self.launch_queued_tasks(upload_queue); + } + } + + self.shutdown().await; + + Ok(()) + } + /// Launch an index-file upload operation in the background, setting `import_pgdata` field. pub(crate) fn schedule_index_upload_for_import_pgdata_state_update( self: &Arc, diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 47a93b19d2..ae44af3fad 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -194,7 +194,9 @@ impl DeleteTimelineFlow { super::debug_assert_current_span_has_tenant_and_timeline_id(); let allow_offloaded_children = false; - let (timeline, mut guard) = Self::prepare(tenant, timeline_id, allow_offloaded_children)?; + let set_stopping = true; + let (timeline, mut guard) = + Self::prepare(tenant, timeline_id, allow_offloaded_children, set_stopping)?; guard.mark_in_progress()?; @@ -334,6 +336,7 @@ impl DeleteTimelineFlow { tenant: &Tenant, timeline_id: TimelineId, allow_offloaded_children: bool, + set_stopping: bool, ) -> Result<(TimelineOrOffloaded, DeletionGuard), DeleteTimelineError> { // Note the interaction between this guard and deletion guard. // Here we attempt to lock deletion guard when we're holding a lock on timelines. 
@@ -389,8 +392,10 @@ impl DeleteTimelineFlow { } }; - if let TimelineOrOffloaded::Timeline(timeline) = &timeline { - timeline.set_state(TimelineState::Stopping); + if set_stopping { + if let TimelineOrOffloaded::Timeline(timeline) = &timeline { + timeline.set_state(TimelineState::Stopping); + } } Ok((timeline, delete_lock_guard)) diff --git a/pageserver/src/tenant/timeline/offload.rs b/pageserver/src/tenant/timeline/offload.rs index 15628a9645..6c6b19e8b1 100644 --- a/pageserver/src/tenant/timeline/offload.rs +++ b/pageserver/src/tenant/timeline/offload.rs @@ -1,10 +1,11 @@ use std::sync::Arc; -use pageserver_api::models::TenantState; +use pageserver_api::models::{TenantState, TimelineState}; use super::delete::{delete_local_timeline_directory, DeleteTimelineFlow, DeletionGuard}; use super::Timeline; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; +use crate::tenant::remote_timeline_client::ShutdownIfArchivedError; use crate::tenant::{OffloadedTimeline, Tenant, TenantManifestError, TimelineOrOffloaded}; #[derive(thiserror::Error, Debug)] @@ -36,28 +37,29 @@ pub(crate) async fn offload_timeline( tracing::info!("offloading archived timeline"); let allow_offloaded_children = true; - let (timeline, guard) = - DeleteTimelineFlow::prepare(tenant, timeline.timeline_id, allow_offloaded_children) - .map_err(|e| OffloadError::Other(anyhow::anyhow!(e)))?; + let set_stopping = false; + let (timeline, guard) = DeleteTimelineFlow::prepare( + tenant, + timeline.timeline_id, + allow_offloaded_children, + set_stopping, + ) + .map_err(|e| OffloadError::Other(anyhow::anyhow!(e)))?; let TimelineOrOffloaded::Timeline(timeline) = timeline else { tracing::error!("timeline already offloaded, but given timeline object"); return Ok(()); }; - let is_archived = timeline.is_archived(); - match is_archived { - Some(true) => (), - Some(false) => { - tracing::warn!("tried offloading a non-archived timeline"); - return Err(OffloadError::NotArchived); - } - None => { - // This is legal: calls to this function can race with the timeline shutting down - tracing::info!("tried offloading a timeline whose remote storage is not initialized"); - return Err(OffloadError::Cancelled); + match timeline.remote_client.shutdown_if_archived().await { + Ok(()) => {} + Err(ShutdownIfArchivedError::NotInitialized(_)) => { + // Either the timeline is being deleted, the operation is being retried, or we are shutting down. + // Don't return cancelled here to keep it idempotent. } + Err(ShutdownIfArchivedError::NotArchived) => return Err(OffloadError::NotArchived), } + timeline.set_state(TimelineState::Stopping); // Now that the Timeline is in Stopping state, request all the related tasks to shut down. 
timeline.shutdown(super::ShutdownMode::Reload).await; diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index 9b3a48add9..bec8270582 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -959,3 +959,103 @@ def test_timeline_offload_generations(neon_env_builder: NeonEnvBuilder): assert gc_summary["remote_storage_errors"] == 0 assert gc_summary["indices_deleted"] > 0 assert gc_summary["tenant_manifests_deleted"] > 0 + + +@pytest.mark.parametrize("end_with_offloaded", [False, True]) +def test_timeline_offload_race_unarchive( + neon_env_builder: NeonEnvBuilder, end_with_offloaded: bool +): + """ + Ensure that unarchive and timeline offload don't race each other + """ + # Regression test for issue https://github.com/neondatabase/neon/issues/10220 + # (automatic) timeline offloading defaults to false for now + neon_env_builder.pageserver_config_override = "timeline_offloading = true" + + failpoint = "before-timeline-auto-offload" + + env = neon_env_builder.init_start() + ps_http = env.pageserver.http_client() + + # Turn off gc and compaction loops: we want to issue them manually for better reliability + tenant_id, initial_timeline_id = env.create_tenant( + conf={ + "gc_period": "0s", + "compaction_period": "1s", + } + ) + + # Create a branch + leaf_timeline_id = env.create_branch("test_ancestor_branch_archive", tenant_id) + + # write some stuff to the leaf + with env.endpoints.create_start( + "test_ancestor_branch_archive", tenant_id=tenant_id + ) as endpoint: + endpoint.safe_psql_many( + [ + "CREATE TABLE foo(key serial primary key, t text default 'data_content')", + "INSERT INTO foo SELECT FROM generate_series(1,1000)", + ] + ) + sum = endpoint.safe_psql("SELECT sum(key) from foo where key % 7 = 1") + + ps_http.configure_failpoints((failpoint, "pause")) + + ps_http.timeline_archival_config( + tenant_id, + leaf_timeline_id, + state=TimelineArchivalState.ARCHIVED, + ) + leaf_detail = ps_http.timeline_detail( + tenant_id, + leaf_timeline_id, + ) + assert leaf_detail["is_archived"] is True + + # The actual race: get the compaction task to right before + # offloading the timeline and attempt to unarchive it + wait_until(lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}")) + + # This unarchival should go through + ps_http.timeline_archival_config( + tenant_id, + leaf_timeline_id, + state=TimelineArchivalState.UNARCHIVED, + ) + + def timeline_offloaded_api(timeline_id: TimelineId) -> bool: + # TODO add a proper API to check if a timeline has been offloaded or not + return not any( + timeline["timeline_id"] == str(timeline_id) + for timeline in ps_http.timeline_list(tenant_id=tenant_id) + ) + + def leaf_offloaded(): + assert timeline_offloaded_api(leaf_timeline_id) + + # Ensure that we've hit the failed offload attempt + ps_http.configure_failpoints((failpoint, "off")) + wait_until( + lambda: env.pageserver.assert_log_contains( + f".*compaction_loop.*offload_timeline.*{leaf_timeline_id}.*can't shut down timeline.*" + ) + ) + + with env.endpoints.create_start( + "test_ancestor_branch_archive", tenant_id=tenant_id + ) as endpoint: + sum_again = endpoint.safe_psql("SELECT sum(key) from foo where key % 7 = 1") + assert sum == sum_again + + if end_with_offloaded: + # Ensure that offloading still works after all of this + ps_http.timeline_archival_config( + tenant_id, + leaf_timeline_id, + state=TimelineArchivalState.ARCHIVED, + ) + wait_until(leaf_offloaded) + else: + # Test 
that deletion of leaf timeline works + ps_http.timeline_delete(tenant_id, leaf_timeline_id) From b6205af4a5542935ca97ccd89d205f95bb75ce27 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Fri, 10 Jan 2025 09:48:03 +0100 Subject: [PATCH 142/583] Update tracing/otel crates (#10311) Update the tracing(-x) and opentelemetry(-x) crates. Some breaking changes require updating our code: * Initialization is done via builders now https://github.com/open-telemetry/opentelemetry-rust/blob/main/opentelemetry-otlp/CHANGELOG.md#0270 * Errors from OTel SDK are logged via tracing crate as well. https://github.com/open-telemetry/opentelemetry-rust/blob/main/opentelemetry/CHANGELOG.md#0270 --- Cargo.lock | 36 ++++++++++++++-------------- Cargo.toml | 12 +++++----- compute_tools/src/bin/compute_ctl.rs | 5 ---- libs/tracing-utils/src/lib.rs | 24 +++++++++---------- 4 files changed, 35 insertions(+), 42 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 44143fa0da..87a54c78ec 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3693,23 +3693,23 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "opentelemetry" -version = "0.26.0" +version = "0.27.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "570074cc999d1a58184080966e5bd3bf3a9a4af650c3b05047c2621e7405cd17" +checksum = "ab70038c28ed37b97d8ed414b6429d343a8bbf44c9f79ec854f3a643029ba6d7" dependencies = [ "futures-core", "futures-sink", "js-sys", - "once_cell", "pin-project-lite", "thiserror", + "tracing", ] [[package]] name = "opentelemetry-http" -version = "0.26.0" +version = "0.27.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6351496aeaa49d7c267fb480678d85d1cd30c5edb20b497c48c56f62a8c14b99" +checksum = "10a8a7f5f6ba7c1b286c2fbca0454eaba116f63bbe69ed250b642d36fbb04d80" dependencies = [ "async-trait", "bytes", @@ -3720,9 +3720,9 @@ dependencies = [ [[package]] name = "opentelemetry-otlp" -version = "0.26.0" +version = "0.27.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29e1f9c8b032d4f635c730c0efcf731d5e2530ea13fa8bef7939ddc8420696bd" +checksum = "91cf61a1868dacc576bf2b2a1c3e9ab150af7272909e80085c3173384fe11f76" dependencies = [ "async-trait", "futures-core", @@ -3738,9 +3738,9 @@ dependencies = [ [[package]] name = "opentelemetry-proto" -version = "0.26.1" +version = "0.27.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c9d3968ce3aefdcca5c27e3c4ea4391b37547726a70893aab52d3de95d5f8b34" +checksum = "a6e05acbfada5ec79023c85368af14abd0b307c015e9064d249b2a950ef459a6" dependencies = [ "opentelemetry", "opentelemetry_sdk", @@ -3750,22 +3750,21 @@ dependencies = [ [[package]] name = "opentelemetry-semantic-conventions" -version = "0.26.0" +version = "0.27.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db945c1eaea8ac6a9677185357480d215bb6999faa9f691d0c4d4d641eab7a09" +checksum = "bc1b6902ff63b32ef6c489e8048c5e253e2e4a803ea3ea7e783914536eb15c52" [[package]] name = "opentelemetry_sdk" -version = "0.26.0" +version = "0.27.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2c627d9f4c9cdc1f21a29ee4bfbd6028fcb8bcf2a857b43f3abdf72c9c862f3" +checksum = "231e9d6ceef9b0b2546ddf52335785ce41252bc7474ee8ba05bfad277be13ab8" dependencies = [ "async-trait", "futures-channel", "futures-executor", "futures-util", "glob", - "once_cell", "opentelemetry", "percent-encoding", "rand 0.8.5", @@ -3773,6 +3772,7 @@ dependencies = [ "thiserror", "tokio", 
"tokio-stream", + "tracing", ] [[package]] @@ -5181,9 +5181,9 @@ dependencies = [ [[package]] name = "reqwest-tracing" -version = "0.5.4" +version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff82cf5730a1311fb9413b0bc2b8e743e0157cd73f010ab4ec374a923873b6a2" +checksum = "73e6153390585f6961341b50e5a1931d6be6dee4292283635903c26ef9d980d2" dependencies = [ "anyhow", "async-trait", @@ -7048,9 +7048,9 @@ dependencies = [ [[package]] name = "tracing-opentelemetry" -version = "0.27.0" +version = "0.28.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc58af5d3f6c5811462cabb3289aec0093f7338e367e5a33d28c0433b3c7360b" +checksum = "97a971f6058498b5c0f1affa23e7ea202057a7301dbff68e968b2d578bcbd053" dependencies = [ "js-sys", "once_cell", diff --git a/Cargo.toml b/Cargo.toml index 39898e1c8d..55bb9bfe11 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -126,10 +126,10 @@ notify = "6.0.0" num_cpus = "1.15" num-traits = "0.2.15" once_cell = "1.13" -opentelemetry = "0.26" -opentelemetry_sdk = "0.26" -opentelemetry-otlp = { version = "0.26", default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] } -opentelemetry-semantic-conventions = "0.26" +opentelemetry = "0.27" +opentelemetry_sdk = "0.27" +opentelemetry-otlp = { version = "0.27", default-features = false, features = ["http-proto", "trace", "http", "reqwest-client"] } +opentelemetry-semantic-conventions = "0.27" parking_lot = "0.12" parquet = { version = "53", default-features = false, features = ["zstd"] } parquet_derive = "53" @@ -143,7 +143,7 @@ rand = "0.8" redis = { version = "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] } regex = "1.10.2" reqwest = { version = "0.12", default-features = false, features = ["rustls-tls"] } -reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_26"] } +reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_27"] } reqwest-middleware = "0.4" reqwest-retry = "0.7" routerify = "3" @@ -192,7 +192,7 @@ tower-http = { version = "0.6.2", features = ["request-id", "trace"] } tower-service = "0.3.3" tracing = "0.1" tracing-error = "0.2" -tracing-opentelemetry = "0.27" +tracing-opentelemetry = "0.28" tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] } try-lock = "0.2.5" twox-hash = { version = "1.6.3", default-features = false } diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 04432ad0f3..b98cf706d3 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -111,11 +111,6 @@ fn main() -> Result<()> { fn init() -> Result<(String, clap::ArgMatches)> { init_tracing_and_logging(DEFAULT_LOG_LEVEL)?; - opentelemetry::global::set_error_handler(|err| { - tracing::info!("OpenTelemetry error: {err}"); - }) - .expect("global error handler lock poisoned"); - let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?; thread::spawn(move || { for sig in signals.forever() { diff --git a/libs/tracing-utils/src/lib.rs b/libs/tracing-utils/src/lib.rs index c4aad53cdb..818d759eac 100644 --- a/libs/tracing-utils/src/lib.rs +++ b/libs/tracing-utils/src/lib.rs @@ -38,7 +38,6 @@ pub mod http; use opentelemetry::trace::TracerProvider; use opentelemetry::KeyValue; -use opentelemetry_sdk::Resource; use tracing::Subscriber; use tracing_subscriber::registry::LookupSpan; use tracing_subscriber::Layer; @@ -121,7 +120,10 @@ where S: Subscriber + for<'span> 
LookupSpan<'span>, { // Sets up exporter from the OTEL_EXPORTER_* environment variables. - let exporter = opentelemetry_otlp::new_exporter().http(); + let exporter = opentelemetry_otlp::SpanExporter::builder() + .with_http() + .build() + .expect("could not initialize opentelemetry exporter"); // TODO: opentelemetry::global::set_error_handler() with custom handler that // bypasses default tracing layers, but logs regular looking log @@ -132,17 +134,13 @@ where opentelemetry_sdk::propagation::TraceContextPropagator::new(), ); - let tracer = opentelemetry_otlp::new_pipeline() - .tracing() - .with_exporter(exporter) - .with_trace_config(opentelemetry_sdk::trace::Config::default().with_resource( - Resource::new(vec![KeyValue::new( - opentelemetry_semantic_conventions::resource::SERVICE_NAME, - service_name, - )]), - )) - .install_batch(opentelemetry_sdk::runtime::Tokio) - .expect("could not initialize opentelemetry exporter") + let tracer = opentelemetry_sdk::trace::TracerProvider::builder() + .with_batch_exporter(exporter, opentelemetry_sdk::runtime::Tokio) + .with_resource(opentelemetry_sdk::Resource::new(vec![KeyValue::new( + opentelemetry_semantic_conventions::resource::SERVICE_NAME, + service_name, + )])) + .build() .tracer("global"); tracing_opentelemetry::layer().with_tracer(tracer) From 77660f3d88d27c3a7414eee611a6192900f98fe9 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Fri, 10 Jan 2025 10:12:31 +0100 Subject: [PATCH 143/583] proxy: Fix parsing of UnknownTopic with payload (#10339) ## Problem When the proxy receives a `Notification` with an unknown topic, it's supposed to use the `UnknownTopic` unit variant. Unfortunately, in adjacently tagged enums serde does not simply ignore the configured content field if it is present: it still tries to deserialize it into the unit variant, which fails when the content is a map/object. ## Summary of changes * Use a custom deserialize function to ignore variant content. * Add a little unit test covering both cases.
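For illustration, a minimal standalone sketch of the workaround (the enum, topic names, and helper below are invented for the example and assume `serde` with the `derive` feature plus `serde_json`; they are not the proxy's actual types): with an adjacently tagged enum, serde hands the content field to the `#[serde(other)]` unit variant when it is present, so the workaround swallows it explicitly with `IgnoredAny`.

```rust
use serde::de::IgnoredAny;
use serde::{Deserialize, Deserializer};

// Accept and discard whatever the content field holds instead of insisting on a unit value.
fn ignore_content<'de, D: Deserializer<'de>>(deserializer: D) -> Result<(), D::Error> {
    deserializer.deserialize_any(IgnoredAny)?;
    Ok(())
}

#[derive(Debug, PartialEq, Deserialize)]
#[serde(tag = "topic", content = "data")]
enum Message {
    #[serde(rename = "/known")]
    Known { payload: String },
    #[serde(other, deserialize_with = "ignore_content")]
    Unknown,
}

fn main() {
    // Unknown topic *with* a payload: without the custom deserializer, serde would
    // try to parse the map as the unit variant and fail.
    let with_data = r#"{"topic": "/nope", "data": {"payload": "ignored"}}"#;
    assert_eq!(serde_json::from_str::<Message>(with_data).unwrap(), Message::Unknown);

    // Unknown topic without a payload keeps working as before.
    let without_data = r#"{"topic": "/nope"}"#;
    assert_eq!(serde_json::from_str::<Message>(without_data).unwrap(), Message::Unknown);
}
```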
--- proxy/src/redis/notifications.rs | 41 +++++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index bf9d61ded3..63cdf6176c 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -74,7 +74,11 @@ pub(crate) enum Notification { #[serde(rename = "/cancel_session")] Cancel(CancelSession), - #[serde(other, skip_serializing)] + #[serde( + other, + deserialize_with = "deserialize_unknown_topic", + skip_serializing + )] UnknownTopic, } @@ -123,6 +127,15 @@ where serde_json::from_str(&s).map_err(::custom) } +// https://github.com/serde-rs/serde/issues/1714 +fn deserialize_unknown_topic<'de, D>(deserializer: D) -> Result<(), D::Error> +where + D: serde::Deserializer<'de>, +{ + deserializer.deserialize_any(serde::de::IgnoredAny)?; + Ok(()) +} + struct MessageHandler { cache: Arc, cancellation_handler: Arc>, @@ -458,4 +471,30 @@ mod tests { Ok(()) } + + #[test] + fn parse_unknown_topic() -> anyhow::Result<()> { + let with_data = json!({ + "type": "message", + "topic": "/doesnotexist", + "data": { + "payload": "ignored" + }, + "extra_fields": "something" + }) + .to_string(); + let result: Notification = serde_json::from_str(&with_data)?; + assert_eq!(result, Notification::UnknownTopic); + + let without_data = json!({ + "type": "message", + "topic": "/doesnotexist", + "extra_fields": "something" + }) + .to_string(); + let result: Notification = serde_json::from_str(&without_data)?; + assert_eq!(result, Notification::UnknownTopic); + + Ok(()) + } } From 735c66dc65f1163591a2745934f4be766072c88c Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Fri, 10 Jan 2025 09:36:51 +0000 Subject: [PATCH 144/583] fix(proxy): propagate the existing ComputeUserInfo to connect for cancellation (#10322) ## Problem We were incorrectly constructing the ComputeUserInfo, used for cancellation checks, based on the return parameters from postgres. This didn't contain the correct info. ## Summary of changes Propagate down the existing ComputeUserInfo. 
--- proxy/src/auth/backend/console_redirect.rs | 30 +++++++++++++++++----- proxy/src/compute.rs | 27 ++----------------- proxy/src/console_redirect_proxy.rs | 5 ++-- proxy/src/control_plane/mod.rs | 5 +++- proxy/src/proxy/connect_compute.rs | 6 +++-- proxy/src/proxy/mod.rs | 13 ++++++---- 6 files changed, 45 insertions(+), 41 deletions(-) diff --git a/proxy/src/auth/backend/console_redirect.rs b/proxy/src/auth/backend/console_redirect.rs index dbfda588cc..1cbf91d3ae 100644 --- a/proxy/src/auth/backend/console_redirect.rs +++ b/proxy/src/auth/backend/console_redirect.rs @@ -1,7 +1,8 @@ +use std::fmt; + use async_trait::async_trait; use postgres_client::config::SslMode; use pq_proto::BeMessage as Be; -use std::fmt; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, info_span}; @@ -12,10 +13,13 @@ use crate::auth::IpPattern; use crate::cache::Cached; use crate::config::AuthenticationConfig; use crate::context::RequestContext; -use crate::control_plane::{self, client::cplane_proxy_v1, CachedNodeInfo, NodeInfo}; +use crate::control_plane::client::cplane_proxy_v1; +use crate::control_plane::{self, CachedNodeInfo, NodeInfo}; use crate::error::{ReportableError, UserFacingError}; use crate::proxy::connect_compute::ComputeConnectBackend; +use crate::proxy::NeonOptions; use crate::stream::PqStream; +use crate::types::RoleName; use crate::{auth, compute, waiters}; #[derive(Debug, Error)] @@ -105,10 +109,16 @@ impl ConsoleRedirectBackend { ctx: &RequestContext, auth_config: &'static AuthenticationConfig, client: &mut PqStream, - ) -> auth::Result<(ConsoleRedirectNodeInfo, Option>)> { + ) -> auth::Result<( + ConsoleRedirectNodeInfo, + ComputeUserInfo, + Option>, + )> { authenticate(ctx, auth_config, &self.console_uri, client) .await - .map(|(node_info, ip_allowlist)| (ConsoleRedirectNodeInfo(node_info), ip_allowlist)) + .map(|(node_info, user_info, ip_allowlist)| { + (ConsoleRedirectNodeInfo(node_info), user_info, ip_allowlist) + }) } } @@ -133,7 +143,7 @@ async fn authenticate( auth_config: &'static AuthenticationConfig, link_uri: &reqwest::Url, client: &mut PqStream, -) -> auth::Result<(NodeInfo, Option>)> { +) -> auth::Result<(NodeInfo, ComputeUserInfo, Option>)> { ctx.set_auth_method(crate::context::AuthMethod::ConsoleRedirect); // registering waiter can fail if we get unlucky with rng. 
@@ -188,8 +198,15 @@ async fn authenticate( let mut config = compute::ConnCfg::new(db_info.host.to_string(), db_info.port); config.dbname(&db_info.dbname).user(&db_info.user); + let user: RoleName = db_info.user.into(); + let user_info = ComputeUserInfo { + endpoint: db_info.aux.endpoint_id.as_str().into(), + user: user.clone(), + options: NeonOptions::default(), + }; + ctx.set_dbname(db_info.dbname.into()); - ctx.set_user(db_info.user.into()); + ctx.set_user(user); ctx.set_project(db_info.aux.clone()); info!("woken up a compute node"); @@ -212,6 +229,7 @@ async fn authenticate( config, aux: db_info.aux, }, + user_info, db_info.allowed_ips, )) } diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 788bd63fee..aff796bbab 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -24,10 +24,8 @@ use crate::control_plane::messages::MetricsAuxInfo; use crate::error::{ReportableError, UserFacingError}; use crate::metrics::{Metrics, NumDbConnectionsGuard}; use crate::proxy::neon_option; -use crate::proxy::NeonOptions; use crate::tls::postgres_rustls::MakeRustlsConnect; use crate::types::Host; -use crate::types::{EndpointId, RoleName}; pub const COULD_NOT_CONNECT: &str = "Couldn't connect to compute node"; @@ -253,6 +251,7 @@ impl ConnCfg { ctx: &RequestContext, aux: MetricsAuxInfo, config: &ComputeConfig, + user_info: ComputeUserInfo, ) -> Result { let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); let (socket_addr, stream, host) = self.connect_raw(config.timeout).await?; @@ -287,28 +286,6 @@ impl ConnCfg { self.0.get_ssl_mode() ); - let compute_info = match parameters.get("user") { - Some(user) => { - match parameters.get("database") { - Some(database) => { - ComputeUserInfo { - user: RoleName::from(user), - options: NeonOptions::default(), // just a shim, we don't need options - endpoint: EndpointId::from(database), - } - } - None => { - warn!("compute node didn't return database name"); - ComputeUserInfo::default() - } - } - } - None => { - warn!("compute node didn't return user name"); - ComputeUserInfo::default() - } - }; - // NB: CancelToken is supposed to hold socket_addr, but we use connect_raw. // Yet another reason to rework the connection establishing code. 
let cancel_closure = CancelClosure::new( @@ -321,7 +298,7 @@ impl ConnCfg { }, vec![], // TODO: deprecated, will be removed host.to_string(), - compute_info, + user_info, ); let connection = PostgresConnection { diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index 846f55f9e1..0c6755063f 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -195,7 +195,7 @@ pub(crate) async fn handle_client( ctx.set_db_options(params.clone()); - let (user_info, ip_allowlist) = match backend + let (node_info, user_info, ip_allowlist) = match backend .authenticate(ctx, &config.authentication_config, &mut stream) .await { @@ -208,11 +208,12 @@ pub(crate) async fn handle_client( let mut node = connect_to_compute( ctx, &TcpMechanism { + user_info, params_compat: true, params: ¶ms, locks: &config.connect_compute_locks, }, - &user_info, + &node_info, config.wake_compute_retry_config, &config.connect_to_compute, ) diff --git a/proxy/src/control_plane/mod.rs b/proxy/src/control_plane/mod.rs index c65041df0e..1dca26d686 100644 --- a/proxy/src/control_plane/mod.rs +++ b/proxy/src/control_plane/mod.rs @@ -74,8 +74,11 @@ impl NodeInfo { &self, ctx: &RequestContext, config: &ComputeConfig, + user_info: ComputeUserInfo, ) -> Result { - self.config.connect(ctx, self.aux.clone(), config).await + self.config + .connect(ctx, self.aux.clone(), config, user_info) + .await } pub(crate) fn reuse_settings(&mut self, other: Self) { diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index 8a80494860..dd145e6bb2 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -4,7 +4,7 @@ use tokio::time; use tracing::{debug, info, warn}; use super::retry::ShouldRetryWakeCompute; -use crate::auth::backend::ComputeCredentialKeys; +use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo}; use crate::compute::{self, PostgresConnection, COULD_NOT_CONNECT}; use crate::config::{ComputeConfig, RetryConfig}; use crate::context::RequestContext; @@ -71,6 +71,8 @@ pub(crate) struct TcpMechanism<'a> { /// connect_to_compute concurrency lock pub(crate) locks: &'static ApiLocks, + + pub(crate) user_info: ComputeUserInfo, } #[async_trait] @@ -88,7 +90,7 @@ impl ConnectMechanism for TcpMechanism<'_> { ) -> Result { let host = node_info.config.get_host(); let permit = self.locks.get_permit(&host).await?; - permit.release_result(node_info.connect(ctx, config).await) + permit.release_result(node_info.connect(ctx, config, self.user_info.clone()).await) } fn update_connect_config(&self, config: &mut compute::ConnCfg) { diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index 1f7dba2f9a..63f93f0a91 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -332,16 +332,19 @@ pub(crate) async fn handle_client( } }; - let params_compat = match &user_info { - auth::Backend::ControlPlane(_, info) => { - info.info.options.get(NeonOptions::PARAMS_COMPAT).is_some() - } - auth::Backend::Local(_) => false, + let compute_user_info = match &user_info { + auth::Backend::ControlPlane(_, info) => &info.info, + auth::Backend::Local(_) => unreachable!("local proxy does not run tcp proxy service"), }; + let params_compat = compute_user_info + .options + .get(NeonOptions::PARAMS_COMPAT) + .is_some(); let mut node = connect_to_compute( ctx, &TcpMechanism { + user_info: compute_user_info.clone(), params_compat, params: ¶ms, locks: &config.connect_compute_locks, From db00eb41a1f27ce5d122fbdfbf3f958e5df9b79f 
Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 10 Jan 2025 12:06:03 +0100 Subject: [PATCH 145/583] fix(spsc_fold): potentially missing wakeup when send()ing in state `SenderWaitsForReceiverToConsume` (#10318) # Problem Before this PR, there were cases where send() in state SenderWaitsForReceiverToConsume would never be woken up by the receiver, because it never registered with `wake_sender`. Example Scenario 1: we stop polling a send() future A that was waiting for the receiver to consume. We drop A and create a new send() future B. B would return Poll::Pending and never register a waker. Example Scenario 2: a send() future A transitions from HasData to SenderWaitsForReceiverToConsume. This registers the context X with `wake_sender`. But before the Receiver consumes the data, we poll A from a different context Y. The state is still SenderWaitsForReceiverToConsume, but we wouldn't register the new context with `wake_sender`. When the Receiver comes around to consume and `wake_sender.notify()`s, it wakes the old context X instead of Y. # Fix Register the waker in the case where we're polled in state `SenderWaitsForReceiverToConsume`. # Relation to #10309 I found this bug while investigating #10309. There was never proof that this bug is the root cause of #10309. In the meantime we found a more probable hypothesis for the root cause than what is being fixed here. Regardless, let's walk through my thought process about how it might have been relevant: There (in page_service), Scenario 1 does not apply because we poll the send() future to completion. Scenario 2 (`tokio::join!`) also does not apply with the current `tokio::join!()` impl, because it will just poll each future every time, each with the same context. Although if we ever used something like a FuturesUnordered anywhere, that will be using a different context, so, in that case, the bug might materialize. Regarding tokio & spurious poll in general: @conradludgate is not aware of any spurious wakeup cases in current tokio, but within a `tokio::join!()`, any wake meant for one future will poll all the futures, so that can appear as a spurious wake-up to the N-1 futures of the `tokio::join!()`. --- libs/utils/src/sync/spsc_fold.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/libs/utils/src/sync/spsc_fold.rs b/libs/utils/src/sync/spsc_fold.rs index b44f766ef0..7bad4f1068 100644 --- a/libs/utils/src/sync/spsc_fold.rs +++ b/libs/utils/src/sync/spsc_fold.rs @@ -96,7 +96,11 @@ impl Sender { } } State::SenderWaitsForReceiverToConsume(_data) => { - // Really, we shouldn't be polled until receiver has consumed and wakes us. + // SAFETY: send is single threaded due to `&mut self` requirement, + // therefore register is not concurrent. + unsafe { + self.state.wake_sender.register(cx.waker()); + } Poll::Pending } State::ReceiverGone => Poll::Ready(Err(SendError::ReceiverGone)), From 2b8ea1e76835d60b178ed07911935326085d4ac6 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Fri, 10 Jan 2025 13:14:29 +0100 Subject: [PATCH 146/583] utils: add flamegraph for heap profiles (#10223) ## Problem Unlike CPU profiles, the `/profile/heap` endpoint can't automatically generate SVG flamegraphs. This requires the user to install and use `pprof` tooling, which is unnecessary and annoying. Resolves #10203. ## Summary of changes Add `format=svg` for the `/profile/heap` route, and generate an SVG flamegraph using the `inferno` crate, similarly to what `pprof-rs` already does for CPU profiles.
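As a rough sketch of what the conversion below boils down to (assuming only the `inferno` and `anyhow` crates; the frame names and byte counts are made up, and this is not the pageserver helper itself): `inferno` consumes "collapsed stack" lines of the form `frame1;frame2 <value>` and renders them into an SVG flamegraph, which is what the new `pprof::flamegraph` function builds from a symbolized pprof profile.

```rust
fn main() -> anyhow::Result<()> {
    // Two hand-written collapsed stacks: semicolon-separated frames, then a sample value.
    let collapsed = [
        "main;handler;alloc_small 1024",
        "main;handler;alloc_large;inner 8192",
    ];

    let mut opts = inferno::flamegraph::Options::default();
    opts.title = "Heap inuse".to_string();
    opts.count_name = "bytes".to_string();

    // Render the collapsed stacks into an SVG and write it to disk.
    let mut svg = Vec::new();
    inferno::flamegraph::from_lines(&mut opts, collapsed.iter().copied(), &mut svg)?;
    std::fs::write("heap.svg", svg)?;
    Ok(())
}
```

With the route change, the same output becomes available directly via `/profile/heap?format=svg`, served as `image/svg+xml`.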
--- Cargo.lock | 77 ++++++++++++++++++++++++++++++--- Cargo.toml | 1 + libs/utils/Cargo.toml | 1 + libs/utils/src/http/endpoint.rs | 54 ++++++++++++++++------- libs/utils/src/pprof.rs | 59 ++++++++++++++++++++++++- 5 files changed, 169 insertions(+), 23 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 87a54c78ec..f727741883 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1653,6 +1653,20 @@ dependencies = [ "parking_lot_core 0.9.8", ] +[[package]] +name = "dashmap" +version = "6.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf" +dependencies = [ + "cfg-if", + "crossbeam-utils", + "hashbrown 0.14.5", + "lock_api", + "once_cell", + "parking_lot_core 0.9.8", +] + [[package]] name = "data-encoding" version = "2.4.0" @@ -1952,6 +1966,15 @@ dependencies = [ "syn 2.0.90", ] +[[package]] +name = "env_filter" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "186e05a59d4c50738528153b83b0b0194d3a29507dfec16eccd4b342903397d0" +dependencies = [ + "log", +] + [[package]] name = "env_logger" version = "0.10.2" @@ -1965,6 +1988,16 @@ dependencies = [ "termcolor", ] +[[package]] +name = "env_logger" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c012a26a7f605efc424dd53697843a72be7dc86ad2d01f7814337794a12231d" +dependencies = [ + "env_filter", + "log", +] + [[package]] name = "equator" version = "0.2.2" @@ -2948,6 +2981,28 @@ dependencies = [ "str_stack", ] +[[package]] +name = "inferno" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75a5d75fee4d36809e6b021e4b96b686e763d365ffdb03af2bd00786353f84fe" +dependencies = [ + "ahash", + "clap", + "crossbeam-channel", + "crossbeam-utils", + "dashmap 6.1.0", + "env_logger 0.11.2", + "indexmap 2.0.1", + "itoa", + "log", + "num-format", + "once_cell", + "quick-xml 0.37.1", + "rgb", + "str_stack", +] + [[package]] name = "inotify" version = "0.9.6" @@ -3155,7 +3210,7 @@ version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4644821e1c3d7a560fe13d842d13f587c07348a1a05d3a797152d41c90c56df2" dependencies = [ - "dashmap", + "dashmap 5.5.0", "hashbrown 0.13.2", ] @@ -4421,7 +4476,7 @@ dependencies = [ "bytes", "crc32c", "criterion", - "env_logger", + "env_logger 0.10.2", "log", "memoffset 0.9.0", "once_cell", @@ -4462,7 +4517,7 @@ dependencies = [ "cfg-if", "criterion", "findshlibs", - "inferno", + "inferno 0.11.21", "libc", "log", "nix 0.26.4", @@ -4688,9 +4743,9 @@ dependencies = [ "clap", "compute_api", "consumption_metrics", - "dashmap", + "dashmap 5.5.0", "ecdsa 0.16.9", - "env_logger", + "env_logger 0.10.2", "fallible-iterator", "flate2", "framed-websockets", @@ -4797,6 +4852,15 @@ dependencies = [ "serde", ] +[[package]] +name = "quick-xml" +version = "0.37.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f22f29bdff3987b4d8632ef95fd6424ec7e4e0a57e2f4fc63e489e75357f6a03" +dependencies = [ + "memchr", +] + [[package]] name = "quote" version = "1.0.37" @@ -7319,6 +7383,7 @@ dependencies = [ "hex-literal", "humantime", "hyper 0.14.30", + "inferno 0.12.0", "itertools 0.10.5", "jemalloc_pprof", "jsonwebtoken", @@ -7422,7 +7487,7 @@ dependencies = [ "anyhow", "camino-tempfile", "clap", - "env_logger", + "env_logger 0.10.2", "log", "postgres", "postgres_ffi", diff --git a/Cargo.toml b/Cargo.toml index 55bb9bfe11..a4e601bb58 100644 --- a/Cargo.toml +++ 
b/Cargo.toml @@ -110,6 +110,7 @@ hyper-util = "0.1" tokio-tungstenite = "0.21.0" indexmap = "2" indoc = "2" +inferno = "0.12.0" ipnet = "2.10.0" itertools = "0.10" itoa = "1.0.11" diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 02bf77760a..edb451a02c 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -26,6 +26,7 @@ git-version.workspace = true hex = { workspace = true, features = ["serde"] } humantime.workspace = true hyper0 = { workspace = true, features = ["full"] } +inferno.workspace = true itertools.workspace = true fail.workspace = true futures = { workspace = true } diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index 9b37b69939..4b4aa88d6b 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -417,6 +417,7 @@ pub async fn profile_heap_handler(req: Request) -> Result, enum Format { Jemalloc, Pprof, + Svg, } // Parameters. @@ -424,9 +425,24 @@ pub async fn profile_heap_handler(req: Request) -> Result, None => Format::Pprof, Some("jemalloc") => Format::Jemalloc, Some("pprof") => Format::Pprof, + Some("svg") => Format::Svg, Some(format) => return Err(ApiError::BadRequest(anyhow!("invalid format {format}"))), }; + // Functions and mappings to strip when symbolizing pprof profiles. If true, + // also remove child frames. + static STRIP_FUNCTIONS: Lazy> = Lazy::new(|| { + vec![ + (Regex::new("^__rust").unwrap(), false), + (Regex::new("^_start$").unwrap(), false), + (Regex::new("^irallocx_prof").unwrap(), true), + (Regex::new("^prof_alloc_prep").unwrap(), true), + (Regex::new("^std::rt::lang_start").unwrap(), false), + (Regex::new("^std::sys::backtrace::__rust").unwrap(), false), + ] + }); + const STRIP_MAPPINGS: &[&str] = &["libc", "libgcc", "pthread", "vdso"]; + // Obtain profiler handle. let mut prof_ctl = jemalloc_pprof::PROF_CTL .as_ref() @@ -464,24 +480,9 @@ pub async fn profile_heap_handler(req: Request) -> Result, // Symbolize the profile. // TODO: consider moving this upstream to jemalloc_pprof and avoiding the // serialization roundtrip. - static STRIP_FUNCTIONS: Lazy> = Lazy::new(|| { - // Functions to strip from profiles. If true, also remove child frames. - vec![ - (Regex::new("^__rust").unwrap(), false), - (Regex::new("^_start$").unwrap(), false), - (Regex::new("^irallocx_prof").unwrap(), true), - (Regex::new("^prof_alloc_prep").unwrap(), true), - (Regex::new("^std::rt::lang_start").unwrap(), false), - (Regex::new("^std::sys::backtrace::__rust").unwrap(), false), - ] - }); let profile = pprof::decode(&bytes)?; let profile = pprof::symbolize(profile)?; - let profile = pprof::strip_locations( - profile, - &["libc", "libgcc", "pthread", "vdso"], - &STRIP_FUNCTIONS, - ); + let profile = pprof::strip_locations(profile, STRIP_MAPPINGS, &STRIP_FUNCTIONS); pprof::encode(&profile) }) .await @@ -494,6 +495,27 @@ pub async fn profile_heap_handler(req: Request) -> Result, .body(Body::from(data)) .map_err(|err| ApiError::InternalServerError(err.into())) } + + Format::Svg => { + let body = tokio::task::spawn_blocking(move || { + let bytes = prof_ctl.dump_pprof()?; + let profile = pprof::decode(&bytes)?; + let profile = pprof::symbolize(profile)?; + let profile = pprof::strip_locations(profile, STRIP_MAPPINGS, &STRIP_FUNCTIONS); + let mut opts = inferno::flamegraph::Options::default(); + opts.title = "Heap inuse".to_string(); + opts.count_name = "bytes".to_string(); + pprof::flamegraph(profile, &mut opts) + }) + .await + .map_err(|join_err| ApiError::InternalServerError(join_err.into()))? 
+ .map_err(ApiError::InternalServerError)?; + Response::builder() + .status(200) + .header(CONTENT_TYPE, "image/svg+xml") + .body(Body::from(body)) + .map_err(|err| ApiError::InternalServerError(err.into())) + } } } diff --git a/libs/utils/src/pprof.rs b/libs/utils/src/pprof.rs index 90910897bf..dd57f9ed4b 100644 --- a/libs/utils/src/pprof.rs +++ b/libs/utils/src/pprof.rs @@ -1,8 +1,9 @@ +use anyhow::bail; use flate2::write::{GzDecoder, GzEncoder}; use flate2::Compression; use itertools::Itertools as _; use once_cell::sync::Lazy; -use pprof::protos::{Function, Line, Message as _, Profile}; +use pprof::protos::{Function, Line, Location, Message as _, Profile}; use regex::Regex; use std::borrow::Cow; @@ -188,3 +189,59 @@ pub fn strip_locations( profile } + +/// Generates an SVG flamegraph from a symbolized pprof profile. +pub fn flamegraph( + profile: Profile, + opts: &mut inferno::flamegraph::Options, +) -> anyhow::Result> { + if profile.mapping.iter().any(|m| !m.has_functions) { + bail!("profile not symbolized"); + } + + // Index locations, functions, and strings. + let locations: HashMap = + profile.location.into_iter().map(|l| (l.id, l)).collect(); + let functions: HashMap = + profile.function.into_iter().map(|f| (f.id, f)).collect(); + let strings = profile.string_table; + + // Resolve stacks as function names, and sum sample values per stack. Also reverse the stack, + // since inferno expects it bottom-up. + let mut stacks: HashMap, i64> = HashMap::new(); + for sample in profile.sample { + let mut stack = Vec::with_capacity(sample.location_id.len()); + for location in sample.location_id.into_iter().rev() { + let Some(location) = locations.get(&location) else { + bail!("missing location {location}"); + }; + for line in location.line.iter().rev() { + let Some(function) = functions.get(&line.function_id) else { + bail!("missing function {}", line.function_id); + }; + let Some(name) = strings.get(function.name as usize) else { + bail!("missing string {}", function.name); + }; + stack.push(name.as_str()); + } + } + let Some(&value) = sample.value.first() else { + bail!("missing value"); + }; + *stacks.entry(stack).or_default() += value; + } + + // Construct stack lines for inferno. + let lines = stacks + .into_iter() + .map(|(stack, value)| (stack.into_iter().join(";"), value)) + .map(|(stack, value)| format!("{stack} {value}")) + .sorted() + .collect_vec(); + + // Construct the flamegraph. + let mut bytes = Vec::new(); + let lines = lines.iter().map(|line| line.as_str()); + inferno::flamegraph::from_lines(opts, lines, &mut bytes)?; + Ok(bytes) +} From 0d4fce2d358b3c63b2ac77dac7acd2d42944f3fb Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 10 Jan 2025 13:57:23 +0000 Subject: [PATCH 147/583] tests: refine how compat snapshot is generated (#10342) ## Problem I noticed in https://github.com/neondatabase/neon/pull/9537 that tests which work with compat snapshots were writing several hundred MB of data, which isn't really necessary. Also, the snapshots are large but don't have the proper variety of storage format features, e.g. they could just have L0 deltas. 
## Summary of changes - Use smaller scale factor and runtime to generate less data - Configure a small layer size and use force image layer generation so that our output contains L1 deltas and image layers, and has a decent number of entries in the layer map --- test_runner/regress/test_compatibility.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index ba7305148f..a6eaaf6c4c 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -141,11 +141,18 @@ def test_create_snapshot( neon_env_builder.num_safekeepers = 3 neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) - env = neon_env_builder.init_start() + env = neon_env_builder.init_start( + initial_tenant_conf={ + # Miniature layers to enable generating non-trivial layer map without writing lots of data + "checkpoint_distance": f"{128 * 1024}", + "compaction_threshold": "1", + "compaction_target_size": f"{128 * 1024}", + } + ) endpoint = env.endpoints.create_start("main") - pg_bin.run_capture(["pgbench", "--initialize", "--scale=10", endpoint.connstr()]) - pg_bin.run_capture(["pgbench", "--time=60", "--progress=2", endpoint.connstr()]) + pg_bin.run_capture(["pgbench", "--initialize", "--scale=1", endpoint.connstr()]) + pg_bin.run_capture(["pgbench", "--time=30", "--progress=2", endpoint.connstr()]) pg_bin.run_capture( ["pg_dumpall", f"--dbname={endpoint.connstr()}", f"--file={test_output_dir / 'dump.sql'}"] ) @@ -157,7 +164,9 @@ def test_create_snapshot( pageserver_http = env.pageserver.http_client() flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id) - pageserver_http.timeline_checkpoint(tenant_id, timeline_id, wait_until_uploaded=True) + pageserver_http.timeline_checkpoint( + tenant_id, timeline_id, wait_until_uploaded=True, force_image_layer_creation=True + ) env.endpoints.stop_all() for sk in env.safekeepers: From 105f66c4ce760477e774382ee3f1f6945fb2a8a3 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 10 Jan 2025 13:57:26 +0000 Subject: [PATCH 148/583] tests: move test_parallel_copy into performance tree (#10343) ## Problem This test writes ~5GB of data. It is not suitable to run in parallel with all the other small tests in test_runner/regress. via #9537 ## Summary of changes - Move test_parallel_copy into the performance directory, so that it does not run in parallel with other tests --- test_runner/{regress => performance}/test_parallel_copy.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename test_runner/{regress => performance}/test_parallel_copy.py (100%) diff --git a/test_runner/regress/test_parallel_copy.py b/test_runner/performance/test_parallel_copy.py similarity index 100% rename from test_runner/regress/test_parallel_copy.py rename to test_runner/performance/test_parallel_copy.py From 71bca6f58045966fd1c630e685e2d6ea3b7bbbd6 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Fri, 10 Jan 2025 15:32:26 +0100 Subject: [PATCH 149/583] poetry: Update packaging for poetry v2 (#10344) ## Problem When poetry v2 (released Jan 5) is used it needs `packaging.metadata` module, but we downgrade `packaging` to 23.0. `packaging==23.1` introduced the metadata submodule. ## Summary of changes Update `packaging` to 24.2. 
--- poetry.lock | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/poetry.lock b/poetry.lock index 5f15223dca..2cd2bc6383 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2028,13 +2028,13 @@ openapi-schema-validator = ">=0.4.2,<0.5.0" [[package]] name = "packaging" -version = "23.0" +version = "24.2" description = "Core utilities for Python packages" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "packaging-23.0-py3-none-any.whl", hash = "sha256:714ac14496c3e68c99c29b00845f7a2b85f3bb6f1078fd9f72fd20f0570002b2"}, - {file = "packaging-23.0.tar.gz", hash = "sha256:b6ad297f8907de0fa2fe1ccbd26fdaf387f5f47c7275fedf8cce89f99446cf97"}, + {file = "packaging-24.2-py3-none-any.whl", hash = "sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759"}, + {file = "packaging-24.2.tar.gz", hash = "sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f"}, ] [[package]] From 4398051385affb0be87418543b6e3f3f08220b5e Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 10 Jan 2025 15:53:23 +0000 Subject: [PATCH 150/583] tests: smaller datasets in LFC tests (#10346) ## Problem These two tests came up in #9537 as doing multi-gigabyte I/O, and from inspection of the tests it doesn't seem like they need that to fulfil their purpose. ## Summary of changes - In test_local_file_cache_unlink, run fewer background threads with a smaller number of rows. These background threads AFAICT exist to make sure some I/O is going on while we unlink the LFC directory, but 5 threads should be enough for "some". - In test_lfc_resize, tweak the test to validate that the cache size is larger than the final size before resizing it, so that we're sure we're writing enough data to really be doing something. Then decrease the pgbench scale. --- test_runner/regress/test_lfc_resize.py | 31 ++++++++++++++------ test_runner/regress/test_local_file_cache.py | 4 +-- 2 files changed, 24 insertions(+), 11 deletions(-) diff --git a/test_runner/regress/test_lfc_resize.py b/test_runner/regress/test_lfc_resize.py index 377b0fb4d4..8762e6525b 100644 --- a/test_runner/regress/test_lfc_resize.py +++ b/test_runner/regress/test_lfc_resize.py @@ -30,7 +30,7 @@ def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin): ], ) n_resize = 10 - scale = 100 + scale = 20 def run_pgbench(connstr: str): log.info(f"Start a pgbench workload on pg {connstr}") @@ -46,17 +46,36 @@ def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin): conn = endpoint.connect() cur = conn.cursor() + def get_lfc_size() -> tuple[int, int]: + lfc_file_path = endpoint.lfc_path() + lfc_file_size = lfc_file_path.stat().st_size + res = subprocess.run( + ["ls", "-sk", lfc_file_path], check=True, text=True, capture_output=True + ) + lfc_file_blocks = re.findall("([0-9A-F]+)", res.stdout)[0] + log.info(f"Size of LFC file {lfc_file_size}, blocks {lfc_file_blocks}") + + return (lfc_file_size, lfc_file_blocks) + # For as long as pgbench is running, twiddle the LFC size once a second. # Note that we launch this immediately, already while the "pgbench -i" # initialization step is still running. That's quite a different workload # than the actual pgbench benchamark run, so this gives us coverage of both. while thread.is_alive(): - size = random.randint(1, 512) + # Vary the LFC size randomly within a range above what we will later + # decrease it to. This should ensure that the final size decrease + # is really doing something. 
+ size = random.randint(192, 512) cur.execute(f"alter system set neon.file_cache_size_limit='{size}MB'") cur.execute("select pg_reload_conf()") time.sleep(1) + thread.join() + # Before shrinking the cache, check that it really is large now + (lfc_file_size, lfc_file_blocks) = get_lfc_size() + assert int(lfc_file_blocks) > 128 * 1024 + # At the end, set it at 100 MB, and perform a final check that the disk usage # of the file is in that ballbark. # @@ -66,13 +85,7 @@ def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin): cur.execute("select pg_reload_conf()") nretries = 10 while True: - lfc_file_path = endpoint.lfc_path() - lfc_file_size = lfc_file_path.stat().st_size - res = subprocess.run( - ["ls", "-sk", lfc_file_path], check=True, text=True, capture_output=True - ) - lfc_file_blocks = re.findall("([0-9A-F]+)", res.stdout)[0] - log.info(f"Size of LFC file {lfc_file_size}, blocks {lfc_file_blocks}") + (lfc_file_size, lfc_file_blocks) = get_lfc_size() assert lfc_file_size <= 512 * 1024 * 1024 if int(lfc_file_blocks) <= 128 * 1024 or nretries == 0: diff --git a/test_runner/regress/test_local_file_cache.py b/test_runner/regress/test_local_file_cache.py index 94c630ffcf..21c9e97a42 100644 --- a/test_runner/regress/test_local_file_cache.py +++ b/test_runner/regress/test_local_file_cache.py @@ -29,8 +29,8 @@ def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder): cur = endpoint.connect().cursor() stop = threading.Event() - n_rows = 100000 - n_threads = 20 + n_rows = 10000 + n_threads = 5 n_updates_per_connection = 1000 cur.execute("CREATE TABLE lfctest (id int4 PRIMARY KEY, n int) WITH (fillfactor=10)") From 425b77784045ed20ec1b7a2c58869aa4118e1a44 Mon Sep 17 00:00:00 2001 From: Cheng Chen Date: Fri, 10 Jan 2025 08:38:13 -0800 Subject: [PATCH 151/583] chore(compute): pg_mooncake v0.1.0 (#10337) ## Problem Upgrade pg_mooncake to v0.1.0 ## Summary of changes --- compute/compute-node.Dockerfile | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 06aaf9e7f4..303daec240 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1167,22 +1167,13 @@ FROM rust-extensions-build AS pg-mooncake-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -# The topmost commit in the `neon` branch at the time of writing this -# https://github.com/Mooncake-Labs/pg_mooncake/commits/neon/ -# https://github.com/Mooncake-Labs/pg_mooncake/commit/077c92c452bb6896a7b7776ee95f039984f076af -ENV PG_MOONCAKE_VERSION=077c92c452bb6896a7b7776ee95f039984f076af ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN case "${PG_VERSION}" in \ - 'v14') \ - echo "pg_mooncake is not supported on Postgres ${PG_VERSION}" && exit 0;; \ - esac && \ - git clone --depth 1 --branch neon https://github.com/Mooncake-Labs/pg_mooncake.git pg_mooncake-src && \ - cd pg_mooncake-src && \ - git checkout "${PG_MOONCAKE_VERSION}" && \ - git submodule update --init --depth 1 --recursive && \ - make BUILD_TYPE=release -j $(getconf _NPROCESSORS_ONLN) && \ - make BUILD_TYPE=release -j $(getconf _NPROCESSORS_ONLN) install && \ +RUN wget https://github.com/Mooncake-Labs/pg_mooncake/releases/download/v0.1.0/pg_mooncake-0.1.0.tar.gz -O pg_mooncake.tar.gz && \ + echo "eafd059b77f541f11525eb8affcd66a176968cbd8fe7c0d436e733f2aa4da59f pg_mooncake.tar.gz" | sha256sum --check && \ + mkdir pg_mooncake-src && cd pg_mooncake-src && tar xzf ../pg_mooncake.tar.gz --strip-components=1 -C . 
&& \ + make release -j $(getconf _NPROCESSORS_ONLN) && \ + make install -j $(getconf _NPROCESSORS_ONLN) && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_mooncake.control ######################################################################################### From 3cd5034eac3677223d8e2dacb772925fd22e0136 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Fri, 10 Jan 2025 18:35:16 +0000 Subject: [PATCH 152/583] storcon: don't assume host:port format in storcon client (#10347) ## Problem https://github.com/neondatabase/infra/pull/2725 updated the scrubber to use a non-host port endpoint for storcon. That breaks when unwrapping the port. ## Summary of changes Support both `host:port` and `host` formats for the storcon api. --- storage_controller/client/src/control_api.rs | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/storage_controller/client/src/control_api.rs b/storage_controller/client/src/control_api.rs index a981b5020e..f8a2790769 100644 --- a/storage_controller/client/src/control_api.rs +++ b/storage_controller/client/src/control_api.rs @@ -1,7 +1,6 @@ use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt}; use reqwest::{Method, Url}; use serde::{de::DeserializeOwned, Serialize}; -use std::str::FromStr; pub struct Client { base_url: Url, @@ -31,16 +30,11 @@ impl Client { RQ: Serialize + Sized, RS: DeserializeOwned + Sized, { - // The configured URL has the /upcall path prefix for pageservers to use: we will strip that out - // for general purpose API access. - let url = Url::from_str(&format!( - "http://{}:{}/{path}", - self.base_url.host_str().unwrap(), - self.base_url.port().unwrap() - )) - .unwrap(); - - let mut builder = self.client.request(method, url); + let request_path = self + .base_url + .join(&path) + .expect("Failed to build request path"); + let mut builder = self.client.request(method, request_path); if let Some(body) = body { builder = builder.json(&body) } From cdd34dfc1270b275646be524421b38ecf5d36665 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 10 Jan 2025 20:19:48 +0100 Subject: [PATCH 153/583] impr(utils/spsc_fold): add another test case (#10319) Wondered about the case covered here while investigating #10309. --- libs/utils/src/sync/spsc_fold.rs | 34 ++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/libs/utils/src/sync/spsc_fold.rs b/libs/utils/src/sync/spsc_fold.rs index 7bad4f1068..0cab291d51 100644 --- a/libs/utils/src/sync/spsc_fold.rs +++ b/libs/utils/src/sync/spsc_fold.rs @@ -453,4 +453,38 @@ mod tests { let err = recv_task.await.unwrap().expect_err("should error"); assert!(matches!(err, RecvError::SenderGone)); } + + #[tokio::test(start_paused = true)] + async fn test_receiver_drop_while_waiting_for_receiver_to_consume_unblocks_sender() { + let (mut sender, receiver) = channel(); + + let state = receiver.state.clone(); + + sender.send((), |_, _| unreachable!()).await.unwrap(); + + assert!(matches!(&*state.value.lock().unwrap(), &State::HasData(_))); + + let unmergeable = sender.send((), |_, _| Err(())); + let mut unmergeable = std::pin::pin!(unmergeable); + tokio::select! 
{ + _ = tokio::time::sleep(FOREVER) => {}, + _ = &mut unmergeable => { + panic!("unmergeable should not complete"); + }, + } + + assert!(matches!( + &*state.value.lock().unwrap(), + &State::SenderWaitsForReceiverToConsume(_) + )); + + drop(receiver); + + assert!(matches!( + &*state.value.lock().unwrap(), + &State::ReceiverGone + )); + + unmergeable.await.unwrap_err(); + } } From 9b43204893900a890ef29f571a1e1a77e49d2e22 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 10 Jan 2025 20:21:01 +0100 Subject: [PATCH 154/583] fix(page_service): Timeline::gate held open while throttling (#10314) When we moved throttling up from Timeline::get into page_service, we stopped being sensitive to `Timeline::cancel`, even though we're holding a Handle and thus a guard on the `Timeline::gate` open. This PR rectifies the situation. Refs - Found while investigating #10309 (hung detach because gate kept open), but not expected to be the root cause of that issue because the affected tenants are not being throttled according to their metrics. --- pageserver/src/page_service.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 0c4a1b18f5..f6504bd3b5 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -618,6 +618,9 @@ impl BatchedFeMessage { }; let throttled = tokio::select! { throttled = shard.pagestream_throttle.throttle(tokens) => { throttled } + _ = shard.cancel.cancelled() => { + return Err(QueryError::Shutdown); + } _ = cancel.cancelled() => { return Err(QueryError::Shutdown); } From 58332cb3615954bedb317bbbf311df47310d4a03 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 10 Jan 2025 21:35:50 +0100 Subject: [PATCH 155/583] pageserver: remove unused metric `pageserver_layers_visited_per_read_global` (#10141) As of commit "pageserver: remove legacy read path" (#8601) we always use vectored get, which has a separate metric. --- pageserver/src/metrics.rs | 10 ---------- test_runner/fixtures/metrics.py | 1 - test_runner/regress/test_compaction.py | 12 +----------- 3 files changed, 1 insertion(+), 22 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index a313a64080..5b8419fda9 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -91,15 +91,6 @@ pub(crate) static STORAGE_TIME_GLOBAL: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub(crate) static READ_NUM_LAYERS_VISITED: Lazy = Lazy::new(|| { - register_histogram!( - "pageserver_layers_visited_per_read_global", - "Number of layers visited to reconstruct one key", - vec![1.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0], - ) - .expect("failed to define a metric") -}); - pub(crate) static VEC_READ_NUM_LAYERS_VISITED: Lazy = Lazy::new(|| { register_histogram!( "pageserver_layers_visited_per_vectored_read_global", @@ -3894,7 +3885,6 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) { // histograms [ - &READ_NUM_LAYERS_VISITED, &VEC_READ_NUM_LAYERS_VISITED, &WAIT_LSN_TIME, &WAL_REDO_TIME, diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index c5295360c3..fa541bad17 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -131,7 +131,6 @@ PAGESERVER_GLOBAL_METRICS: tuple[str, ...] 
= ( "pageserver_getpage_reconstruct_seconds_sum", *[f"pageserver_basebackup_query_seconds_{x}" for x in ["bucket", "count", "sum"]], *histogram("pageserver_smgr_query_seconds_global"), - *histogram("pageserver_layers_visited_per_read_global"), *histogram("pageserver_getpage_get_reconstruct_data_seconds"), *histogram("pageserver_wait_lsn_seconds"), *histogram("pageserver_remote_operation_seconds"), diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index ae48a8fc27..fe0422088a 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -84,9 +84,6 @@ page_cache_size=10 log.info("Checking layer access metrics ...") layer_access_metric_names = [ - "pageserver_layers_visited_per_read_global_sum", - "pageserver_layers_visited_per_read_global_count", - "pageserver_layers_visited_per_read_global_bucket", "pageserver_layers_visited_per_vectored_read_global_sum", "pageserver_layers_visited_per_vectored_read_global_count", "pageserver_layers_visited_per_vectored_read_global_bucket", @@ -97,12 +94,6 @@ page_cache_size=10 layer_access_metrics = metrics.query_all(name) log.info(f"Got metrics: {layer_access_metrics}") - non_vectored_sum = metrics.query_one("pageserver_layers_visited_per_read_global_sum") - non_vectored_count = metrics.query_one("pageserver_layers_visited_per_read_global_count") - if non_vectored_count.value != 0: - non_vectored_average = non_vectored_sum.value / non_vectored_count.value - else: - non_vectored_average = 0 vectored_sum = metrics.query_one("pageserver_layers_visited_per_vectored_read_global_sum") vectored_count = metrics.query_one("pageserver_layers_visited_per_vectored_read_global_count") if vectored_count.value > 0: @@ -113,11 +104,10 @@ page_cache_size=10 assert vectored_sum.value == 0 vectored_average = 0 - log.info(f"{non_vectored_average=} {vectored_average=}") + log.info(f"{vectored_average=}") # The upper bound for average number of layer visits below (8) # was chosen empirically for this workload. - assert non_vectored_average < 8 assert vectored_average < 8 From b5d54ba52a92f263c087eff56ab7ced9499c1ae6 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Fri, 10 Jan 2025 15:53:00 -0500 Subject: [PATCH 156/583] refactor(pageserver): move queue logic to compaction.rs (#10330) ## Problem close https://github.com/neondatabase/neon/issues/10031, part of https://github.com/neondatabase/neon/issues/9114 ## Summary of changes Move the compaction job generation to `compaction.rs`, thus making the code more readable and debuggable. We now also return running job through the get compaction job API, versus before we only return scheduled jobs. 
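To make the new structure easier to follow before reading the full diff, here is a stripped-down sketch of the queue shape this refactor introduces (names are simplified and the bookkeeping for notification channels, gc guards, and sub-compaction splitting is omitted; this is not the actual `GcCompactionQueue`): one slot for the currently running job plus a FIFO of pending jobs, each with a stable id, so the compaction-info API can report both.

```rust
use std::collections::VecDeque;

struct Job {
    id: usize,
    sub_compaction: bool,
}

#[derive(Default)]
struct Queue {
    running: Option<Job>,
    queued: VecDeque<Job>,
    last_id: usize,
}

impl Queue {
    /// Append a job and hand back its id so callers can track or wait on it.
    fn schedule(&mut self, sub_compaction: bool) -> usize {
        self.last_id += 1;
        self.queued.push_back(Job { id: self.last_id, sub_compaction });
        self.last_id
    }

    /// One iteration of the compaction loop: take the next job, run it, and
    /// report whether more work is still pending.
    fn iteration(&mut self, run: impl FnOnce(&Job)) -> bool {
        if let Some(job) = self.queued.pop_front() {
            self.running = Some(job);
            run(self.running.as_ref().unwrap());
            self.running = None;
        }
        !self.queued.is_empty()
    }

    /// Expose both the running job and the queued ones; reporting the running
    /// job too is the user-visible change described above.
    fn remaining(&self) -> (Option<&Job>, &VecDeque<Job>) {
        (self.running.as_ref(), &self.queued)
    }
}

fn main() {
    let mut queue = Queue::default();
    queue.schedule(false);
    queue.schedule(true);
    let more = queue.iteration(|job| {
        println!("running job {} (sub_compaction={})", job.id, job.sub_compaction);
    });
    assert!(more);
    let (running, queued) = queue.remaining();
    assert!(running.is_none() && queued.len() == 1);
}
```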
--------- Signed-off-by: Alex Chi Z --- libs/pageserver_api/src/models.rs | 2 + pageserver/src/http/routes.rs | 14 +- pageserver/src/tenant.rs | 175 +++-------- pageserver/src/tenant/timeline/compaction.rs | 293 ++++++++++++++++++- 4 files changed, 324 insertions(+), 160 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 39390d7647..9af6c4021d 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -272,6 +272,8 @@ pub struct CompactInfoResponse { pub compact_key_range: Option, pub compact_lsn_range: Option, pub sub_compaction: bool, + pub running: bool, + pub job_id: usize, } #[derive(Serialize, Deserialize, Clone)] diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 60ef4c3702..94e0b101bd 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -97,8 +97,8 @@ use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError}; use crate::DEFAULT_PG_VERSION; use crate::{disk_usage_eviction_task, tenant}; use pageserver_api::models::{ - CompactInfoResponse, StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest, - TimelineGcRequest, TimelineInfo, + StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest, TimelineGcRequest, + TimelineInfo, }; use utils::{ auth::SwappableJwtAuth, @@ -2052,15 +2052,7 @@ async fn timeline_compact_info_handler( let tenant = state .tenant_manager .get_attached_tenant_shard(tenant_shard_id)?; - let res = tenant.get_scheduled_compaction_tasks(timeline_id); - let mut resp = Vec::new(); - for item in res { - resp.push(CompactInfoResponse { - compact_key_range: item.compact_key_range, - compact_lsn_range: item.compact_lsn_range, - sub_compaction: item.sub_compaction, - }); - } + let resp = tenant.get_scheduled_compaction_tasks(timeline_id); json_response(StatusCode::OK, resp) } .instrument(info_span!("timeline_compact_info", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 8e61d09de7..2928c435cb 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -21,6 +21,7 @@ use enumset::EnumSet; use futures::stream::FuturesUnordered; use futures::StreamExt; use pageserver_api::models; +use pageserver_api::models::CompactInfoResponse; use pageserver_api::models::LsnLease; use pageserver_api::models::TimelineArchivalState; use pageserver_api::models::TimelineState; @@ -37,21 +38,17 @@ use remote_timeline_client::manifest::{ }; use remote_timeline_client::UploadQueueNotReadyError; use std::collections::BTreeMap; -use std::collections::VecDeque; use std::fmt; use std::future::Future; use std::sync::atomic::AtomicBool; use std::sync::Weak; use std::time::SystemTime; use storage_broker::BrokerClientChannel; -use timeline::compaction::GcCompactJob; -use timeline::compaction::ScheduledCompactionTask; +use timeline::compaction::GcCompactionQueue; use timeline::import_pgdata; use timeline::offload::offload_timeline; use timeline::offload::OffloadError; -use timeline::CompactFlags; use timeline::CompactOptions; -use timeline::CompactionError; use timeline::ShutdownMode; use tokio::io::BufReader; use tokio::sync::watch; @@ -347,10 +344,8 @@ pub struct Tenant { /// Overhead of mutex is acceptable because compaction is done with a multi-second period. compaction_circuit_breaker: std::sync::Mutex, - /// Scheduled compaction tasks. 
Currently, this can only be populated by triggering - /// a manual gc-compaction from the manual compaction API. - scheduled_compaction_tasks: - std::sync::Mutex>>, + /// Scheduled gc-compaction tasks. + scheduled_compaction_tasks: std::sync::Mutex>>, /// If the tenant is in Activating state, notify this to encourage it /// to proceed to Active as soon as possible, rather than waiting for lazy @@ -2997,104 +2992,18 @@ impl Tenant { if has_pending_l0_compaction_task { Some(true) } else { - let mut has_pending_scheduled_compaction_task; - let next_scheduled_compaction_task = { - let mut guard = self.scheduled_compaction_tasks.lock().unwrap(); - if let Some(tline_pending_tasks) = guard.get_mut(timeline_id) { - if !tline_pending_tasks.is_empty() { - info!( - "{} tasks left in the compaction schedule queue", - tline_pending_tasks.len() - ); - } - let next_task = tline_pending_tasks.pop_front(); - has_pending_scheduled_compaction_task = !tline_pending_tasks.is_empty(); - next_task - } else { - has_pending_scheduled_compaction_task = false; - None - } + let queue = { + let guard = self.scheduled_compaction_tasks.lock().unwrap(); + guard.get(timeline_id).cloned() }; - if let Some(mut next_scheduled_compaction_task) = next_scheduled_compaction_task - { - if !next_scheduled_compaction_task - .options - .flags - .contains(CompactFlags::EnhancedGcBottomMostCompaction) - { - warn!("ignoring scheduled compaction task: scheduled task must be gc compaction: {:?}", next_scheduled_compaction_task.options); - } else if next_scheduled_compaction_task.options.sub_compaction { - info!("running scheduled enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs"); - let jobs: Vec = timeline - .gc_compaction_split_jobs( - GcCompactJob::from_compact_options( - next_scheduled_compaction_task.options.clone(), - ), - next_scheduled_compaction_task - .options - .sub_compaction_max_job_size_mb, - ) - .await - .map_err(CompactionError::Other)?; - if jobs.is_empty() { - info!("no jobs to run, skipping scheduled compaction task"); - } else { - has_pending_scheduled_compaction_task = true; - let jobs_len = jobs.len(); - let mut guard = self.scheduled_compaction_tasks.lock().unwrap(); - let tline_pending_tasks = guard.entry(*timeline_id).or_default(); - for (idx, job) in jobs.into_iter().enumerate() { - // Unfortunately we need to convert the `GcCompactJob` back to `CompactionOptions` - // until we do further refactors to allow directly call `compact_with_gc`. 
- let mut flags: EnumSet = EnumSet::default(); - flags |= CompactFlags::EnhancedGcBottomMostCompaction; - if job.dry_run { - flags |= CompactFlags::DryRun; - } - let options = CompactOptions { - flags, - sub_compaction: false, - compact_key_range: Some(job.compact_key_range.into()), - compact_lsn_range: Some(job.compact_lsn_range.into()), - sub_compaction_max_job_size_mb: None, - }; - tline_pending_tasks.push_back(if idx == jobs_len - 1 { - ScheduledCompactionTask { - options, - // The last job in the queue sends the signal and releases the gc guard - result_tx: next_scheduled_compaction_task - .result_tx - .take(), - gc_block: next_scheduled_compaction_task - .gc_block - .take(), - } - } else { - ScheduledCompactionTask { - options, - result_tx: None, - gc_block: None, - } - }); - } - info!("scheduled enhanced gc bottom-most compaction with sub-compaction, split into {} jobs", jobs_len); - } - } else { - let _ = timeline - .compact_with_options( - cancel, - next_scheduled_compaction_task.options, - ctx, - ) - .instrument(info_span!("scheduled_compact_timeline", %timeline_id)) - .await?; - if let Some(tx) = next_scheduled_compaction_task.result_tx.take() { - // TODO: we can send compaction statistics in the future - tx.send(()).ok(); - } - } + if let Some(queue) = queue { + let has_pending_tasks = queue + .iteration(cancel, ctx, &self.gc_block, timeline) + .await?; + Some(has_pending_tasks) + } else { + Some(false) } - Some(has_pending_scheduled_compaction_task) } } else { None @@ -3124,34 +3033,32 @@ impl Tenant { } /// Cancel scheduled compaction tasks - pub(crate) fn cancel_scheduled_compaction( - &self, - timeline_id: TimelineId, - ) -> Vec { + pub(crate) fn cancel_scheduled_compaction(&self, timeline_id: TimelineId) { let mut guard = self.scheduled_compaction_tasks.lock().unwrap(); - if let Some(tline_pending_tasks) = guard.get_mut(&timeline_id) { - let current_tline_pending_tasks = std::mem::take(tline_pending_tasks); - current_tline_pending_tasks.into_iter().collect() - } else { - Vec::new() + if let Some(q) = guard.get_mut(&timeline_id) { + q.cancel_scheduled(); } } pub(crate) fn get_scheduled_compaction_tasks( &self, timeline_id: TimelineId, - ) -> Vec { - use itertools::Itertools; - let guard = self.scheduled_compaction_tasks.lock().unwrap(); - guard - .get(&timeline_id) - .map(|tline_pending_tasks| { - tline_pending_tasks - .iter() - .map(|x| x.options.clone()) - .collect_vec() - }) - .unwrap_or_default() + ) -> Vec { + let res = { + let guard = self.scheduled_compaction_tasks.lock().unwrap(); + guard.get(&timeline_id).map(|q| q.remaining_jobs()) + }; + let Some((running, remaining)) = res else { + return Vec::new(); + }; + let mut result = Vec::new(); + if let Some((id, running)) = running { + result.extend(running.into_compact_info_resp(id, true)); + } + for (id, job) in remaining { + result.extend(job.into_compact_info_resp(id, false)); + } + result } /// Schedule a compaction task for a timeline. 
@@ -3160,20 +3067,12 @@ impl Tenant { timeline_id: TimelineId, options: CompactOptions, ) -> anyhow::Result> { - let gc_guard = match self.gc_block.start().await { - Ok(guard) => guard, - Err(e) => { - bail!("cannot run gc-compaction because gc is blocked: {}", e); - } - }; let (tx, rx) = tokio::sync::oneshot::channel(); let mut guard = self.scheduled_compaction_tasks.lock().unwrap(); - let tline_pending_tasks = guard.entry(timeline_id).or_default(); - tline_pending_tasks.push_back(ScheduledCompactionTask { - options, - result_tx: Some(tx), - gc_block: Some(gc_guard), - }); + let q = guard + .entry(timeline_id) + .or_insert_with(|| Arc::new(GcCompactionQueue::new())); + q.schedule_manual_compaction(options, Some(tx)); Ok(rx) } diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 55cde8603e..05f8d476f9 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -4,7 +4,7 @@ //! //! The old legacy algorithm is implemented directly in `timeline.rs`. -use std::collections::{BinaryHeap, HashMap, HashSet}; +use std::collections::{BinaryHeap, HashMap, HashSet, VecDeque}; use std::ops::{Deref, Range}; use std::sync::Arc; @@ -16,10 +16,12 @@ use super::{ use anyhow::{anyhow, bail, Context}; use bytes::Bytes; +use enumset::EnumSet; use fail::fail_point; use itertools::Itertools; use pageserver_api::key::KEY_SIZE; use pageserver_api::keyspace::ShardedRange; +use pageserver_api::models::CompactInfoResponse; use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId}; use serde::Serialize; use tokio_util::sync::CancellationToken; @@ -30,6 +32,7 @@ use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder} use crate::page_cache; use crate::statvfs::Statvfs; use crate::tenant::checks::check_valid_layermap; +use crate::tenant::gc_block::GcBlock; use crate::tenant::remote_timeline_client::WaitCompletionError; use crate::tenant::storage_layer::batch_split_writer::{ BatchWriterResult, SplitDeltaLayerWriter, SplitImageLayerWriter, @@ -63,16 +66,284 @@ use super::CompactionError; /// Maximum number of deltas before generating an image layer in bottom-most compaction. const COMPACTION_DELTA_THRESHOLD: usize = 5; -/// A scheduled compaction task. -pub(crate) struct ScheduledCompactionTask { - /// It's unfortunate that we need to store a compact options struct here because the only outer - /// API we can call here is `compact_with_options` which does a few setup calls before starting the - /// actual compaction job... We should refactor this to store `GcCompactionJob` in the future. - pub options: CompactOptions, - /// The channel to send the compaction result. If this is a subcompaction, the last compaction job holds the sender. - pub result_tx: Option>, - /// Hold the GC block. If this is a subcompaction, the last compaction job holds the gc block guard. 
- pub gc_block: Option, +#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)] +pub struct GcCompactionJobId(pub usize); + +impl std::fmt::Display for GcCompactionJobId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.0) + } +} + +#[derive(Debug, Clone)] +pub enum GcCompactionQueueItem { + Manual(CompactOptions), + SubCompactionJob(CompactOptions), + #[allow(dead_code)] + UpdateL2Lsn(Lsn), + Notify(GcCompactionJobId), +} + +impl GcCompactionQueueItem { + pub fn into_compact_info_resp( + self, + id: GcCompactionJobId, + running: bool, + ) -> Option { + match self { + GcCompactionQueueItem::Manual(options) => Some(CompactInfoResponse { + compact_key_range: options.compact_key_range, + compact_lsn_range: options.compact_lsn_range, + sub_compaction: options.sub_compaction, + running, + job_id: id.0, + }), + GcCompactionQueueItem::SubCompactionJob(options) => Some(CompactInfoResponse { + compact_key_range: options.compact_key_range, + compact_lsn_range: options.compact_lsn_range, + sub_compaction: options.sub_compaction, + running, + job_id: id.0, + }), + GcCompactionQueueItem::UpdateL2Lsn(_) => None, + GcCompactionQueueItem::Notify(_) => None, + } + } +} + +struct GcCompactionQueueInner { + running: Option<(GcCompactionJobId, GcCompactionQueueItem)>, + queued: VecDeque<(GcCompactionJobId, GcCompactionQueueItem)>, + notify: HashMap>, + gc_guards: HashMap, + last_id: GcCompactionJobId, +} + +impl GcCompactionQueueInner { + fn next_id(&mut self) -> GcCompactionJobId { + let id = self.last_id; + self.last_id = GcCompactionJobId(id.0 + 1); + id + } +} + +/// A structure to store gc_compaction jobs. +pub struct GcCompactionQueue { + /// All items in the queue, and the currently-running job. + inner: std::sync::Mutex, + /// Ensure only one thread is consuming the queue. + consumer_lock: tokio::sync::Mutex<()>, +} + +impl GcCompactionQueue { + pub fn new() -> Self { + GcCompactionQueue { + inner: std::sync::Mutex::new(GcCompactionQueueInner { + running: None, + queued: VecDeque::new(), + notify: HashMap::new(), + gc_guards: HashMap::new(), + last_id: GcCompactionJobId(0), + }), + consumer_lock: tokio::sync::Mutex::new(()), + } + } + + pub fn cancel_scheduled(&self) { + let mut guard = self.inner.lock().unwrap(); + guard.queued.clear(); + guard.notify.clear(); + guard.gc_guards.clear(); + } + + /// Schedule a manual compaction job. + pub fn schedule_manual_compaction( + &self, + options: CompactOptions, + notify: Option>, + ) -> GcCompactionJobId { + let mut guard = self.inner.lock().unwrap(); + let id = guard.next_id(); + guard + .queued + .push_back((id, GcCompactionQueueItem::Manual(options))); + if let Some(notify) = notify { + guard.notify.insert(id, notify); + } + info!("scheduled compaction job id={}", id); + id + } + + /// Trigger an auto compaction. + #[allow(dead_code)] + pub fn trigger_auto_compaction(&self, _: &Arc) {} + + /// Notify the caller the job has finished and unblock GC. 
+ fn notify_and_unblock(&self, id: GcCompactionJobId) { + info!("compaction job id={} finished", id); + let mut guard = self.inner.lock().unwrap(); + if let Some(blocking) = guard.gc_guards.remove(&id) { + drop(blocking) + } + if let Some(tx) = guard.notify.remove(&id) { + let _ = tx.send(()); + } + } + + async fn handle_sub_compaction( + &self, + id: GcCompactionJobId, + options: CompactOptions, + timeline: &Arc, + gc_block: &GcBlock, + ) -> Result<(), CompactionError> { + info!("running scheduled enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs"); + let jobs: Vec = timeline + .gc_compaction_split_jobs( + GcCompactJob::from_compact_options(options.clone()), + options.sub_compaction_max_job_size_mb, + ) + .await + .map_err(CompactionError::Other)?; + if jobs.is_empty() { + info!("no jobs to run, skipping scheduled compaction task"); + self.notify_and_unblock(id); + } else { + let gc_guard = match gc_block.start().await { + Ok(guard) => guard, + Err(e) => { + return Err(CompactionError::Other(anyhow!( + "cannot run gc-compaction because gc is blocked: {}", + e + ))); + } + }; + + let jobs_len = jobs.len(); + let mut pending_tasks = Vec::new(); + for job in jobs { + // Unfortunately we need to convert the `GcCompactJob` back to `CompactionOptions` + // until we do further refactors to allow directly call `compact_with_gc`. + let mut flags: EnumSet = EnumSet::default(); + flags |= CompactFlags::EnhancedGcBottomMostCompaction; + if job.dry_run { + flags |= CompactFlags::DryRun; + } + let options = CompactOptions { + flags, + sub_compaction: false, + compact_key_range: Some(job.compact_key_range.into()), + compact_lsn_range: Some(job.compact_lsn_range.into()), + sub_compaction_max_job_size_mb: None, + }; + pending_tasks.push(GcCompactionQueueItem::SubCompactionJob(options)); + } + pending_tasks.push(GcCompactionQueueItem::Notify(id)); + { + let mut guard = self.inner.lock().unwrap(); + guard.gc_guards.insert(id, gc_guard); + let mut tasks = Vec::new(); + for task in pending_tasks { + let id = guard.next_id(); + tasks.push((id, task)); + } + tasks.reverse(); + for item in tasks { + guard.queued.push_front(item); + } + } + info!("scheduled enhanced gc bottom-most compaction with sub-compaction, split into {} jobs", jobs_len); + } + Ok(()) + } + + /// Take a job from the queue and process it. Returns if there are still pending tasks. 
+ pub async fn iteration( + &self, + cancel: &CancellationToken, + ctx: &RequestContext, + gc_block: &GcBlock, + timeline: &Arc, + ) -> Result { + let _one_op_at_a_time_guard = self.consumer_lock.lock().await; + let has_pending_tasks; + let (id, item) = { + let mut guard = self.inner.lock().unwrap(); + let Some((id, item)) = guard.queued.pop_front() else { + return Ok(false); + }; + guard.running = Some((id, item.clone())); + has_pending_tasks = !guard.queued.is_empty(); + (id, item) + }; + + match item { + GcCompactionQueueItem::Manual(options) => { + if !options + .flags + .contains(CompactFlags::EnhancedGcBottomMostCompaction) + { + warn!("ignoring scheduled compaction task: scheduled task must be gc compaction: {:?}", options); + } else if options.sub_compaction { + self.handle_sub_compaction(id, options, timeline, gc_block) + .await?; + } else { + let gc_guard = match gc_block.start().await { + Ok(guard) => guard, + Err(e) => { + return Err(CompactionError::Other(anyhow!( + "cannot run gc-compaction because gc is blocked: {}", + e + ))); + } + }; + { + let mut guard = self.inner.lock().unwrap(); + guard.gc_guards.insert(id, gc_guard); + } + let _ = timeline + .compact_with_options(cancel, options, ctx) + .instrument(info_span!("scheduled_compact_timeline", %timeline.timeline_id)) + .await?; + self.notify_and_unblock(id); + } + } + GcCompactionQueueItem::SubCompactionJob(options) => { + let _ = timeline + .compact_with_options(cancel, options, ctx) + .instrument(info_span!("scheduled_compact_timeline", %timeline.timeline_id)) + .await?; + } + GcCompactionQueueItem::Notify(id) => { + self.notify_and_unblock(id); + } + GcCompactionQueueItem::UpdateL2Lsn(_) => { + unreachable!() + } + } + { + let mut guard = self.inner.lock().unwrap(); + guard.running = None; + } + Ok(has_pending_tasks) + } + + #[allow(clippy::type_complexity)] + pub fn remaining_jobs( + &self, + ) -> ( + Option<(GcCompactionJobId, GcCompactionQueueItem)>, + VecDeque<(GcCompactionJobId, GcCompactionQueueItem)>, + ) { + let guard = self.inner.lock().unwrap(); + (guard.running.clone(), guard.queued.clone()) + } + + #[allow(dead_code)] + pub fn remaining_jobs_num(&self) -> usize { + let guard = self.inner.lock().unwrap(); + guard.queued.len() + if guard.running.is_some() { 1 } else { 0 } + } } /// A job description for the gc-compaction job. This structure describes the rectangle range that the job will From 23c0748cdd3d07c67f805d29281b1e0aea967a4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Sat, 11 Jan 2025 03:52:45 +0100 Subject: [PATCH 157/583] Remove active column (#10335) We don't need or want the `active` column. Remove it. Vlad pointed out that this is safe. Thanks to the separation of the schemata in earlier PRs, this is easy. 
follow-up of #10205 Part of https://github.com/neondatabase/neon/issues/9981 --- .../2025-01-09-160454_safekeepers_remove_active/down.sql | 4 ++++ .../2025-01-09-160454_safekeepers_remove_active/up.sql | 1 + storage_controller/src/persistence.rs | 7 ++----- storage_controller/src/schema.rs | 1 - 4 files changed, 7 insertions(+), 6 deletions(-) create mode 100644 storage_controller/migrations/2025-01-09-160454_safekeepers_remove_active/down.sql create mode 100644 storage_controller/migrations/2025-01-09-160454_safekeepers_remove_active/up.sql diff --git a/storage_controller/migrations/2025-01-09-160454_safekeepers_remove_active/down.sql b/storage_controller/migrations/2025-01-09-160454_safekeepers_remove_active/down.sql new file mode 100644 index 0000000000..c2624f858b --- /dev/null +++ b/storage_controller/migrations/2025-01-09-160454_safekeepers_remove_active/down.sql @@ -0,0 +1,4 @@ +-- this sadly isn't a "true" revert of the migration, as the column is now at the end of the table. +-- But preserving order is not a trivial operation. +-- https://wiki.postgresql.org/wiki/Alter_column_position +ALTER TABLE safekeepers ADD active BOOLEAN NOT NULL DEFAULT false; diff --git a/storage_controller/migrations/2025-01-09-160454_safekeepers_remove_active/up.sql b/storage_controller/migrations/2025-01-09-160454_safekeepers_remove_active/up.sql new file mode 100644 index 0000000000..d76f044eda --- /dev/null +++ b/storage_controller/migrations/2025-01-09-160454_safekeepers_remove_active/up.sql @@ -0,0 +1 @@ +ALTER TABLE safekeepers DROP active; diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index cebf3e9594..beb014f0a8 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -1258,7 +1258,6 @@ pub(crate) struct SafekeeperPersistence { pub(crate) version: i64, pub(crate) host: String, pub(crate) port: i32, - pub(crate) active: bool, pub(crate) http_port: i32, pub(crate) availability_zone_id: String, pub(crate) scheduling_policy: String, @@ -1270,7 +1269,6 @@ impl SafekeeperPersistence { SkSchedulingPolicy::from_str(&self.scheduling_policy).map_err(|e| { DatabaseError::Logical(format!("can't construct SkSchedulingPolicy: {e:?}")) })?; - // omit the `active` flag on purpose: it is deprecated. Ok(SafekeeperDescribeResponse { id: NodeId(self.id as u64), region_id: self.region_id.clone(), @@ -1295,7 +1293,8 @@ pub(crate) struct SafekeeperUpsert { pub(crate) version: i64, pub(crate) host: String, pub(crate) port: i32, - pub(crate) active: bool, + /// The active flag will not be stored in the database and will be ignored. + pub(crate) active: Option, pub(crate) http_port: i32, pub(crate) availability_zone_id: String, } @@ -1311,7 +1310,6 @@ impl SafekeeperUpsert { version: self.version, host: &self.host, port: self.port, - active: self.active, http_port: self.http_port, availability_zone_id: &self.availability_zone_id, // None means a wish to not update this column. We expose abilities to update it via other means. @@ -1328,7 +1326,6 @@ struct InsertUpdateSafekeeper<'a> { version: i64, host: &'a str, port: i32, - active: bool, http_port: i32, availability_zone_id: &'a str, scheduling_policy: Option<&'a str>, diff --git a/storage_controller/src/schema.rs b/storage_controller/src/schema.rs index 44c91619ab..14c30c296d 100644 --- a/storage_controller/src/schema.rs +++ b/storage_controller/src/schema.rs @@ -36,7 +36,6 @@ diesel::table! 
{ version -> Int8, host -> Text, port -> Int4, - active -> Bool, http_port -> Int4, availability_zone_id -> Text, scheduling_policy -> Varchar, From 70a3bf37a0d2df87ba4bab0f5d73466ecbabfb90 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sat, 11 Jan 2025 15:09:55 +0200 Subject: [PATCH 158/583] Stop building 'compute-tools' image (#10333) It's been unused from time immemorial. --------- Co-authored-by: Matthias van de Meent --- .github/workflows/build_and_test.yml | 47 ++-------------------------- compute/compute-node.Dockerfile | 11 ------- compute_tools/src/bin/fast_import.rs | 2 +- docs/docker.md | 6 +--- 4 files changed, 5 insertions(+), 61 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 01f5c3ede9..cd95a5b16d 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -728,30 +728,6 @@ jobs: tags: | neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.tag.outputs.build-tag}}-${{ matrix.version.debian }}-${{ matrix.arch }} - - name: Build compute-tools image - # compute-tools are Postgres independent, so build it only once - # We pick 16, because that builds on debian 11 with older glibc (and is - # thus compatible with newer glibc), rather than 17 on Debian 12, as - # that isn't guaranteed to be compatible with Debian 11 - if: matrix.version.pg == 'v16' - uses: docker/build-push-action@v6 - with: - target: compute-tools-image - context: . - build-args: | - GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} - BUILD_TAG=${{ needs.tag.outputs.build-tag }} - TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-${{ matrix.version.debian }} - DEBIAN_VERSION=${{ matrix.version.debian }} - provenance: false - push: true - pull: true - file: compute/compute-node.Dockerfile - cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }} - cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-tools-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }} - tags: | - neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-${{ matrix.arch }} - compute-node-image: needs: [ compute-node-image-arch, tag ] permissions: @@ -794,14 +770,6 @@ jobs: neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \ neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64 - - name: Create multi-arch compute-tools image - if: matrix.version.pg == 'v16' - run: | - docker buildx imagetools create -t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} \ - -t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }} \ - neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \ - neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64 - - name: Configure AWS credentials uses: aws-actions/configure-aws-credentials@v4 with: @@ -817,12 +785,6 @@ jobs: docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \ neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} - - name: Push multi-arch compute-tools image to 
ECR - if: matrix.version.pg == 'v16' - run: | - docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }} \ - neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} - vm-compute-node-image: needs: [ check-permissions, tag, compute-node-image ] runs-on: [ self-hosted, large ] @@ -1001,9 +963,6 @@ jobs: docker buildx imagetools create -t $repo/neon:latest \ $repo/neon:${{ needs.tag.outputs.build-tag }} - docker buildx imagetools create -t $repo/compute-tools:latest \ - $repo/compute-tools:${{ needs.tag.outputs.build-tag }} - for version in ${VERSIONS}; do docker buildx imagetools create -t $repo/compute-node-${version}:latest \ $repo/compute-node-${version}:${{ needs.tag.outputs.build-tag }} @@ -1032,7 +991,7 @@ jobs: - name: Copy all images to prod ECR if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' run: | - for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16,v17}; do + for image in neon {vm-,}compute-node-{v14,v15,v16,v17}; do docker buildx imagetools create -t 093970136003.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }} \ 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }} done @@ -1044,7 +1003,7 @@ jobs: with: client_id: ${{ vars.AZURE_DEV_CLIENT_ID }} image_tag: ${{ needs.tag.outputs.build-tag }} - images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17 + images: neon vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17 registry_name: ${{ vars.AZURE_DEV_REGISTRY_NAME }} subscription_id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }} tenant_id: ${{ vars.AZURE_TENANT_ID }} @@ -1056,7 +1015,7 @@ jobs: with: client_id: ${{ vars.AZURE_PROD_CLIENT_ID }} image_tag: ${{ needs.tag.outputs.build-tag }} - images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17 + images: neon vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17 registry_name: ${{ vars.AZURE_PROD_REGISTRY_NAME }} subscription_id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }} tenant_id: ${{ vars.AZURE_TENANT_ID }} diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 303daec240..255dafa401 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1288,17 +1288,6 @@ USER nonroot COPY --chown=nonroot . . 
RUN mold -run cargo build --locked --profile release-line-debug-size-lto --bin compute_ctl --bin fast_import --bin local_proxy -######################################################################################### -# -# Final compute-tools image -# -######################################################################################### - -FROM debian:$DEBIAN_FLAVOR AS compute-tools-image - -COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl -COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/fast_import /usr/local/bin/fast_import - ######################################################################################### # # Layer "pgbouncer" diff --git a/compute_tools/src/bin/fast_import.rs b/compute_tools/src/bin/fast_import.rs index 793ec4cf10..f554362751 100644 --- a/compute_tools/src/bin/fast_import.rs +++ b/compute_tools/src/bin/fast_import.rs @@ -17,7 +17,7 @@ //! //! # Local Testing //! -//! - Comment out most of the pgxns in The Dockerfile.compute-tools to speed up the build. +//! - Comment out most of the pgxns in compute-node.Dockerfile to speed up the build. //! - Build the image with the following command: //! //! ```bash diff --git a/docs/docker.md b/docs/docker.md index 0914a00082..ae74c2b2ab 100644 --- a/docs/docker.md +++ b/docs/docker.md @@ -7,15 +7,11 @@ Currently we build two main images: - [neondatabase/neon](https://hub.docker.com/repository/docker/neondatabase/neon) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile). - [neondatabase/compute-node-v16](https://hub.docker.com/repository/docker/neondatabase/compute-node-v16) — compute node image with pre-built Postgres binaries from [neondatabase/postgres](https://github.com/neondatabase/postgres). Similar images exist for v15 and v14. Built from [/compute-node/Dockerfile](/compute/compute-node.Dockerfile). -And additional intermediate image: - -- [neondatabase/compute-tools](https://hub.docker.com/repository/docker/neondatabase/compute-tools) — compute node configuration management tools. - ## Build pipeline We build all images after a successful `release` tests run and push automatically to Docker Hub with two parallel CI jobs -1. `neondatabase/compute-tools` and `neondatabase/compute-node-v16` (and -v15 and -v14) +1. `neondatabase/compute-node-v17` (and -16, -v15, -v14) 2. `neondatabase/neon` From 846e8fdce4a9e6a06941c865575350343007c2a4 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sat, 11 Jan 2025 16:20:50 +0200 Subject: [PATCH 159/583] Remove obsolete hnsw extension (#8008) This has been deprecated and disabled for new installations for a long time. Let's remove it for good. 
--- compute/compute-node.Dockerfile | 14 - pgxn/hnsw/Makefile | 26 -- pgxn/hnsw/README.md | 25 -- pgxn/hnsw/hnsw--0.1.0.sql | 29 -- pgxn/hnsw/hnsw.c | 590 -------------------------------- pgxn/hnsw/hnsw.control | 4 - pgxn/hnsw/hnsw.h | 15 - pgxn/hnsw/hnswalg.cpp | 379 -------------------- pgxn/hnsw/hnswalg.h | 69 ---- pgxn/hnsw/test/expected/knn.out | 28 -- pgxn/hnsw/test/sql/knn.sql | 13 - 11 files changed, 1192 deletions(-) delete mode 100644 pgxn/hnsw/Makefile delete mode 100644 pgxn/hnsw/README.md delete mode 100644 pgxn/hnsw/hnsw--0.1.0.sql delete mode 100644 pgxn/hnsw/hnsw.c delete mode 100644 pgxn/hnsw/hnsw.control delete mode 100644 pgxn/hnsw/hnsw.h delete mode 100644 pgxn/hnsw/hnswalg.cpp delete mode 100644 pgxn/hnsw/hnswalg.h delete mode 100644 pgxn/hnsw/test/expected/knn.out delete mode 100644 pgxn/hnsw/test/sql/knn.sql diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 255dafa401..f4df507b74 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1258,20 +1258,6 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \ make -j $(getconf _NPROCESSORS_ONLN) \ PG_CONFIG=/usr/local/pgsql/bin/pg_config \ -C pgxn/neon_rmgr \ - -s install && \ - case "${PG_VERSION}" in \ - "v14" | "v15") \ - ;; \ - "v16" | "v17") \ - echo "Skipping HNSW for PostgreSQL ${PG_VERSION}" && exit 0 \ - ;; \ - *) \ - echo "unexpected PostgreSQL version" && exit 1 \ - ;; \ - esac && \ - make -j $(getconf _NPROCESSORS_ONLN) \ - PG_CONFIG=/usr/local/pgsql/bin/pg_config \ - -C pgxn/hnsw \ -s install ######################################################################################### diff --git a/pgxn/hnsw/Makefile b/pgxn/hnsw/Makefile deleted file mode 100644 index 66436b5920..0000000000 --- a/pgxn/hnsw/Makefile +++ /dev/null @@ -1,26 +0,0 @@ -EXTENSION = hnsw -EXTVERSION = 0.1.0 - -MODULE_big = hnsw -DATA = $(wildcard *--*.sql) -OBJS = hnsw.o hnswalg.o - -TESTS = $(wildcard test/sql/*.sql) -REGRESS = $(patsubst test/sql/%.sql,%,$(TESTS)) -REGRESS_OPTS = --inputdir=test --load-extension=hnsw - -# For auto-vectorization: -# - GCC (needs -ftree-vectorize OR -O3) - https://gcc.gnu.org/projects/tree-ssa/vectorization.html -PG_CFLAGS += -O3 -PG_CXXFLAGS += -O3 -std=c++11 -PG_LDFLAGS += -lstdc++ - -all: $(EXTENSION)--$(EXTVERSION).sql - -PG_CONFIG ?= pg_config -PGXS := $(shell $(PG_CONFIG) --pgxs) -include $(PGXS) - -dist: - mkdir -p dist - git archive --format zip --prefix=$(EXTENSION)-$(EXTVERSION)/ --output dist/$(EXTENSION)-$(EXTVERSION).zip master diff --git a/pgxn/hnsw/README.md b/pgxn/hnsw/README.md deleted file mode 100644 index bc9c8d571c..0000000000 --- a/pgxn/hnsw/README.md +++ /dev/null @@ -1,25 +0,0 @@ -# Revisiting the Inverted Indices for Billion-Scale Approximate Nearest Neighbors - -This ANN extension of Postgres is based -on [ivf-hnsw](https://github.com/dbaranchuk/ivf-hnsw.git) implementation of [HNSW](https://www.pinecone.io/learn/hnsw), -the code for the current state-of-the-art billion-scale nearest neighbor search system presented in the paper: - -[Revisiting the Inverted Indices for Billion-Scale Approximate Nearest Neighbors](http://openaccess.thecvf.com/content_ECCV_2018/html/Dmitry_Baranchuk_Revisiting_the_Inverted_ECCV_2018_paper.html), -
-Dmitry Baranchuk, Artem Babenko, Yury Malkov - -# Postgres extension - -HNSW index is hold in memory (built on demand) and it's maxial size is limited -by `maxelements` index parameter. Another required parameter is nubmer of dimensions (if it is not specified in column type). -Optional parameter `ef` specifies number of neighbors which are considered during index construction and search (corresponds `efConstruction` and `efSearch` parameters -described in the article). - -# Example of usage: - -``` -create extension hnsw; -create table embeddings(id integer primary key, payload real[]); -create index on embeddings using hnsw(payload) with (maxelements=1000000, dims=100, m=32); -select id from embeddings order by payload <-> array[1.0, 2.0,...] limit 100; -``` \ No newline at end of file diff --git a/pgxn/hnsw/hnsw--0.1.0.sql b/pgxn/hnsw/hnsw--0.1.0.sql deleted file mode 100644 index ebf424326d..0000000000 --- a/pgxn/hnsw/hnsw--0.1.0.sql +++ /dev/null @@ -1,29 +0,0 @@ --- complain if script is sourced in psql, rather than via CREATE EXTENSION -\echo Use "CREATE EXTENSION hnsw" to load this file. \quit - --- functions - -CREATE FUNCTION l2_distance(real[], real[]) RETURNS real - AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; - --- operators - -CREATE OPERATOR <-> ( - LEFTARG = real[], RIGHTARG = real[], PROCEDURE = l2_distance, - COMMUTATOR = '<->' -); - --- access method - -CREATE FUNCTION hnsw_handler(internal) RETURNS index_am_handler - AS 'MODULE_PATHNAME' LANGUAGE C; - -CREATE ACCESS METHOD hnsw TYPE INDEX HANDLER hnsw_handler; - -COMMENT ON ACCESS METHOD hnsw IS 'hnsw index access method'; - --- opclasses - -CREATE OPERATOR CLASS knn_ops - DEFAULT FOR TYPE real[] USING hnsw AS - OPERATOR 1 <-> (real[], real[]) FOR ORDER BY float_ops; diff --git a/pgxn/hnsw/hnsw.c b/pgxn/hnsw/hnsw.c deleted file mode 100644 index e624cb831f..0000000000 --- a/pgxn/hnsw/hnsw.c +++ /dev/null @@ -1,590 +0,0 @@ -#include "postgres.h" - -#include "access/amapi.h" -#include "access/generic_xlog.h" -#include "access/relation.h" -#include "access/reloptions.h" -#include "access/tableam.h" -#include "catalog/index.h" -#include "commands/vacuum.h" -#include "nodes/execnodes.h" -#include "storage/bufmgr.h" -#include "utils/guc.h" -#include "utils/selfuncs.h" - -#include -#include - -#include "hnsw.h" - -PG_MODULE_MAGIC; - -typedef struct { - int32 vl_len_; /* varlena header (do not touch directly!) 
*/ - int dims; - int maxelements; - int efConstruction; - int efSearch; - int M; -} HnswOptions; - -static relopt_kind hnsw_relopt_kind; - -typedef struct { - HierarchicalNSW* hnsw; - size_t curr; - size_t n_results; - ItemPointer results; -} HnswScanOpaqueData; - -typedef HnswScanOpaqueData* HnswScanOpaque; - -typedef struct { - Oid relid; - uint32 status; - HierarchicalNSW* hnsw; -} HnswHashEntry; - - -#define SH_PREFIX hnsw_index -#define SH_ELEMENT_TYPE HnswHashEntry -#define SH_KEY_TYPE Oid -#define SH_KEY relid -#define SH_STORE_HASH -#define SH_GET_HASH(tb, a) ((a)->relid) -#define SH_HASH_KEY(tb, key) (key) -#define SH_EQUAL(tb, a, b) ((a) == (b)) -#define SH_SCOPE static inline -#define SH_DEFINE -#define SH_DECLARE -#include "lib/simplehash.h" - -#define INDEX_HASH_SIZE 11 - -#define DEFAULT_EF_SEARCH 64 - -PGDLLEXPORT void _PG_init(void); - -static hnsw_index_hash *hnsw_indexes; - -/* - * Initialize index options and variables - */ -void -_PG_init(void) -{ - hnsw_relopt_kind = add_reloption_kind(); - add_int_reloption(hnsw_relopt_kind, "dims", "Number of dimensions", - 0, 0, INT_MAX, AccessExclusiveLock); - add_int_reloption(hnsw_relopt_kind, "maxelements", "Maximal number of elements", - 0, 0, INT_MAX, AccessExclusiveLock); - add_int_reloption(hnsw_relopt_kind, "m", "Number of neighbors of each vertex", - 100, 0, INT_MAX, AccessExclusiveLock); - add_int_reloption(hnsw_relopt_kind, "efconstruction", "Number of inspected neighbors during index construction", - 16, 1, INT_MAX, AccessExclusiveLock); - add_int_reloption(hnsw_relopt_kind, "efsearch", "Number of inspected neighbors during index search", - 64, 1, INT_MAX, AccessExclusiveLock); - hnsw_indexes = hnsw_index_create(TopMemoryContext, INDEX_HASH_SIZE, NULL); -} - - -static void -hnsw_build_callback(Relation index, ItemPointer tid, Datum *values, - bool *isnull, bool tupleIsAlive, void *state) -{ - HierarchicalNSW* hnsw = (HierarchicalNSW*) state; - ArrayType* array; - int n_items; - label_t label = 0; - - /* Skip nulls */ - if (isnull[0]) - return; - - array = DatumGetArrayTypeP(values[0]); - n_items = ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array)); - if (n_items != hnsw_dimensions(hnsw)) - { - elog(ERROR, "Wrong number of dimensions: %d instead of %d expected", - n_items, hnsw_dimensions(hnsw)); - } - - memcpy(&label, tid, sizeof(*tid)); - hnsw_add_point(hnsw, (coord_t*)ARR_DATA_PTR(array), label); -} - -static void -hnsw_populate(HierarchicalNSW* hnsw, Relation indexRel, Relation heapRel) -{ - IndexInfo* indexInfo = BuildIndexInfo(indexRel); - Assert(indexInfo->ii_NumIndexAttrs == 1); - table_index_build_scan(heapRel, indexRel, indexInfo, - true, true, hnsw_build_callback, (void *) hnsw, NULL); -} - -#ifdef __APPLE__ - -#include -#include - -static void -hnsw_check_available_memory(Size requested) -{ - size_t total; - if (sysctlbyname("hw.memsize", NULL, &total, NULL, 0) < 0) - elog(ERROR, "Failed to get amount of RAM: %m"); - - if ((Size)NBuffers*BLCKSZ + requested >= total) - elog(ERROR, "HNSW index requeries %ld bytes while only %ld are available", - requested, total - (Size)NBuffers*BLCKSZ); -} - -#else - -#include - -static void -hnsw_check_available_memory(Size requested) -{ - struct sysinfo si; - Size total; - if (sysinfo(&si) < 0) - elog(ERROR, "Failed to get amount of RAM: %m"); - - total = si.totalram*si.mem_unit; - if ((Size)NBuffers*BLCKSZ + requested >= total) - elog(ERROR, "HNSW index requeries %ld bytes while only %ld are available", - requested, total - (Size)NBuffers*BLCKSZ); -} - -#endif - -static 
HierarchicalNSW* -hnsw_get_index(Relation indexRel, Relation heapRel) -{ - HierarchicalNSW* hnsw; - Oid indexoid = RelationGetRelid(indexRel); - HnswHashEntry* entry = hnsw_index_lookup(hnsw_indexes, indexoid); - if (entry == NULL) - { - size_t dims, maxelements; - size_t M; - size_t maxM; - size_t size_links_level0; - size_t size_data_per_element; - size_t data_size; - dsm_handle handle = indexoid << 1; /* make it even */ - void* impl_private = NULL; - void* mapped_address = NULL; - Size mapped_size = 0; - Size shmem_size; - bool exists = true; - bool found; - HnswOptions *opts = (HnswOptions *) indexRel->rd_options; - if (opts == NULL || opts->maxelements == 0 || opts->dims == 0) { - elog(ERROR, "HNSW index requires 'maxelements' and 'dims' to be specified"); - } - dims = opts->dims; - maxelements = opts->maxelements; - M = opts->M; - maxM = M * 2; - data_size = dims * sizeof(coord_t); - size_links_level0 = (maxM + 1) * sizeof(idx_t); - size_data_per_element = size_links_level0 + data_size + sizeof(label_t); - shmem_size = hnsw_sizeof() + maxelements * size_data_per_element; - - hnsw_check_available_memory(shmem_size); - - /* first try to attach to existed index */ - if (!dsm_impl_op(DSM_OP_ATTACH, handle, 0, &impl_private, - &mapped_address, &mapped_size, DEBUG1)) - { - /* index doesn't exists: try to create it */ - if (!dsm_impl_op(DSM_OP_CREATE, handle, shmem_size, &impl_private, - &mapped_address, &mapped_size, DEBUG1)) - { - /* We can do it under shared lock, so some other backend may - * try to initialize index. If create is failed because index already - * created by somebody else, then try to attach to it once again - */ - if (!dsm_impl_op(DSM_OP_ATTACH, handle, 0, &impl_private, - &mapped_address, &mapped_size, ERROR)) - { - return NULL; - } - } - else - { - exists = false; - } - } - Assert(mapped_size == shmem_size); - hnsw = (HierarchicalNSW*)mapped_address; - - if (!exists) - { - hnsw_init(hnsw, dims, maxelements, M, maxM, opts->efConstruction); - hnsw_populate(hnsw, indexRel, heapRel); - } - entry = hnsw_index_insert(hnsw_indexes, indexoid, &found); - Assert(!found); - entry->hnsw = hnsw; - } - else - { - hnsw = entry->hnsw; - } - return hnsw; -} - -/* - * Start or restart an index scan - */ -static IndexScanDesc -hnsw_beginscan(Relation index, int nkeys, int norderbys) -{ - IndexScanDesc scan = RelationGetIndexScan(index, nkeys, norderbys); - HnswScanOpaque so = (HnswScanOpaque) palloc(sizeof(HnswScanOpaqueData)); - Relation heap = relation_open(index->rd_index->indrelid, NoLock); - so->hnsw = hnsw_get_index(index, heap); - relation_close(heap, NoLock); - so->curr = 0; - so->n_results = 0; - so->results = NULL; - scan->opaque = so; - return scan; -} - -/* - * Start or restart an index scan - */ -static void -hnsw_rescan(IndexScanDesc scan, ScanKey keys, int nkeys, ScanKey orderbys, int norderbys) -{ - HnswScanOpaque so = (HnswScanOpaque) scan->opaque; - if (so->results) - { - pfree(so->results); - so->results = NULL; - } - so->curr = 0; - if (orderbys && scan->numberOfOrderBys > 0) - memmove(scan->orderByData, orderbys, scan->numberOfOrderBys * sizeof(ScanKeyData)); -} - -/* - * Fetch the next tuple in the given scan - */ -static bool -hnsw_gettuple(IndexScanDesc scan, ScanDirection dir) -{ - HnswScanOpaque so = (HnswScanOpaque) scan->opaque; - - /* - * Index can be used to scan backward, but Postgres doesn't support - * backward scan on operators - */ - Assert(ScanDirectionIsForward(dir)); - - if (so->curr == 0) - { - Datum value; - ArrayType* array; - int n_items; - 
size_t n_results; - label_t* results; - HnswOptions *opts = (HnswOptions *) scan->indexRelation->rd_options; - size_t efSearch = opts ? opts->efSearch : DEFAULT_EF_SEARCH; - - /* Safety check */ - if (scan->orderByData == NULL) - elog(ERROR, "cannot scan HNSW index without order"); - - /* No items will match if null */ - if (scan->orderByData->sk_flags & SK_ISNULL) - return false; - - value = scan->orderByData->sk_argument; - array = DatumGetArrayTypeP(value); - n_items = ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array)); - if (n_items != hnsw_dimensions(so->hnsw)) - { - elog(ERROR, "Wrong number of dimensions: %d instead of %d expected", - n_items, hnsw_dimensions(so->hnsw)); - } - - if (!hnsw_search(so->hnsw, (coord_t*)ARR_DATA_PTR(array), efSearch, &n_results, &results)) - elog(ERROR, "HNSW index search failed"); - so->results = (ItemPointer)palloc(n_results*sizeof(ItemPointerData)); - so->n_results = n_results; - for (size_t i = 0; i < n_results; i++) - { - memcpy(&so->results[i], &results[i], sizeof(so->results[i])); - } - free(results); - } - if (so->curr >= so->n_results) - { - return false; - } - else - { - scan->xs_heaptid = so->results[so->curr++]; - scan->xs_recheckorderby = false; - return true; - } -} - -/* - * End a scan and release resources - */ -static void -hnsw_endscan(IndexScanDesc scan) -{ - HnswScanOpaque so = (HnswScanOpaque) scan->opaque; - if (so->results) - pfree(so->results); - pfree(so); - scan->opaque = NULL; -} - - -/* - * Estimate the cost of an index scan - */ -static void -hnsw_costestimate(PlannerInfo *root, IndexPath *path, double loop_count, - Cost *indexStartupCost, Cost *indexTotalCost, - Selectivity *indexSelectivity, double *indexCorrelation - ,double *indexPages -) -{ - GenericCosts costs; - - /* Never use index without order */ - if (path->indexorderbys == NULL) - { - *indexStartupCost = DBL_MAX; - *indexTotalCost = DBL_MAX; - *indexSelectivity = 0; - *indexCorrelation = 0; - *indexPages = 0; - return; - } - - MemSet(&costs, 0, sizeof(costs)); - - genericcostestimate(root, path, loop_count, &costs); - - /* Startup cost and total cost are same */ - *indexStartupCost = costs.indexTotalCost; - *indexTotalCost = costs.indexTotalCost; - *indexSelectivity = costs.indexSelectivity; - *indexCorrelation = costs.indexCorrelation; - *indexPages = costs.numIndexPages; -} - -/* - * Parse and validate the reloptions - */ -static bytea * -hnsw_options(Datum reloptions, bool validate) -{ - static const relopt_parse_elt tab[] = { - {"dims", RELOPT_TYPE_INT, offsetof(HnswOptions, dims)}, - {"maxelements", RELOPT_TYPE_INT, offsetof(HnswOptions, maxelements)}, - {"efconstruction", RELOPT_TYPE_INT, offsetof(HnswOptions, efConstruction)}, - {"efsearch", RELOPT_TYPE_INT, offsetof(HnswOptions, efSearch)}, - {"m", RELOPT_TYPE_INT, offsetof(HnswOptions, M)} - }; - - return (bytea *) build_reloptions(reloptions, validate, - hnsw_relopt_kind, - sizeof(HnswOptions), - tab, lengthof(tab)); -} - -/* - * Validate catalog entries for the specified operator class - */ -static bool -hnsw_validate(Oid opclassoid) -{ - return true; -} - -/* - * Build the index for a logged table - */ -static IndexBuildResult * -hnsw_build(Relation heap, Relation index, IndexInfo *indexInfo) -{ - HierarchicalNSW* hnsw = hnsw_get_index(index, heap); - IndexBuildResult* result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); - result->heap_tuples = result->index_tuples = hnsw_count(hnsw); - - return result; -} - -/* - * Insert a tuple into the index - */ -static bool -hnsw_insert(Relation index, 
Datum *values, bool *isnull, ItemPointer heap_tid, - Relation heap, IndexUniqueCheck checkUnique, - bool indexUnchanged, - IndexInfo *indexInfo) -{ - HierarchicalNSW* hnsw = hnsw_get_index(index, heap); - Datum value; - ArrayType* array; - int n_items; - label_t label = 0; - - /* Skip nulls */ - if (isnull[0]) - return false; - - /* Detoast value */ - value = PointerGetDatum(PG_DETOAST_DATUM(values[0])); - array = DatumGetArrayTypeP(value); - n_items = ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array)); - if (n_items != hnsw_dimensions(hnsw)) - { - elog(ERROR, "Wrong number of dimensions: %d instead of %d expected", - n_items, hnsw_dimensions(hnsw)); - } - memcpy(&label, heap_tid, sizeof(*heap_tid)); - if (!hnsw_add_point(hnsw, (coord_t*)ARR_DATA_PTR(array), label)) - elog(ERROR, "HNSW index insert failed"); - return true; -} - -/* - * Build the index for an unlogged table - */ -static void -hnsw_buildempty(Relation index) -{ - /* index will be constructed on dema nd when accessed */ -} - -/* - * Clean up after a VACUUM operation - */ -static IndexBulkDeleteResult * -hnsw_vacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) -{ - Relation rel = info->index; - - if (stats == NULL) - return NULL; - - stats->num_pages = RelationGetNumberOfBlocks(rel); - - return stats; -} - -/* - * Bulk delete tuples from the index - */ -static IndexBulkDeleteResult * -hnsw_bulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, - IndexBulkDeleteCallback callback, void *callback_state) -{ - if (stats == NULL) - stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); - return stats; -} - -/* - * Define index handler - * - * See https://www.postgresql.org/docs/current/index-api.html - */ -PGDLLEXPORT PG_FUNCTION_INFO_V1(hnsw_handler); -Datum -hnsw_handler(PG_FUNCTION_ARGS) -{ - IndexAmRoutine *amroutine = makeNode(IndexAmRoutine); - - amroutine->amstrategies = 0; - amroutine->amsupport = 0; - amroutine->amoptsprocnum = 0; - amroutine->amcanorder = false; - amroutine->amcanorderbyop = true; - amroutine->amcanbackward = false; /* can change direction mid-scan */ - amroutine->amcanunique = false; - amroutine->amcanmulticol = false; - amroutine->amoptionalkey = true; - amroutine->amsearcharray = false; - amroutine->amsearchnulls = false; - amroutine->amstorage = false; - amroutine->amclusterable = false; - amroutine->ampredlocks = false; - amroutine->amcanparallel = false; - amroutine->amcaninclude = false; - amroutine->amusemaintenanceworkmem = false; /* not used during VACUUM */ - amroutine->amparallelvacuumoptions = VACUUM_OPTION_PARALLEL_BULKDEL; - amroutine->amkeytype = InvalidOid; - - /* Interface functions */ - amroutine->ambuild = hnsw_build; - amroutine->ambuildempty = hnsw_buildempty; - amroutine->aminsert = hnsw_insert; - amroutine->ambulkdelete = hnsw_bulkdelete; - amroutine->amvacuumcleanup = hnsw_vacuumcleanup; - amroutine->amcanreturn = NULL; /* tuple not included in heapsort */ - amroutine->amcostestimate = hnsw_costestimate; - amroutine->amoptions = hnsw_options; - amroutine->amproperty = NULL; /* TODO AMPROP_DISTANCE_ORDERABLE */ - amroutine->ambuildphasename = NULL; - amroutine->amvalidate = hnsw_validate; - amroutine->amadjustmembers = NULL; - amroutine->ambeginscan = hnsw_beginscan; - amroutine->amrescan = hnsw_rescan; - amroutine->amgettuple = hnsw_gettuple; - amroutine->amgetbitmap = NULL; - amroutine->amendscan = hnsw_endscan; - amroutine->ammarkpos = NULL; - amroutine->amrestrpos = NULL; - - /* Interface functions to support parallel index scans */ - 
amroutine->amestimateparallelscan = NULL; - amroutine->aminitparallelscan = NULL; - amroutine->amparallelrescan = NULL; - - PG_RETURN_POINTER(amroutine); -} - -/* - * Get the L2 distance between vectors - */ -PGDLLEXPORT PG_FUNCTION_INFO_V1(l2_distance); -Datum -l2_distance(PG_FUNCTION_ARGS) -{ - ArrayType *a = PG_GETARG_ARRAYTYPE_P(0); - ArrayType *b = PG_GETARG_ARRAYTYPE_P(1); - int a_dim = ArrayGetNItems(ARR_NDIM(a), ARR_DIMS(a)); - int b_dim = ArrayGetNItems(ARR_NDIM(b), ARR_DIMS(b)); - dist_t distance = 0.0; - dist_t diff; - coord_t *ax = (coord_t*)ARR_DATA_PTR(a); - coord_t *bx = (coord_t*)ARR_DATA_PTR(b); - - if (a_dim != b_dim) - { - ereport(ERROR, - (errcode(ERRCODE_DATA_EXCEPTION), - errmsg("different array dimensions %d and %d", a_dim, b_dim))); - } - - for (int i = 0; i < a_dim; i++) - { - diff = ax[i] - bx[i]; - distance += diff * diff; - } - - PG_RETURN_FLOAT4((dist_t)sqrt(distance)); -} diff --git a/pgxn/hnsw/hnsw.control b/pgxn/hnsw/hnsw.control deleted file mode 100644 index fbfa1a5b47..0000000000 --- a/pgxn/hnsw/hnsw.control +++ /dev/null @@ -1,4 +0,0 @@ -comment = '** Deprecated ** Please use pg_embedding instead' -default_version = '0.1.0' -module_pathname = '$libdir/hnsw' -relocatable = true diff --git a/pgxn/hnsw/hnsw.h b/pgxn/hnsw/hnsw.h deleted file mode 100644 index d4065ab8fe..0000000000 --- a/pgxn/hnsw/hnsw.h +++ /dev/null @@ -1,15 +0,0 @@ -#pragma once - -typedef float coord_t; -typedef float dist_t; -typedef uint32_t idx_t; -typedef uint64_t label_t; - -typedef struct HierarchicalNSW HierarchicalNSW; - -bool hnsw_search(HierarchicalNSW* hnsw, const coord_t *point, size_t efSearch, size_t* n_results, label_t** results); -bool hnsw_add_point(HierarchicalNSW* hnsw, const coord_t *point, label_t label); -void hnsw_init(HierarchicalNSW* hnsw, size_t dim, size_t maxelements, size_t M, size_t maxM, size_t efConstruction); -int hnsw_dimensions(HierarchicalNSW* hnsw); -size_t hnsw_count(HierarchicalNSW* hnsw); -size_t hnsw_sizeof(void); diff --git a/pgxn/hnsw/hnswalg.cpp b/pgxn/hnsw/hnswalg.cpp deleted file mode 100644 index f6de3b8314..0000000000 --- a/pgxn/hnsw/hnswalg.cpp +++ /dev/null @@ -1,379 +0,0 @@ -#include "hnswalg.h" - -#if defined(__GNUC__) -#define PORTABLE_ALIGN32 __attribute__((aligned(32))) -#define PREFETCH(addr,hint) __builtin_prefetch(addr, 0, hint) -#else -#define PORTABLE_ALIGN32 __declspec(align(32)) -#define PREFETCH(addr,hint) -#endif - -HierarchicalNSW::HierarchicalNSW(size_t dim_, size_t maxelements_, size_t M_, size_t maxM_, size_t efConstruction_) -{ - dim = dim_; - data_size = dim * sizeof(coord_t); - - efConstruction = efConstruction_; - - maxelements = maxelements_; - M = M_; - maxM = maxM_; - size_links_level0 = (maxM + 1) * sizeof(idx_t); - size_data_per_element = size_links_level0 + data_size + sizeof(label_t); - offset_data = size_links_level0; - offset_label = offset_data + data_size; - - enterpoint_node = 0; - cur_element_count = 0; -#ifdef __x86_64__ - use_avx2 = __builtin_cpu_supports("avx2"); -#endif -} - -std::priority_queue> HierarchicalNSW::searchBaseLayer(const coord_t *point, size_t ef) -{ - std::vector visited; - visited.resize((cur_element_count + 31) >> 5); - - std::priority_queue> topResults; - std::priority_queue> candidateSet; - - dist_t dist = fstdistfunc(point, getDataByInternalId(enterpoint_node)); - - topResults.emplace(dist, enterpoint_node); - candidateSet.emplace(-dist, enterpoint_node); - visited[enterpoint_node >> 5] = 1 << (enterpoint_node & 31); - dist_t lowerBound = dist; - - while (!candidateSet.empty()) - 
{ - std::pair curr_el_pair = candidateSet.top(); - if (-curr_el_pair.first > lowerBound) - break; - - candidateSet.pop(); - idx_t curNodeNum = curr_el_pair.second; - - idx_t* data = get_linklist0(curNodeNum); - size_t size = *data++; - - PREFETCH(getDataByInternalId(*data), 0); - - for (size_t j = 0; j < size; ++j) { - size_t tnum = *(data + j); - - PREFETCH(getDataByInternalId(*(data + j + 1)), 0); - - if (!(visited[tnum >> 5] & (1 << (tnum & 31)))) { - visited[tnum >> 5] |= 1 << (tnum & 31); - - dist = fstdistfunc(point, getDataByInternalId(tnum)); - - if (topResults.top().first > dist || topResults.size() < ef) { - candidateSet.emplace(-dist, tnum); - - PREFETCH(get_linklist0(candidateSet.top().second), 0); - topResults.emplace(dist, tnum); - - if (topResults.size() > ef) - topResults.pop(); - - lowerBound = topResults.top().first; - } - } - } - } - return topResults; -} - - -void HierarchicalNSW::getNeighborsByHeuristic(std::priority_queue> &topResults, size_t NN) -{ - if (topResults.size() < NN) - return; - - std::priority_queue> resultSet; - std::vector> returnlist; - - while (topResults.size() > 0) { - resultSet.emplace(-topResults.top().first, topResults.top().second); - topResults.pop(); - } - - while (resultSet.size()) { - if (returnlist.size() >= NN) - break; - std::pair curen = resultSet.top(); - dist_t dist_to_query = -curen.first; - resultSet.pop(); - bool good = true; - for (std::pair curen2 : returnlist) { - dist_t curdist = fstdistfunc(getDataByInternalId(curen2.second), - getDataByInternalId(curen.second)); - if (curdist < dist_to_query) { - good = false; - break; - } - } - if (good) returnlist.push_back(curen); - } - for (std::pair elem : returnlist) - topResults.emplace(-elem.first, elem.second); -} - -void HierarchicalNSW::mutuallyConnectNewElement(const coord_t *point, idx_t cur_c, - std::priority_queue> topResults) -{ - getNeighborsByHeuristic(topResults, M); - - std::vector res; - res.reserve(M); - while (topResults.size() > 0) { - res.push_back(topResults.top().second); - topResults.pop(); - } - { - idx_t* data = get_linklist0(cur_c); - if (*data) - throw std::runtime_error("Should be blank"); - - *data++ = res.size(); - - for (size_t idx = 0; idx < res.size(); idx++) { - if (data[idx]) - throw std::runtime_error("Should be blank"); - data[idx] = res[idx]; - } - } - for (size_t idx = 0; idx < res.size(); idx++) { - if (res[idx] == cur_c) - throw std::runtime_error("Connection to the same element"); - - size_t resMmax = maxM; - idx_t *ll_other = get_linklist0(res[idx]); - idx_t sz_link_list_other = *ll_other; - - if (sz_link_list_other > resMmax || sz_link_list_other < 0) - throw std::runtime_error("Bad sz_link_list_other"); - - if (sz_link_list_other < resMmax) { - idx_t *data = ll_other + 1; - data[sz_link_list_other] = cur_c; - *ll_other = sz_link_list_other + 1; - } else { - // finding the "weakest" element to replace it with the new one - idx_t *data = ll_other + 1; - dist_t d_max = fstdistfunc(getDataByInternalId(cur_c), getDataByInternalId(res[idx])); - // Heuristic: - std::priority_queue> candidates; - candidates.emplace(d_max, cur_c); - - for (size_t j = 0; j < sz_link_list_other; j++) - candidates.emplace(fstdistfunc(getDataByInternalId(data[j]), getDataByInternalId(res[idx])), data[j]); - - getNeighborsByHeuristic(candidates, resMmax); - - size_t indx = 0; - while (!candidates.empty()) { - data[indx] = candidates.top().second; - candidates.pop(); - indx++; - } - *ll_other = indx; - } - } -} - -void HierarchicalNSW::addPoint(const coord_t *point, label_t 
label) -{ - if (cur_element_count >= maxelements) { - throw std::runtime_error("The number of elements exceeds the specified limit"); - } - idx_t cur_c = cur_element_count++; - memset((char *) get_linklist0(cur_c), 0, size_data_per_element); - memcpy(getDataByInternalId(cur_c), point, data_size); - memcpy(getExternalLabel(cur_c), &label, sizeof label); - - // Do nothing for the first element - if (cur_c != 0) { - std::priority_queue > topResults = searchBaseLayer(point, efConstruction); - mutuallyConnectNewElement(point, cur_c, topResults); - } -}; - -std::priority_queue> HierarchicalNSW::searchKnn(const coord_t *query, size_t k) -{ - std::priority_queue> topResults; - auto topCandidates = searchBaseLayer(query, k); - while (topCandidates.size() > k) { - topCandidates.pop(); - } - while (!topCandidates.empty()) { - std::pair rez = topCandidates.top(); - label_t label; - memcpy(&label, getExternalLabel(rez.second), sizeof(label)); - topResults.push(std::pair(rez.first, label)); - topCandidates.pop(); - } - - return topResults; -}; - -dist_t fstdistfunc_scalar(const coord_t *x, const coord_t *y, size_t n) -{ - dist_t distance = 0.0; - - for (size_t i = 0; i < n; i++) - { - dist_t diff = x[i] - y[i]; - distance += diff * diff; - } - return distance; - -} - -#ifdef __x86_64__ -#include - -__attribute__((target("avx2"))) -dist_t fstdistfunc_avx2(const coord_t *x, const coord_t *y, size_t n) -{ - const size_t TmpResSz = sizeof(__m256) / sizeof(float); - float PORTABLE_ALIGN32 TmpRes[TmpResSz]; - size_t qty16 = n / 16; - const float *pEnd1 = x + (qty16 * 16); - __m256 diff, v1, v2; - __m256 sum = _mm256_set1_ps(0); - - while (x < pEnd1) { - v1 = _mm256_loadu_ps(x); - x += 8; - v2 = _mm256_loadu_ps(y); - y += 8; - diff = _mm256_sub_ps(v1, v2); - sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff)); - - v1 = _mm256_loadu_ps(x); - x += 8; - v2 = _mm256_loadu_ps(y); - y += 8; - diff = _mm256_sub_ps(v1, v2); - sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff)); - } - _mm256_store_ps(TmpRes, sum); - float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7]; - return (res); -} - -dist_t fstdistfunc_sse(const coord_t *x, const coord_t *y, size_t n) -{ - const size_t TmpResSz = sizeof(__m128) / sizeof(float); - float PORTABLE_ALIGN32 TmpRes[TmpResSz]; - size_t qty16 = n / 16; - const float *pEnd1 = x + (qty16 * 16); - - __m128 diff, v1, v2; - __m128 sum = _mm_set1_ps(0); - - while (x < pEnd1) { - v1 = _mm_loadu_ps(x); - x += 4; - v2 = _mm_loadu_ps(y); - y += 4; - diff = _mm_sub_ps(v1, v2); - sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff)); - - v1 = _mm_loadu_ps(x); - x += 4; - v2 = _mm_loadu_ps(y); - y += 4; - diff = _mm_sub_ps(v1, v2); - sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff)); - - v1 = _mm_loadu_ps(x); - x += 4; - v2 = _mm_loadu_ps(y); - y += 4; - diff = _mm_sub_ps(v1, v2); - sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff)); - - v1 = _mm_loadu_ps(x); - x += 4; - v2 = _mm_loadu_ps(y); - y += 4; - diff = _mm_sub_ps(v1, v2); - sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff)); - } - _mm_store_ps(TmpRes, sum); - float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3]; - return res; -} -#endif - -dist_t HierarchicalNSW::fstdistfunc(const coord_t *x, const coord_t *y) -{ -#ifndef __x86_64__ - return fstdistfunc_scalar(x, y, dim); -#else - if(use_avx2) - return fstdistfunc_avx2(x, y, dim); - - return fstdistfunc_sse(x, y, dim); -#endif -} - -bool hnsw_search(HierarchicalNSW* hnsw, const coord_t *point, size_t efSearch, size_t* n_results, label_t** results) -{ - 
try - { - auto result = hnsw->searchKnn(point, efSearch); - size_t nResults = result.size(); - *results = (label_t*)malloc(nResults*sizeof(label_t)); - for (size_t i = nResults; i-- != 0;) - { - (*results)[i] = result.top().second; - result.pop(); - } - *n_results = nResults; - return true; - } - catch (std::exception& x) - { - return false; - } -} - -bool hnsw_add_point(HierarchicalNSW* hnsw, const coord_t *point, label_t label) -{ - try - { - hnsw->addPoint(point, label); - return true; - } - catch (std::exception& x) - { - fprintf(stderr, "Catch %s\n", x.what()); - return false; - } -} - -void hnsw_init(HierarchicalNSW* hnsw, size_t dims, size_t maxelements, size_t M, size_t maxM, size_t efConstruction) -{ - new ((void*)hnsw) HierarchicalNSW(dims, maxelements, M, maxM, efConstruction); -} - - -int hnsw_dimensions(HierarchicalNSW* hnsw) -{ - return (int)hnsw->dim; -} - -size_t hnsw_count(HierarchicalNSW* hnsw) -{ - return hnsw->cur_element_count; -} - -size_t hnsw_sizeof(void) -{ - return sizeof(HierarchicalNSW); -} diff --git a/pgxn/hnsw/hnswalg.h b/pgxn/hnsw/hnswalg.h deleted file mode 100644 index f38aeac362..0000000000 --- a/pgxn/hnsw/hnswalg.h +++ /dev/null @@ -1,69 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -extern "C" { -#include "hnsw.h" -} - -struct HierarchicalNSW -{ - size_t maxelements; - size_t cur_element_count; - - idx_t enterpoint_node; - - size_t dim; - size_t data_size; - size_t offset_data; - size_t offset_label; - size_t size_data_per_element; - size_t M; - size_t maxM; - size_t size_links_level0; - size_t efConstruction; - -#ifdef __x86_64__ - bool use_avx2; -#endif - - char data_level0_memory[0]; // varying size - - public: - HierarchicalNSW(size_t dim, size_t maxelements, size_t M, size_t maxM, size_t efConstruction); - ~HierarchicalNSW(); - - - inline coord_t *getDataByInternalId(idx_t internal_id) const { - return (coord_t *)&data_level0_memory[internal_id * size_data_per_element + offset_data]; - } - - inline idx_t *get_linklist0(idx_t internal_id) const { - return (idx_t*)&data_level0_memory[internal_id * size_data_per_element]; - } - - inline label_t *getExternalLabel(idx_t internal_id) const { - return (label_t *)&data_level0_memory[internal_id * size_data_per_element + offset_label]; - } - - std::priority_queue> searchBaseLayer(const coord_t *x, size_t ef); - - void getNeighborsByHeuristic(std::priority_queue> &topResults, size_t NN); - - void mutuallyConnectNewElement(const coord_t *x, idx_t id, std::priority_queue> topResults); - - void addPoint(const coord_t *point, label_t label); - - std::priority_queue> searchKnn(const coord_t *query_data, size_t k); - - dist_t fstdistfunc(const coord_t *x, const coord_t *y); -}; diff --git a/pgxn/hnsw/test/expected/knn.out b/pgxn/hnsw/test/expected/knn.out deleted file mode 100644 index a1cee4525e..0000000000 --- a/pgxn/hnsw/test/expected/knn.out +++ /dev/null @@ -1,28 +0,0 @@ -SET enable_seqscan = off; -CREATE TABLE t (val real[]); -INSERT INTO t (val) VALUES ('{0,0,0}'), ('{1,2,3}'), ('{1,1,1}'), (NULL); -CREATE INDEX ON t USING hnsw (val) WITH (maxelements = 10, dims=3, m=3); -INSERT INTO t (val) VALUES (array[1,2,4]); -explain SELECT * FROM t ORDER BY val <-> array[3,3,3]; - QUERY PLAN --------------------------------------------------------------------- - Index Scan using t_val_idx on t (cost=4.02..8.06 rows=3 width=36) - Order By: (val <-> '{3,3,3}'::real[]) -(2 rows) - -SELECT * FROM t ORDER BY val <-> array[3,3,3]; - val 
---------- - {1,2,3} - {1,2,4} - {1,1,1} - {0,0,0} -(4 rows) - -SELECT COUNT(*) FROM t; - count -------- - 5 -(1 row) - -DROP TABLE t; diff --git a/pgxn/hnsw/test/sql/knn.sql b/pgxn/hnsw/test/sql/knn.sql deleted file mode 100644 index 0635bda4a2..0000000000 --- a/pgxn/hnsw/test/sql/knn.sql +++ /dev/null @@ -1,13 +0,0 @@ -SET enable_seqscan = off; - -CREATE TABLE t (val real[]); -INSERT INTO t (val) VALUES ('{0,0,0}'), ('{1,2,3}'), ('{1,1,1}'), (NULL); -CREATE INDEX ON t USING hnsw (val) WITH (maxelements = 10, dims=3, m=3); - -INSERT INTO t (val) VALUES (array[1,2,4]); - -explain SELECT * FROM t ORDER BY val <-> array[3,3,3]; -SELECT * FROM t ORDER BY val <-> array[3,3,3]; -SELECT COUNT(*) FROM t; - -DROP TABLE t; From 8327f68043e692c77f70d6a6dafa463636c01578 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sat, 11 Jan 2025 19:39:27 +0200 Subject: [PATCH 160/583] Minor cleanup of extension build commands (#10356) There used to be some pg version dependencies in these extensions, but now that there isn't, follow the simpler pattern used in other extensions. No change in the produced images. --- compute/compute-node.Dockerfile | 34 ++++----------------------------- 1 file changed, 4 insertions(+), 30 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index f4df507b74..449e12af5d 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -976,22 +976,9 @@ RUN apt update && apt install --no-install-recommends --no-install-suggests -y p FROM rust-extensions-build-pgrx12 AS pg-jsonschema-pg-build ARG PG_VERSION -# version 0.3.3 supports v17 # last release v0.3.3 - Oct 16, 2024 -# -# there were no breaking changes -# so we can use the same version for all postgres versions -RUN case "${PG_VERSION}" in \ - "v14" | "v15" | "v16" | "v17") \ - export PG_JSONSCHEMA_VERSION=0.3.3 \ - export PG_JSONSCHEMA_CHECKSUM=40c2cffab4187e0233cb8c3bde013be92218c282f95f4469c5282f6b30d64eac \ - ;; \ - *) \ - echo "unexpected PostgreSQL version" && exit 1 \ - ;; \ - esac && \ - wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v${PG_JSONSCHEMA_VERSION}.tar.gz -O pg_jsonschema.tar.gz && \ - echo "${PG_JSONSCHEMA_CHECKSUM} pg_jsonschema.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.3.tar.gz -O pg_jsonschema.tar.gz && \ + echo "40c2cffab4187e0233cb8c3bde013be92218c282f95f4469c5282f6b30d64eac pg_jsonschema.tar.gz" | sha256sum --check && \ mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xzf ../pg_jsonschema.tar.gz --strip-components=1 -C . 
&& \ # see commit 252b3685a27a0f4c31a0f91e983c6314838e89e8 # `unsafe-postgres` feature allows to build pgx extensions @@ -1012,22 +999,9 @@ RUN case "${PG_VERSION}" in \ FROM rust-extensions-build-pgrx12 AS pg-graphql-pg-build ARG PG_VERSION -# version 1.5.9 supports v17 # last release v1.5.9 - Oct 16, 2024 -# -# there were no breaking changes -# so we can use the same version for all postgres versions -RUN case "${PG_VERSION}" in \ - "v14" | "v15" | "v16" | "v17") \ - export PG_GRAPHQL_VERSION=1.5.9 \ - export PG_GRAPHQL_CHECKSUM=cf768385a41278be1333472204fc0328118644ae443182cf52f7b9b23277e497 \ - ;; \ - *) \ - echo "unexpected PostgreSQL version" && exit 1 \ - ;; \ - esac && \ - wget https://github.com/supabase/pg_graphql/archive/refs/tags/v${PG_GRAPHQL_VERSION}.tar.gz -O pg_graphql.tar.gz && \ - echo "${PG_GRAPHQL_CHECKSUM} pg_graphql.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.5.9.tar.gz -O pg_graphql.tar.gz && \ + echo "cf768385a41278be1333472204fc0328118644ae443182cf52f7b9b23277e497 pg_graphql.tar.gz" | sha256sum --check && \ mkdir pg_graphql-src && cd pg_graphql-src && tar xzf ../pg_graphql.tar.gz --strip-components=1 -C . && \ sed -i 's/pgrx = "=0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release && \ From cd982a82ecfd2d96c7251ff7203df38d0ed44539 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 13 Jan 2025 10:44:59 +0100 Subject: [PATCH 161/583] pageserver,safekeeper: increase heap profiling frequency to 2 MB (#10362) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem Currently, the heap profiling frequency is every 1 MB allocated. Taking a profile stack trace takes about 1 µs, and allocating 1 MB takes about 15 µs, so the overhead is about 6.7% which is a bit high. This is a fixed cost regardless of whether heap profiles are actually accessed. ## Summary of changes Increase the heap profiling sample frequency from 1 MB to 2 MB, which reduces the overhead to about 3.3%. This seems acceptable, considering performance-sensitive code will avoid allocations as far as possible anyway. --- pageserver/src/bin/pageserver.rs | 6 ++++-- safekeeper/benches/receive_wal.rs | 5 ++--- safekeeper/src/bin/safekeeper.rs | 6 ++++-- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 567a69da3b..921c6a5092 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -53,10 +53,12 @@ project_build_tag!(BUILD_TAG); #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; -/// Configure jemalloc to sample allocations for profiles every 1 MB (1 << 20). +/// Configure jemalloc to profile heap allocations by sampling stack traces every 2 MB (1 << 21). +/// This adds roughly 3% overhead for allocations on average, which is acceptable considering +/// performance-sensitive code will avoid allocations as far as possible anyway. 
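// Rough illustrative sketch, not part of the patch itself: a back-of-the-envelope
// check of the overhead figure quoted in the doc comment above, using only the
// commit message's own estimates (~1 µs per sampled stack trace, ~15 µs per 1 MB allocated).
fn _heap_profiling_overhead_estimate() {
    let sample_interval_bytes = 1u64 << 21; // lg_prof_sample:21 => one sample per 2 MiB
    assert_eq!(sample_interval_bytes, 2 * 1024 * 1024);
    let stack_capture_us = 1.0_f64; // assumed cost of capturing one stack trace
    let alloc_us_per_mb = 15.0_f64; // assumed cost of allocating 1 MB
    let samples_per_mb = 0.5_f64;   // one sample per 2 MB allocated
    let overhead = stack_capture_us * samples_per_mb / alloc_us_per_mb;
    assert!((overhead - 0.033).abs() < 0.001); // roughly 3.3%, matching the comment above
}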
#[allow(non_upper_case_globals)] #[export_name = "malloc_conf"] -pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0"; +pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0"; const PID_FILE_NAME: &str = "pageserver.pid"; diff --git a/safekeeper/benches/receive_wal.rs b/safekeeper/benches/receive_wal.rs index 996c4d9b8c..19c6662e74 100644 --- a/safekeeper/benches/receive_wal.rs +++ b/safekeeper/benches/receive_wal.rs @@ -21,14 +21,13 @@ const KB: usize = 1024; const MB: usize = 1024 * KB; const GB: usize = 1024 * MB; -/// Use jemalloc, and configure it to sample allocations for profiles every 1 MB. -/// This mirrors the configuration in bin/safekeeper.rs. +/// Use jemalloc and enable profiling, to mirror bin/safekeeper.rs. #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; #[allow(non_upper_case_globals)] #[export_name = "malloc_conf"] -pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0"; +pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0"; // Register benchmarks with Criterion. criterion_group!( diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 13f6e34575..bc7af02185 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -51,10 +51,12 @@ use utils::{ #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; -/// Configure jemalloc to sample allocations for profiles every 1 MB (1 << 20). +/// Configure jemalloc to profile heap allocations by sampling stack traces every 2 MB (1 << 21). +/// This adds roughly 3% overhead for allocations on average, which is acceptable considering +/// performance-sensitive code will avoid allocations as far as possible anyway. #[allow(non_upper_case_globals)] #[export_name = "malloc_conf"] -pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0"; +pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0"; const PID_FILE_NAME: &str = "safekeeper.pid"; const ID_FILE_NAME: &str = "safekeeper.id"; From 22a6460010490857c927d1218b21f81d2fb0c06d Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 13 Jan 2025 11:01:18 +0100 Subject: [PATCH 162/583] libs/utils: add `force` parameter for `/profile/cpu` (#10361) ## Problem It's only possible to take one CPU profile at a time. With Grafana continuous profiling, a (low-frequency) CPU profile will always be running, making it hard to take an ad hoc CPU profile at the same time. Resolves #10072. ## Summary of changes Add a `force` parameter for `/profile/cpu` which will end and return an already running CPU profile, starting a new one for the current caller. 
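A minimal, self-contained sketch of the locking scheme described above, condensed from the diff that follows (it assumes `tokio` and `once_cell` as in the patch, and stands in plain strings for the real handler's HTTP response and pprof report):

```rust
use std::time::Duration;

use once_cell::sync::Lazy;
use tokio::sync::{Mutex, Notify};

static PROFILE_LOCK: Lazy<Mutex<()>> = Lazy::new(|| Mutex::new(()));
static PROFILE_CANCEL: Lazy<Notify> = Lazy::new(Notify::new);

/// Simplified stand-in for the `/profile/cpu` handler: with `force`, a caller
/// cancels a profile that is already running instead of getting a conflict error.
async fn take_profile(seconds: u64, force: bool) -> Result<&'static str, &'static str> {
    let _lock = loop {
        match PROFILE_LOCK.try_lock() {
            Ok(lock) => break lock,
            Err(_) if force => PROFILE_CANCEL.notify_waiters(),
            Err(_) => return Err("profiler already running"),
        }
        tokio::time::sleep(Duration::from_millis(1)).await; // don't busy-wait
    };
    tokio::select! {
        _ = tokio::time::sleep(Duration::from_secs(seconds)) => Ok("ran to completion"),
        _ = PROFILE_CANCEL.notified() => Ok("ended early by a forcing caller"),
    }
}

#[tokio::main]
async fn main() {
    let slow = tokio::spawn(take_profile(30, false));
    tokio::time::sleep(Duration::from_millis(100)).await; // let the first caller start
    assert_eq!(take_profile(1, true).await, Ok("ran to completion"));
    assert_eq!(slow.await.unwrap(), Ok("ended early by a forcing caller"));
}
```

The `try_lock()` loop (rather than a plain `lock().await`) is what avoids the race noted in the patch: `notify_waiters()` only wakes tasks already parked on `notified()`, so the forcing caller keeps re-notifying until the running profile actually releases the lock.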
--- libs/utils/src/http/endpoint.rs | 46 ++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index 4b4aa88d6b..a6ba685447 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -15,7 +15,7 @@ use once_cell::sync::Lazy; use regex::Regex; use routerify::ext::RequestExt; use routerify::{Middleware, RequestInfo, Router, RouterBuilder}; -use tokio::sync::{mpsc, Mutex}; +use tokio::sync::{mpsc, Mutex, Notify}; use tokio_stream::wrappers::ReceiverStream; use tokio_util::io::ReaderStream; use tracing::{debug, info, info_span, warn, Instrument}; @@ -358,25 +358,41 @@ pub async fn profile_cpu_handler(req: Request) -> Result, A Some(1001..) => return Err(ApiError::BadRequest(anyhow!("frequency must be <=1000 Hz"))), Some(frequency) => frequency, }; - - // Only allow one profiler at a time. - static PROFILE_LOCK: Lazy> = Lazy::new(|| Mutex::new(())); - let _lock = PROFILE_LOCK - .try_lock() - .map_err(|_| ApiError::Conflict("profiler already running".into()))?; + let force: bool = parse_query_param(&req, "force")?.unwrap_or_default(); // Take the profile. - let report = tokio::task::spawn_blocking(move || { + static PROFILE_LOCK: Lazy> = Lazy::new(|| Mutex::new(())); + static PROFILE_CANCEL: Lazy = Lazy::new(Notify::new); + + let report = { + // Only allow one profiler at a time. If force is true, cancel a running profile (e.g. a + // Grafana continuous profile). We use a try_lock() loop when cancelling instead of waiting + // for a lock(), to avoid races where the notify isn't currently awaited. + let _lock = loop { + match PROFILE_LOCK.try_lock() { + Ok(lock) => break lock, + Err(_) if force => PROFILE_CANCEL.notify_waiters(), + Err(_) => return Err(ApiError::Conflict("profiler already running".into())), + } + tokio::time::sleep(Duration::from_millis(1)).await; // don't busy-wait + }; + let guard = ProfilerGuardBuilder::default() .frequency(frequency_hz) .blocklist(&["libc", "libgcc", "pthread", "vdso"]) - .build()?; - std::thread::sleep(Duration::from_secs(seconds)); - guard.report().build() - }) - .await - .map_err(|join_err| ApiError::InternalServerError(join_err.into()))? - .map_err(|pprof_err| ApiError::InternalServerError(pprof_err.into()))?; + .build() + .map_err(|err| ApiError::InternalServerError(err.into()))?; + + tokio::select! { + _ = tokio::time::sleep(Duration::from_secs(seconds)) => {}, + _ = PROFILE_CANCEL.notified() => {}, + }; + + guard + .report() + .build() + .map_err(|err| ApiError::InternalServerError(err.into()))? + }; // Return the report in the requested format. match format { From de199d71e18888b059cc8846387d933b103f323e Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 13 Jan 2025 10:34:36 +0000 Subject: [PATCH 163/583] chore: Address lints introduced in rust 1.85.0 beta (#10340) With a new beta build of the rust compiler, it's good to check out the new lints. Either to find false positives, or find flaws in our code. Additionally, it helps reduce the effort required to update to 1.85 in 6 weeks. 
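Several of the mechanical changes in the diff below just add explicit parentheses around shift-then-OR expressions (`pq_proto`, `lsn.rs`, `disk_btree.rs`, `walingest.rs`). A small sketch of why those edits are behavior-preserving (the function name here is illustrative, not taken from the diff): in Rust, `<<` binds more tightly than `|`, so the added parentheses only spell out the grouping the compiler was already using, which is enough to quiet the new precedence lint.

```rust
fn make_lsn(hi: u32, lo: u32) -> u64 {
    // Implicit grouping: shift binds tighter than bitwise OR, so this is already
    // parsed as ((hi as u64) << 32) | (lo as u64).
    let implicit = (hi as u64) << 32 | lo as u64;
    // Explicit grouping, as written after this patch; same value, lint-clean.
    let explicit = ((hi as u64) << 32) | lo as u64;
    assert_eq!(implicit, explicit);
    explicit
}

fn main() {
    assert_eq!(make_lsn(0x1, 0x2), 0x0000_0001_0000_0002);
}
```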
--- control_plane/src/local_env.rs | 1 - libs/pq_proto/src/lib.rs | 2 +- libs/utils/src/generation.rs | 6 +++--- libs/utils/src/lsn.rs | 2 +- pageserver/src/tenant/config.rs | 4 ++-- pageserver/src/tenant/disk_btree.rs | 10 +++++----- pageserver/src/tenant/storage_layer/inmemory_layer.rs | 4 ++-- .../timeline/walreceiver/walreceiver_connection.rs | 2 +- pageserver/src/walingest.rs | 2 +- 9 files changed, 16 insertions(+), 17 deletions(-) diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 5b82acb3a5..2fe4cd5202 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -483,7 +483,6 @@ impl LocalEnv { .iter() .find(|(mapped_tenant_id, _)| mapped_tenant_id == &tenant_id) .map(|&(_, timeline_id)| timeline_id) - .map(TimelineId::from) } pub fn timeline_name_mappings(&self) -> HashMap { diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index 94714359a3..50b2c69d24 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -44,7 +44,7 @@ pub struct ProtocolVersion(u32); impl ProtocolVersion { pub const fn new(major: u16, minor: u16) -> Self { - Self((major as u32) << 16 | minor as u32) + Self(((major as u32) << 16) | minor as u32) } pub const fn minor(self) -> u16 { self.0 as u16 diff --git a/libs/utils/src/generation.rs b/libs/utils/src/generation.rs index 5970836033..44565ee6a2 100644 --- a/libs/utils/src/generation.rs +++ b/libs/utils/src/generation.rs @@ -112,9 +112,9 @@ impl Serialize for Generation { // We should never be asked to serialize a None. Structures // that include an optional generation should convert None to an // Option::None - Err(serde::ser::Error::custom( - "Tried to serialize invalid generation ({self})", - )) + Err(serde::ser::Error::custom(format!( + "Tried to serialize invalid generation ({self:?})" + ))) } } } diff --git a/libs/utils/src/lsn.rs b/libs/utils/src/lsn.rs index f188165600..c874fa30ff 100644 --- a/libs/utils/src/lsn.rs +++ b/libs/utils/src/lsn.rs @@ -260,7 +260,7 @@ impl FromStr for Lsn { { let left_num = u32::from_str_radix(left, 16).map_err(|_| LsnParseError)?; let right_num = u32::from_str_radix(right, 16).map_err(|_| LsnParseError)?; - Ok(Lsn((left_num as u64) << 32 | right_num as u64)) + Ok(Lsn(((left_num as u64) << 32) | right_num as u64)) } else { Err(LsnParseError) } diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index d54dded778..edf2e6a3aa 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -11,7 +11,7 @@ pub(crate) use pageserver_api::config::TenantConfigToml as TenantConf; use pageserver_api::models::CompactionAlgorithmSettings; use pageserver_api::models::EvictionPolicy; -use pageserver_api::models::{self, TenantConfigPatch, ThrottleConfig}; +use pageserver_api::models::{self, TenantConfigPatch}; use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize}; use serde::de::IntoDeserializer; use serde::{Deserialize, Serialize}; @@ -597,7 +597,7 @@ impl From for models::TenantConfig { .map(humantime), heatmap_period: value.heatmap_period.map(humantime), lazy_slru_download: value.lazy_slru_download, - timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from), + timeline_get_throttle: value.timeline_get_throttle, image_layer_creation_check_threshold: value.image_layer_creation_check_threshold, lsn_lease_length: value.lsn_lease_length.map(humantime), lsn_lease_length_for_ts: value.lsn_lease_length_for_ts.map(humantime), diff --git 
a/pageserver/src/tenant/disk_btree.rs b/pageserver/src/tenant/disk_btree.rs index c77342b144..bb9df020b5 100644 --- a/pageserver/src/tenant/disk_btree.rs +++ b/pageserver/src/tenant/disk_btree.rs @@ -84,17 +84,17 @@ impl Value { fn to_u64(self) -> u64 { let b = &self.0; - (b[0] as u64) << 32 - | (b[1] as u64) << 24 - | (b[2] as u64) << 16 - | (b[3] as u64) << 8 + ((b[0] as u64) << 32) + | ((b[1] as u64) << 24) + | ((b[2] as u64) << 16) + | ((b[3] as u64) << 8) | b[4] as u64 } fn to_blknum(self) -> u32 { let b = &self.0; assert!(b[0] == 0x80); - (b[1] as u32) << 24 | (b[2] as u32) << 16 | (b[3] as u32) << 8 | b[4] as u32 + ((b[1] as u32) << 24) | ((b[2] as u32) << 16) | ((b[3] as u32) << 8) | b[4] as u32 } } diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 71e53da20f..2b67f55a17 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -112,8 +112,8 @@ const MAX_SUPPORTED_BLOB_LEN_BITS: usize = { /// /// Layout: /// - 1 bit: `will_init` -/// - [`MAX_SUPPORTED_BLOB_LEN_BITS`]: `len` -/// - [`MAX_SUPPORTED_POS_BITS`]: `pos` +/// - [`MAX_SUPPORTED_BLOB_LEN_BITS`][]: `len` +/// - [`MAX_SUPPORTED_POS_BITS`](IndexEntry::MAX_SUPPORTED_POS_BITS): `pos` #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct IndexEntry(u64); diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index d74faa1af5..3a8796add8 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -403,7 +403,7 @@ pub(super) async fn handle_walreceiver_connection( // need to advance last record LSN on all shards. If we've not ingested the latest // record, then set the LSN of the modification past it. This way all shards // advance their last record LSN at the same time. - let needs_last_record_lsn_advance = match next_record_lsn.map(Lsn::from) { + let needs_last_record_lsn_advance = match next_record_lsn { Some(lsn) if lsn > modification.get_lsn() => { modification.set_lsn(lsn).unwrap(); true diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index e5b23fed51..7253af8507 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -308,7 +308,7 @@ impl WalIngest { epoch -= 1; } - Ok((epoch as u64) << 32 | xid as u64) + Ok(((epoch as u64) << 32) | xid as u64) } async fn ingest_clear_vm_bits( From 12053cf83260da4da46f4039d5bbfb4cbb8e57da Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 13 Jan 2025 11:18:14 +0000 Subject: [PATCH 164/583] storage controller: improve consistency_check_api (#10363) ## Problem Limitations found while using this to investigate https://github.com/neondatabase/neon/issues/10234: - If we hit a node consistency issue, we drop out and don't check shards for consistency - The messages printed after a shard consistency issue are huge, and grafana appears to drop them. 
## Summary of changes - Defer node consistency errors until the end of the function, so that we always proceed to check shards for consistency - Print out smaller log lines that just point out the diffs between expected and persistent state --- storage_controller/src/service.rs | 59 ++++++++++++++++++++++++++++--- 1 file changed, 54 insertions(+), 5 deletions(-) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 265b2798d2..430d884548 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -5256,7 +5256,8 @@ impl Service { expect_nodes.sort_by_key(|n| n.node_id); nodes.sort_by_key(|n| n.node_id); - if nodes != expect_nodes { + // Errors relating to nodes are deferred so that we don't skip the shard checks below if we have a node error + let node_result = if nodes != expect_nodes { tracing::error!("Consistency check failed on nodes."); tracing::error!( "Nodes in memory: {}", @@ -5268,10 +5269,12 @@ impl Service { serde_json::to_string(&nodes) .map_err(|e| ApiError::InternalServerError(e.into()))? ); - return Err(ApiError::InternalServerError(anyhow::anyhow!( + Err(ApiError::InternalServerError(anyhow::anyhow!( "Node consistency failure" - ))); - } + ))) + } else { + Ok(()) + }; let mut persistent_shards = self.persistence.load_active_tenant_shards().await?; persistent_shards @@ -5281,6 +5284,7 @@ impl Service { if persistent_shards != expect_shards { tracing::error!("Consistency check failed on shards."); + tracing::error!( "Shards in memory: {}", serde_json::to_string(&expect_shards) @@ -5291,12 +5295,57 @@ impl Service { serde_json::to_string(&persistent_shards) .map_err(|e| ApiError::InternalServerError(e.into()))? ); + + // The total dump log lines above are useful in testing but in the field grafana will + // usually just drop them because they're so large. So we also do some explicit logging + // of just the diffs. + let persistent_shards = persistent_shards + .into_iter() + .map(|tsp| (tsp.get_tenant_shard_id().unwrap(), tsp)) + .collect::>(); + let expect_shards = expect_shards + .into_iter() + .map(|tsp| (tsp.get_tenant_shard_id().unwrap(), tsp)) + .collect::>(); + for (tenant_shard_id, persistent_tsp) in &persistent_shards { + match expect_shards.get(tenant_shard_id) { + None => { + tracing::error!( + "Shard {} found in database but not in memory", + tenant_shard_id + ); + } + Some(expect_tsp) => { + if expect_tsp != persistent_tsp { + tracing::error!( + "Shard {} is inconsistent. In memory: {}, database has: {}", + tenant_shard_id, + serde_json::to_string(expect_tsp).unwrap(), + serde_json::to_string(&persistent_tsp).unwrap() + ); + } + } + } + } + + // Having already logged any differences, log any shards that simply aren't present in the database + for (tenant_shard_id, memory_tsp) in &expect_shards { + if !persistent_shards.contains_key(tenant_shard_id) { + tracing::error!( + "Shard {} found in memory but not in database: {}", + tenant_shard_id, + serde_json::to_string(memory_tsp) + .map_err(|e| ApiError::InternalServerError(e.into()))? + ); + } + } + return Err(ApiError::InternalServerError(anyhow::anyhow!( "Shard consistency failure" ))); } - Ok(()) + node_result } /// For debug/support: a JSON dump of the [`Scheduler`]. Returns a response so that From 09fe3b025c42e56d78812e1ff329551e2da17720 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 13 Jan 2025 13:35:39 +0200 Subject: [PATCH 165/583] Add a websockets tunnel and a test for the proxy's websockets support. 
(#3823) For testing the proxy's websockets support. I wrote this to test https://github.com/neondatabase/neon/issues/3822. Unfortunately, that bug can *not* be reproduced with this tunnel. The bug only appears when the client pipelines the first query with the authentication messages. The tunnel doesn't do that. --- Update (@conradludgate 2025-01-10): We have since added some websocket tests, but they manually implemented a very simplistic setup of the postgres protocol. Introducing the tunnel would make more complex testing simpler in the future. --------- Co-authored-by: Conrad Ludgate --- test_runner/regress/test_proxy_websockets.py | 55 +++++++ test_runner/websocket_tunnel.py | 154 +++++++++++++++++++ 2 files changed, 209 insertions(+) create mode 100755 test_runner/websocket_tunnel.py diff --git a/test_runner/regress/test_proxy_websockets.py b/test_runner/regress/test_proxy_websockets.py index ea01252ce4..f14317a39f 100644 --- a/test_runner/regress/test_proxy_websockets.py +++ b/test_runner/regress/test_proxy_websockets.py @@ -1,10 +1,15 @@ from __future__ import annotations +import asyncio import ssl +import asyncpg import pytest +import websocket_tunnel import websockets +from fixtures.log_helper import log from fixtures.neon_fixtures import NeonProxy +from fixtures.port_distributor import PortDistributor @pytest.mark.asyncio @@ -196,3 +201,53 @@ async def test_websockets_pipelined(static_proxy: NeonProxy): # close await websocket.send(b"X\x00\x00\x00\x04") await websocket.wait_closed() + + +@pytest.mark.asyncio +async def test_websockets_tunneled(static_proxy: NeonProxy, port_distributor: PortDistributor): + static_proxy.safe_psql("create user ws_auth with password 'ws' superuser") + + user = "ws_auth" + password = "ws" + + ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT) + ssl_context.load_verify_locations(str(static_proxy.test_output_dir / "proxy.crt")) + + # Launch a tunnel service so that we can speak the websockets protocol to + # the proxy + tunnel_port = port_distributor.get_port() + tunnel_server = await websocket_tunnel.start_server( + "127.0.0.1", + tunnel_port, + f"wss://{static_proxy.domain}:{static_proxy.external_http_port}/sql", + ssl_context, + ) + log.info(f"websockets tunnel listening for connections on port {tunnel_port}") + + async with tunnel_server: + + async def run_tunnel(): + try: + async with tunnel_server: + await tunnel_server.serve_forever() + except Exception as e: + log.error(f"Error in tunnel task: {e}") + + tunnel_task = asyncio.create_task(run_tunnel()) + + # Ok, the tunnel is now running. Check that we can connect to the proxy's + # websocket interface, through the tunnel + tunnel_connstring = f"postgres://{user}:{password}@127.0.0.1:{tunnel_port}/postgres" + + log.info(f"connecting to {tunnel_connstring}") + conn = await asyncpg.connect(tunnel_connstring) + res = await conn.fetchval("SELECT 123") + assert res == 123 + await conn.close() + log.info("Ran a query successfully through the tunnel") + + tunnel_server.close() + try: + await tunnel_task + except asyncio.CancelledError: + pass diff --git a/test_runner/websocket_tunnel.py b/test_runner/websocket_tunnel.py new file mode 100755 index 0000000000..facdb19140 --- /dev/null +++ b/test_runner/websocket_tunnel.py @@ -0,0 +1,154 @@ +#!/usr/bin/env python3 +# +# This program helps to test the WebSocket tunneling in proxy. 
It listens for a TCP +# connection on a port, and when you connect to it, it opens a websocket connection, +# and forwards all the traffic to the websocket connection, wrapped in WebSocket binary +# frames. +# +# This is used in the test_proxy::test_websockets test, but it is handy for manual testing too. +# +# Usage for manual testing: +# +# ## Launch Posgres on port 3000: +# postgres -D data -p3000 +# +# ## Launch proxy with WSS enabled: +# openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key -subj '/CN=*.neon.localtest.me' +# ./target/debug/proxy --wss 127.0.0.1:40433 --http 127.0.0.1:28080 --mgmt 127.0.0.1:9099 --proxy 127.0.0.1:4433 --tls-key server.key --tls-cert server.crt --auth-backend postgres +# +# ## Launch the tunnel: +# +# poetry run ./test_runner/websocket_tunnel.py --ws-port 40433 --ws-url "wss://ep-test.neon.localtest.me" +# +# ## Now you can connect with psql: +# psql "postgresql://heikki@localhost:40433/postgres" +# + +import argparse +import asyncio +import logging +import ssl +from ssl import Purpose + +import websockets +from fixtures.log_helper import log + + +# Enable verbose logging of all the traffic +def enable_verbose_logging(): + logger = logging.getLogger("websockets") + logger.setLevel(logging.DEBUG) + logger.addHandler(logging.StreamHandler()) + + +async def start_server(tcp_listen_host, tcp_listen_port, ws_url, ctx): + server = await asyncio.start_server( + lambda r, w: handle_client(r, w, ws_url, ctx), tcp_listen_host, tcp_listen_port + ) + return server + + +async def handle_tcp_to_websocket(tcp_reader, ws): + try: + while not tcp_reader.at_eof(): + data = await tcp_reader.read(1024) + + await ws.send(data) + except websockets.exceptions.ConnectionClosedError as e: + log.debug(f"connection closed: {e}") + except websockets.exceptions.ConnectionClosedOK: + log.debug("connection closed") + except Exception as e: + log.error(e) + + +async def handle_websocket_to_tcp(ws, tcp_writer): + try: + async for message in ws: + tcp_writer.write(message) + await tcp_writer.drain() + except websockets.exceptions.ConnectionClosedError as e: + log.debug(f"connection closed: {e}") + except websockets.exceptions.ConnectionClosedOK: + log.debug("connection closed") + except Exception as e: + log.error(e) + + +async def handle_client(tcp_reader, tcp_writer, ws_url: str, ctx: ssl.SSLContext): + try: + log.info("Received TCP connection. Connecting to websockets proxy.") + + async with websockets.connect(ws_url, ssl=ctx) as ws: + try: + log.info("Connected to websockets proxy") + + async with asyncio.TaskGroup() as tg: + task1 = tg.create_task(handle_tcp_to_websocket(tcp_reader, ws)) + task2 = tg.create_task(handle_websocket_to_tcp(ws, tcp_writer)) + + done, pending = await asyncio.wait( + [task1, task2], return_when=asyncio.FIRST_COMPLETED + ) + tcp_writer.close() + await ws.close() + + except* Exception as ex: + log.error(ex.exceptions) + except Exception as e: + log.error(e) + + +async def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--tcp-listen-addr", + default="localhost", + help="TCP addr to listen on", + ) + parser.add_argument( + "--tcp-listen-port", + default="40444", + help="TCP port to listen on", + ) + + parser.add_argument( + "--ws-url", + default="wss://localhost/", + help="websocket URL to connect to. 
This determines the Host header sent to the server", + ) + parser.add_argument( + "--ws-host", + default="127.0.0.1", + help="websockets host to connect to", + ) + parser.add_argument( + "--ws-port", + type=int, + default=443, + help="websockets port to connect to", + ) + parser.add_argument( + "--verbose", + action="store_true", + help="enable verbose logging", + ) + args = parser.parse_args() + + if args.verbose: + enable_verbose_logging() + + ctx = ssl.create_default_context(Purpose.SERVER_AUTH) + ctx.check_hostname = False + ctx.verify_mode = ssl.CERT_NONE + + server = await start_server(args.tcp_listen_addr, args.tcp_listen_port, args.ws_url, ctx) + print( + f"Listening for connections at {args.tcp_listen_addr}:{args.tcp_listen_port}, forwarding them to {args.ws_host}:{args.ws_port}" + ) + async with server: + await server.serve_forever() + + +if __name__ == "__main__": + asyncio.run(main()) From 0b9032065ef1e140ad54e09e81aaf00a1b809e87 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 13 Jan 2025 14:14:23 +0100 Subject: [PATCH 166/583] utils: allow 60-second CPU profiles (#10367) Taking continuous profiles every 20 seconds is likely too expensive (in dollar terms). Let's try 60-second profiles. We can now interrupt running profiles via `?force=true`, so this should be fine. --- libs/utils/src/http/endpoint.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index a6ba685447..ca65c39ad6 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -350,8 +350,8 @@ pub async fn profile_cpu_handler(req: Request) -> Result, A }; let seconds = match parse_query_param(&req, "seconds")? { None => 5, - Some(seconds @ 1..=30) => seconds, - Some(_) => return Err(ApiError::BadRequest(anyhow!("duration must be 1-30 secs"))), + Some(seconds @ 1..=60) => seconds, + Some(_) => return Err(ApiError::BadRequest(anyhow!("duration must be 1-60 secs"))), }; let frequency_hz = match parse_query_param(&req, "frequency")? { None => 99, From d1bc36f53688452dfacb7e38c546d49fce73c9f3 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 13 Jan 2025 13:31:57 +0000 Subject: [PATCH 167/583] storage controller: fix retries of compute hook notifications while a secondary node is offline (#10352) ## Problem We would sometimes fail to retry compute notifications: 1. Try and send, set compute_notify_failure if we can't 2. On next reconcile, reconcile() fails for some other reason (e.g. tried to talk to an offline node), and we fail the `result.is_ok() && must_notify` condition around the re-sending. Closes: https://github.com/neondatabase/cloud/issues/22612 ## Summary of changes - Clarify the meaning of the reconcile result: it should be Ok(()) if configuring attached location worked, even if secondary or detach locations cannot be reached. 
- Skip trying to talk to secondaries if they're offline - Even if reconcile fails and we can't send the compute notification (we can't send it because we're not sure if it's really attached), make sure we save the `compute_notify_failure` flag so that subsequent reconciler runs will try again - Add a regression test for the above --- storage_controller/src/reconciler.rs | 21 +++- storage_controller/src/tenant_shard.rs | 7 +- .../regress/test_storage_controller.py | 116 ++++++++++++++++++ 3 files changed, 139 insertions(+), 5 deletions(-) diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index e0a854fff7..6f584e7267 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -696,6 +696,11 @@ impl Reconciler { /// First we apply special case handling (e.g. for live migrations), and then a /// general case reconciliation where we walk through the intent by pageserver /// and call out to the pageserver to apply the desired state. + /// + /// An Ok(()) result indicates that we successfully attached the tenant, but _not_ that + /// all locations for the tenant are in the expected state. When nodes that are to be detached + /// or configured as secondary are unavailable, we may return Ok(()) but leave the shard in a + /// state where it still requires later reconciliation. pub(crate) async fn reconcile(&mut self) -> Result<(), ReconcileError> { // Prepare: if we have uncertain `observed` state for our would-be attachement location, then refresh it self.maybe_refresh_observed().await?; @@ -784,10 +789,18 @@ impl Reconciler { tracing::info!(node_id=%node.get_id(), "Observed configuration already correct.") } _ => { - // In all cases other than a matching observed configuration, we will - // reconcile this location. - tracing::info!(node_id=%node.get_id(), "Observed configuration requires update."); - changes.push((node.clone(), wanted_conf)) + // Only try and configure secondary locations on nodes that are available. This + // allows the reconciler to "succeed" while some secondaries are offline (e.g. after + // a node failure, where the failed node will have a secondary intent) + if node.is_available() { + tracing::info!(node_id=%node.get_id(), "Observed configuration requires update."); + changes.push((node.clone(), wanted_conf)) + } else { + tracing::info!(node_id=%node.get_id(), "Skipping configuration as secondary, node is unavailable"); + self.observed + .locations + .insert(node.get_id(), ObservedStateLocation { conf: None }); + } } } } diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index c17989a316..e0a71b5822 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -1122,10 +1122,15 @@ impl TenantShard { let result = reconciler.reconcile().await; // If we know we had a pending compute notification from some previous action, send a notification irrespective - // of whether the above reconcile() did any work + // of whether the above reconcile() did any work. It has to be Ok() though, because otherwise we might be + // sending a notification of a location that isn't really attached. if result.is_ok() && must_notify { // If this fails we will send the need to retry in [`ReconcileResult::pending_compute_notification`] reconciler.compute_notify().await.ok(); + } else if must_notify { + // Carry this flag so that the reconciler's result will indicate that it still needs to retry + // the compute hook notification eventually. 
+ reconciler.compute_notify_failure = true; } // Update result counter diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index da6d5b8622..616aee758d 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -822,6 +822,122 @@ def test_storage_controller_stuck_compute_hook( env.storage_controller.consistency_check() +@run_only_on_default_postgres("postgres behavior is not relevant") +def test_storage_controller_compute_hook_retry( + httpserver: HTTPServer, + neon_env_builder: NeonEnvBuilder, + httpserver_listen_address: ListenAddress, +): + """ + Test that when a reconciler can't do its compute hook notification, it will keep + trying until it succeeds. + + Reproducer for https://github.com/neondatabase/cloud/issues/22612 + """ + + neon_env_builder.num_pageservers = 2 + (host, port) = httpserver_listen_address + neon_env_builder.control_plane_compute_hook_api = f"http://{host}:{port}/notify" + + handle_params = {"status": 200} + + notifications = [] + + def handler(request: Request): + status = handle_params["status"] + log.info(f"Notify request[{status}]: {request}") + notifications.append(request.json) + return Response(status=status) + + httpserver.expect_request("/notify", method="PUT").respond_with_handler(handler) + + # Start running + env = neon_env_builder.init_configs() + env.start() + + tenant_id = TenantId.generate() + env.create_tenant(tenant_id, placement_policy='{"Attached": 1}') + + # Initial notification from tenant creation + assert len(notifications) == 1 + expect: dict[str, list[dict[str, int]] | str | None | int] = { + "tenant_id": str(tenant_id), + "stripe_size": None, + "shards": [{"node_id": int(env.pageservers[0].id), "shard_number": 0}], + "preferred_az": DEFAULT_AZ_ID, + } + assert notifications[0] == expect + + # Block notifications, and fail a node + handle_params["status"] = 423 + env.pageservers[0].stop() + env.storage_controller.allowed_errors.append(NOTIFY_BLOCKED_LOG) + env.storage_controller.allowed_errors.extend(NOTIFY_FAILURE_LOGS) + + # Avoid waiting for heartbeats + env.storage_controller.node_configure(env.pageservers[0].id, {"availability": "Offline"}) + + # Make reconciler run and fail: it should leave itself in a state where the shard will retry notification later, + # and we will check that that happens + notifications = [] + try: + assert env.storage_controller.reconcile_all() == 1 + except StorageControllerApiException as e: + assert "Control plane tenant busy" in str(e) + assert len(notifications) == 1 + assert ( + env.storage_controller.tenant_describe(tenant_id)["shards"][0][ + "is_pending_compute_notification" + ] + is True + ) + + # Try reconciling again, it should try notifying again + notifications = [] + try: + assert env.storage_controller.reconcile_all() == 1 + except StorageControllerApiException as e: + assert "Control plane tenant busy" in str(e) + assert len(notifications) == 1 + assert ( + env.storage_controller.tenant_describe(tenant_id)["shards"][0][ + "is_pending_compute_notification" + ] + is True + ) + + # The describe API should indicate that a notification is pending + assert ( + env.storage_controller.tenant_describe(tenant_id)["shards"][0][ + "is_pending_compute_notification" + ] + is True + ) + + # Unblock notifications: reconcile should work now + handle_params["status"] = 200 + notifications = [] + assert env.storage_controller.reconcile_all() == 1 + assert len(notifications) == 1 + assert ( + 
env.storage_controller.tenant_describe(tenant_id)["shards"][0][ + "is_pending_compute_notification" + ] + is False + ) + + # Reconciler should be idle now that it succeeded in its compute notification + notifications = [] + assert env.storage_controller.reconcile_all() == 0 + assert len(notifications) == 0 + assert ( + env.storage_controller.tenant_describe(tenant_id)["shards"][0][ + "is_pending_compute_notification" + ] + is False + ) + + @run_only_on_default_postgres("this test doesn't start an endpoint") def test_storage_controller_compute_hook_revert( httpserver: HTTPServer, From b2d0e1a519616aea5e0d477c1f12940e44d2fee5 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 13 Jan 2025 14:13:02 +0000 Subject: [PATCH 168/583] Link OpenSSL dynamically (#10302) ## Problem Statically linked OpenSSL is buggy in multithreaded environment: - https://github.com/neondatabase/cloud/issues/16155 - https://github.com/neondatabase/neon/issues/8275 ## Summary of changes - Link OpenSSL dynamically (revert OpenSSL part from https://github.com/neondatabase/neon/pull/8074) Before: ``` ldd /usr/local/v17/lib/libpq.so linux-vdso.so.1 (0x0000ffffb5ce4000) libm.so.6 => /lib/aarch64-linux-gnu/libm.so.6 (0x0000ffffb5c10000) libc.so.6 => /lib/aarch64-linux-gnu/libc.so.6 (0x0000ffffb5650000) /lib/ld-linux-aarch64.so.1 (0x0000ffffb5ca7000) ``` After: ``` ldd /usr/local/v17/lib/libpq.so linux-vdso.so.1 (0x0000ffffbf3e8000) libssl.so.3 => /lib/aarch64-linux-gnu/libssl.so.3 (0x0000ffffbf260000) libcrypto.so.3 => /lib/aarch64-linux-gnu/libcrypto.so.3 (0x0000ffffbec00000) libm.so.6 => /lib/aarch64-linux-gnu/libm.so.6 (0x0000ffffbf1c0000) libc.so.6 => /lib/aarch64-linux-gnu/libc.so.6 (0x0000ffffbea50000) /lib/ld-linux-aarch64.so.1 (0x0000ffffbf3ab000) ``` --- Dockerfile | 1 + Makefile | 3 --- build-tools.Dockerfile | 15 --------------- 3 files changed, 1 insertion(+), 18 deletions(-) diff --git a/Dockerfile b/Dockerfile index d3659f917a..2e4f8e5546 100644 --- a/Dockerfile +++ b/Dockerfile @@ -71,6 +71,7 @@ RUN set -e \ ca-certificates \ # System postgres for use with client libraries (e.g. 
in storage controller) postgresql-15 \ + openssl \ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \ && useradd -d /data neon \ && chown -R neon:neon /data diff --git a/Makefile b/Makefile index 9cffc74508..22ebfea7d5 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,6 @@ ROOT_PROJECT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Where to install Postgres, default is ./pg_install, maybe useful for package managers POSTGRES_INSTALL_DIR ?= $(ROOT_PROJECT_DIR)/pg_install/ -OPENSSL_PREFIX_DIR := /usr/local/openssl ICU_PREFIX_DIR := /usr/local/icu # @@ -26,11 +25,9 @@ endif ifeq ($(shell test -e /home/nonroot/.docker_build && echo -n yes),yes) # Exclude static build openssl, icu for local build (MacOS, Linux) # Only keep for build type release and debug - PG_CFLAGS += -I$(OPENSSL_PREFIX_DIR)/include PG_CONFIGURE_OPTS += --with-icu PG_CONFIGURE_OPTS += ICU_CFLAGS='-I/$(ICU_PREFIX_DIR)/include -DU_STATIC_IMPLEMENTATION' PG_CONFIGURE_OPTS += ICU_LIBS='-L$(ICU_PREFIX_DIR)/lib -L$(ICU_PREFIX_DIR)/lib64 -licui18n -licuuc -licudata -lstdc++ -Wl,-Bdynamic -lm' - PG_CONFIGURE_OPTS += LDFLAGS='-L$(OPENSSL_PREFIX_DIR)/lib -L$(OPENSSL_PREFIX_DIR)/lib64 -L$(ICU_PREFIX_DIR)/lib -L$(ICU_PREFIX_DIR)/lib64 -Wl,-Bstatic -lssl -lcrypto -Wl,-Bdynamic -lrt -lm -ldl -lpthread' endif UNAME_S := $(shell uname -s) diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile index 79210a2e1b..cf6439d004 100644 --- a/build-tools.Dockerfile +++ b/build-tools.Dockerfile @@ -190,21 +190,6 @@ RUN for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JS && make install \ && rm -rf ../lcov.tar.gz -# Compile and install the static OpenSSL library -ENV OPENSSL_VERSION=1.1.1w -ENV OPENSSL_PREFIX=/usr/local/openssl -RUN wget -O /tmp/openssl-${OPENSSL_VERSION}.tar.gz https://www.openssl.org/source/openssl-${OPENSSL_VERSION}.tar.gz && \ - echo "cf3098950cb4d853ad95c0841f1f9c6d3dc102dccfcacd521d93925208b76ac8 /tmp/openssl-${OPENSSL_VERSION}.tar.gz" | sha256sum --check && \ - cd /tmp && \ - tar xzvf /tmp/openssl-${OPENSSL_VERSION}.tar.gz && \ - rm /tmp/openssl-${OPENSSL_VERSION}.tar.gz && \ - cd /tmp/openssl-${OPENSSL_VERSION} && \ - ./config --prefix=${OPENSSL_PREFIX} -static --static no-shared -fPIC && \ - make -j "$(nproc)" && \ - make install && \ - cd /tmp && \ - rm -rf /tmp/openssl-${OPENSSL_VERSION} - # Use the same version of libicu as the compute nodes so that # clusters created using inidb on pageserver can be used by computes. # From b31ed0acd1c5e20281556de66fa52d797c156826 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 13 Jan 2025 15:23:42 +0100 Subject: [PATCH 169/583] utils: add ?force=true hint for CPU profiler (#10368) This makes it less annoying to try to take a CPU profile when a continuous profile is already running. 
--- libs/utils/src/http/endpoint.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index ca65c39ad6..9f38373ca0 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -372,7 +372,11 @@ pub async fn profile_cpu_handler(req: Request) -> Result, A match PROFILE_LOCK.try_lock() { Ok(lock) => break lock, Err(_) if force => PROFILE_CANCEL.notify_waiters(), - Err(_) => return Err(ApiError::Conflict("profiler already running".into())), + Err(_) => { + return Err(ApiError::Conflict( + "profiler already running (use ?force=true to cancel it)".into(), + )) + } } tokio::time::sleep(Duration::from_millis(1)).await; // don't busy-wait }; From ceacc29609058f7ba18288ac141e7c55a4f01382 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 13 Jan 2025 16:26:11 +0200 Subject: [PATCH 170/583] Start with minimal prefetch distance to minimize prefetch overhead for exact or limited index scans (#10359) ## Problem See https://neondb.slack.com/archives/C04DGM6SMTM/p1736526089437179 In case of queries index scan with LIMIT clause, multiple backends can concurrently send larger number of duplicated prefetch requests which are not stored in LFC and so actually do useless job. Current implementation of index prefetch starts with maximal prefetch distance (10 by default now) when there are no key bounds, so in queries with LIMIT clause like `select * from T order by pk limit 1` compute can send a lot of useless prefetch requests to page server. ## Summary of changes Always start with minimal prefetch distance even if there are not key boundaries. Related Postgres PRs: https://github.com/neondatabase/postgres/pull/552 https://github.com/neondatabase/postgres/pull/551 https://github.com/neondatabase/postgres/pull/550 https://github.com/neondatabase/postgres/pull/549 Co-authored-by: Konstantin Knizhnik --- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/postgres-v17 | 2 +- vendor/revisions.json | 8 ++++---- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index c2f65b3201..210a0ba3af 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit c2f65b3201591e02ce45b66731392f98d3388e73 +Subproject commit 210a0ba3afd8134ea910b203f274b165bd4f05d7 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index f262d631ad..d3141e17a7 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit f262d631ad477a1819e84a183e5a7ef561830085 +Subproject commit d3141e17a7155e3d07c8deba4a10c748a29ba1e6 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 97f9fde349..f63b141cfb 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 97f9fde349c6de6d573f5ce96db07eca60ce6185 +Subproject commit f63b141cfb0c813725a6b2574049565bff643018 diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index 7e3f3974bc..9c9e9a78a9 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit 7e3f3974bc8895938308f94d0e96879ffae638cd +Subproject commit 9c9e9a78a93aebec2f6a2f54644442d35ffa245c diff --git a/vendor/revisions.json b/vendor/revisions.json index bff2f70931..d182b88008 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,18 +1,18 @@ { "v17": [ "17.2", - "7e3f3974bc8895938308f94d0e96879ffae638cd" + "9c9e9a78a93aebec2f6a2f54644442d35ffa245c" ], "v16": [ "16.6", - 
"97f9fde349c6de6d573f5ce96db07eca60ce6185" + "f63b141cfb0c813725a6b2574049565bff643018" ], "v15": [ "15.10", - "f262d631ad477a1819e84a183e5a7ef561830085" + "d3141e17a7155e3d07c8deba4a10c748a29ba1e6" ], "v14": [ "14.15", - "c2f65b3201591e02ce45b66731392f98d3388e73" + "210a0ba3afd8134ea910b203f274b165bd4f05d7" ] } From ef8bfacd6b1a9fa55923e0158ba096afc6090143 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 13 Jan 2025 14:52:43 +0000 Subject: [PATCH 171/583] storage controller: API + CLI for migrating secondary locations (#10284) ## Problem Currently, if we want to move a secondary there isn't a neat way to do that: we just have migration API for the attached location, and it is only clean to use that if you've manually created a secondary via pageserver API in the place you're going to move it to. Secondary migration API enables: - Moving the secondary somewhere because we would like to later move the attached location there. - Move the secondary location because we just want to reclaim some disk space from its current location. ## Summary of changes - Add `/migrate_secondary` API - Add `tenant-shard-migrate-secondary` CLI - Add tests for above --- control_plane/src/storage_controller.rs | 5 +- control_plane/storcon_cli/src/main.rs | 31 ++++-- libs/pageserver_api/src/controller_api.rs | 1 - storage_controller/src/compute_hook.rs | 10 +- storage_controller/src/http.rs | 36 ++++++- storage_controller/src/service.rs | 63 ++++++++++++ .../regress/test_storage_controller.py | 96 +++++++++++++++---- 7 files changed, 210 insertions(+), 32 deletions(-) diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index 22d2420ed4..c41ff22d15 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -822,10 +822,7 @@ impl StorageController { self.dispatch( Method::PUT, format!("control/v1/tenant/{tenant_shard_id}/migrate"), - Some(TenantShardMigrateRequest { - tenant_shard_id, - node_id, - }), + Some(TenantShardMigrateRequest { node_id }), ) .await } diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 617b2cd1ba..1653b3c845 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -112,6 +112,13 @@ enum Command { #[arg(long)] node: NodeId, }, + /// Migrate the secondary location for a tenant shard to a specific pageserver. 
+ TenantShardMigrateSecondary { + #[arg(long)] + tenant_shard_id: TenantShardId, + #[arg(long)] + node: NodeId, + }, /// Cancel any ongoing reconciliation for this shard TenantShardCancelReconcile { #[arg(long)] @@ -540,10 +547,7 @@ async fn main() -> anyhow::Result<()> { tenant_shard_id, node, } => { - let req = TenantShardMigrateRequest { - tenant_shard_id, - node_id: node, - }; + let req = TenantShardMigrateRequest { node_id: node }; storcon_client .dispatch::( @@ -553,6 +557,20 @@ async fn main() -> anyhow::Result<()> { ) .await?; } + Command::TenantShardMigrateSecondary { + tenant_shard_id, + node, + } => { + let req = TenantShardMigrateRequest { node_id: node }; + + storcon_client + .dispatch::( + Method::PUT, + format!("control/v1/tenant/{tenant_shard_id}/migrate_secondary"), + Some(req), + ) + .await?; + } Command::TenantShardCancelReconcile { tenant_shard_id } => { storcon_client .dispatch::<(), ()>( @@ -915,10 +933,7 @@ async fn main() -> anyhow::Result<()> { .dispatch::( Method::PUT, format!("control/v1/tenant/{}/migrate", mv.tenant_shard_id), - Some(TenantShardMigrateRequest { - tenant_shard_id: mv.tenant_shard_id, - node_id: mv.to, - }), + Some(TenantShardMigrateRequest { node_id: mv.to }), ) .await .map_err(|e| (mv.tenant_shard_id, mv.from, mv.to, e)) diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index 7eb3547183..ba0eb0e4ae 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -179,7 +179,6 @@ pub struct TenantDescribeResponseShard { /// specifies some constraints, e.g. asking it to get off particular node(s) #[derive(Serialize, Deserialize, Debug)] pub struct TenantShardMigrateRequest { - pub tenant_shard_id: TenantShardId, pub node_id: NodeId, } diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs index 69db48f8d1..3884a6df46 100644 --- a/storage_controller/src/compute_hook.rs +++ b/storage_controller/src/compute_hook.rs @@ -124,7 +124,10 @@ impl ComputeHookTenant { if let Some(shard_idx) = shard_idx { sharded.shards.remove(shard_idx); } else { - tracing::warn!("Shard not found while handling detach") + // This is a valid but niche case, where the tenant was previously attached + // as a Secondary location and then detached, so has no previously notified + // state. + tracing::info!("Shard not found while handling detach") } } ComputeHookTenant::Unsharded(_) => { @@ -761,7 +764,10 @@ impl ComputeHook { let mut state_locked = self.state.lock().unwrap(); match state_locked.entry(tenant_shard_id.tenant_id) { Entry::Vacant(_) => { - tracing::warn!("Compute hook tenant not found for detach"); + // This is a valid but niche case, where the tenant was previously attached + // as a Secondary location and then detached, so has no previously notified + // state. 
+ tracing::info!("Compute hook tenant not found for detach"); } Entry::Occupied(mut e) => { let sharded = e.get().is_sharded(); diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 5385e4ee0b..c8df4ffe28 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -690,7 +690,8 @@ async fn handle_node_list(req: Request) -> Result, ApiError }; let state = get_state(&req); - let nodes = state.service.node_list().await?; + let mut nodes = state.service.node_list().await?; + nodes.sort_by_key(|n| n.get_id()); let api_nodes = nodes.into_iter().map(|n| n.describe()).collect::>(); json_response(StatusCode::OK, api_nodes) @@ -1005,6 +1006,29 @@ async fn handle_tenant_shard_migrate( ) } +async fn handle_tenant_shard_migrate_secondary( + service: Arc, + req: Request, +) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + + let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?; + let migrate_req = json_request::(&mut req).await?; + json_response( + StatusCode::OK, + service + .tenant_shard_migrate_secondary(tenant_shard_id, migrate_req) + .await?, + ) +} + async fn handle_tenant_shard_cancel_reconcile( service: Arc, req: Request, @@ -1855,6 +1879,16 @@ pub fn make_router( RequestName("control_v1_tenant_migrate"), ) }) + .put( + "/control/v1/tenant/:tenant_shard_id/migrate_secondary", + |r| { + tenant_service_handler( + r, + handle_tenant_shard_migrate_secondary, + RequestName("control_v1_tenant_migrate_secondary"), + ) + }, + ) .put( "/control/v1/tenant/:tenant_shard_id/cancel_reconcile", |r| { diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 430d884548..8aa263f0c3 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -5055,6 +5055,69 @@ impl Service { Ok(TenantShardMigrateResponse {}) } + pub(crate) async fn tenant_shard_migrate_secondary( + &self, + tenant_shard_id: TenantShardId, + migrate_req: TenantShardMigrateRequest, + ) -> Result { + let waiter = { + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); + + let Some(node) = nodes.get(&migrate_req.node_id) else { + return Err(ApiError::BadRequest(anyhow::anyhow!( + "Node {} not found", + migrate_req.node_id + ))); + }; + + if !node.is_available() { + // Warn but proceed: the caller may intend to manually adjust the placement of + // a shard even if the node is down, e.g. if intervening during an incident. 
+ tracing::warn!("Migrating to unavailable node {node}"); + } + + let Some(shard) = tenants.get_mut(&tenant_shard_id) else { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant shard not found").into(), + )); + }; + + if shard.intent.get_secondary().len() == 1 + && shard.intent.get_secondary()[0] == migrate_req.node_id + { + tracing::info!( + "Migrating secondary to {node}: intent is unchanged {:?}", + shard.intent + ); + } else if shard.intent.get_attached() == &Some(migrate_req.node_id) { + tracing::info!("Migrating secondary to {node}: already attached where we were asked to create a secondary"); + } else { + let old_secondaries = shard.intent.get_secondary().clone(); + for secondary in old_secondaries { + shard.intent.remove_secondary(scheduler, secondary); + } + + shard.intent.push_secondary(scheduler, migrate_req.node_id); + shard.sequence = shard.sequence.next(); + tracing::info!( + "Migrating secondary to {node}: new intent {:?}", + shard.intent + ); + } + + self.maybe_reconcile_shard(shard, nodes) + }; + + if let Some(waiter) = waiter { + waiter.wait_timeout(RECONCILE_TIMEOUT).await?; + } else { + tracing::info!("Migration is a no-op"); + } + + Ok(TenantShardMigrateResponse {}) + } + /// 'cancel' in this context means cancel any ongoing reconcile pub(crate) async fn tenant_shard_cancel_reconcile( &self, diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 616aee758d..3a55e75589 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -1052,7 +1052,7 @@ def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder): that just hits the endpoints to check that they don't bitrot. """ - neon_env_builder.num_pageservers = 2 + neon_env_builder.num_pageservers = 3 env = neon_env_builder.init_start() tenant_id = TenantId.generate() @@ -1077,7 +1077,7 @@ def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder): "GET", f"{env.storage_controller_api}/debug/v1/scheduler" ) # Two nodes, in a dict of node_id->node - assert len(response.json()["nodes"]) == 2 + assert len(response.json()["nodes"]) == 3 assert sum(v["shard_count"] for v in response.json()["nodes"].values()) == 3 assert all(v["may_schedule"] for v in response.json()["nodes"].values()) @@ -1088,13 +1088,25 @@ def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder): headers=env.storage_controller.headers(TokenScope.ADMIN), ) + # Secondary migration API: superficial check that it migrates + secondary_dest = env.pageservers[2].id + env.storage_controller.request( + "PUT", + f"{env.storage_controller_api}/control/v1/tenant/{tenant_id}-0002/migrate_secondary", + headers=env.storage_controller.headers(TokenScope.ADMIN), + json={"tenant_shard_id": f"{tenant_id}-0002", "node_id": secondary_dest}, + ) + assert env.storage_controller.tenant_describe(tenant_id)["shards"][0]["node_secondary"] == [ + secondary_dest + ] + # Node unclean drop API response = env.storage_controller.request( "POST", f"{env.storage_controller_api}/debug/v1/node/{env.pageservers[1].id}/drop", headers=env.storage_controller.headers(TokenScope.ADMIN), ) - assert len(env.storage_controller.node_list()) == 1 + assert len(env.storage_controller.node_list()) == 2 # Tenant unclean drop API response = env.storage_controller.request( @@ -1812,7 +1824,13 @@ def test_storcon_cli(neon_env_builder: NeonEnvBuilder): """ output_dir = neon_env_builder.test_output_dir shard_count = 4 - env = 
neon_env_builder.init_start(initial_tenant_shard_count=shard_count) + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_configs() + env.start() + + tenant_id = TenantId.generate() + env.create_tenant(tenant_id, placement_policy='{"Attached":1}', shard_count=shard_count) + base_args = [env.neon_binpath / "storcon_cli", "--api", env.storage_controller_api] def storcon_cli(args): @@ -1841,7 +1859,7 @@ def test_storcon_cli(neon_env_builder: NeonEnvBuilder): # List nodes node_lines = storcon_cli(["nodes"]) # Table header, footer, and one line of data - assert len(node_lines) == 5 + assert len(node_lines) == 7 assert "localhost" in node_lines[3] # Pause scheduling onto a node @@ -1859,10 +1877,21 @@ def test_storcon_cli(neon_env_builder: NeonEnvBuilder): storcon_cli(["node-configure", "--node-id", "1", "--availability", "offline"]) assert "Offline" in storcon_cli(["nodes"])[3] + # Restore node, verify status changes in CLI output + env.pageservers[0].start() + + def is_online(): + assert "Offline" not in storcon_cli(["nodes"]) + + wait_until(is_online) + + # Let everything stabilize after node failure to avoid interfering with subsequent steps + env.storage_controller.reconcile_until_idle(timeout_secs=10) + # List tenants tenant_lines = storcon_cli(["tenants"]) assert len(tenant_lines) == 5 - assert str(env.initial_tenant) in tenant_lines[3] + assert str(tenant_id) in tenant_lines[3] # Setting scheduling policies intentionally result in warnings, they're for rare use. env.storage_controller.allowed_errors.extend( @@ -1870,23 +1899,58 @@ def test_storcon_cli(neon_env_builder: NeonEnvBuilder): ) # Describe a tenant - tenant_lines = storcon_cli(["tenant-describe", "--tenant-id", str(env.initial_tenant)]) + tenant_lines = storcon_cli(["tenant-describe", "--tenant-id", str(tenant_id)]) assert len(tenant_lines) >= 3 + shard_count * 2 - assert str(env.initial_tenant) in tenant_lines[0] + assert str(tenant_id) in tenant_lines[0] + + # Migrate an attached location + def other_ps_id(current_ps_id): + return ( + env.pageservers[0].id + if current_ps_id == env.pageservers[1].id + else env.pageservers[1].id + ) + + storcon_cli( + [ + "tenant-shard-migrate", + "--tenant-shard-id", + f"{tenant_id}-0004", + "--node", + str( + other_ps_id( + env.storage_controller.tenant_describe(tenant_id)["shards"][0]["node_attached"] + ) + ), + ] + ) + + # Migrate a secondary location + storcon_cli( + [ + "tenant-shard-migrate-secondary", + "--tenant-shard-id", + f"{tenant_id}-0004", + "--node", + str( + other_ps_id( + env.storage_controller.tenant_describe(tenant_id)["shards"][0][ + "node_secondary" + ][0] + ) + ), + ] + ) # Pause changes on a tenant - storcon_cli(["tenant-policy", "--tenant-id", str(env.initial_tenant), "--scheduling", "stop"]) + storcon_cli(["tenant-policy", "--tenant-id", str(tenant_id), "--scheduling", "stop"]) assert "Stop" in storcon_cli(["tenants"])[3] # Cancel ongoing reconcile on a tenant - storcon_cli( - ["tenant-shard-cancel-reconcile", "--tenant-shard-id", f"{env.initial_tenant}-0104"] - ) + storcon_cli(["tenant-shard-cancel-reconcile", "--tenant-shard-id", f"{tenant_id}-0104"]) # Change a tenant's placement - storcon_cli( - ["tenant-policy", "--tenant-id", str(env.initial_tenant), "--placement", "secondary"] - ) + storcon_cli(["tenant-policy", "--tenant-id", str(tenant_id), "--placement", "secondary"]) assert "Secondary" in storcon_cli(["tenants"])[3] # Modify a tenant's config @@ -1894,7 +1958,7 @@ def test_storcon_cli(neon_env_builder: NeonEnvBuilder): [ "patch-tenant-config", 
"--tenant-id", - str(env.initial_tenant), + str(tenant_id), "--config", json.dumps({"pitr_interval": "1m"}), ] From 96243af651f67747c9eeaf800f50777c75454143 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 13 Jan 2025 17:01:13 +0200 Subject: [PATCH 172/583] Stop building unnecessary extension tarballs (#10355) We build "custom extensions" from a different repository nowadays. --- compute/compute-node.Dockerfile | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 449e12af5d..f346f402d4 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -170,7 +170,6 @@ RUN case "${PG_VERSION}" in \ wget https://download.osgeo.org/postgis/source/postgis-${POSTGIS_VERSION}.tar.gz -O postgis.tar.gz && \ echo "${POSTGIS_CHECKSUM} postgis.tar.gz" | sha256sum --check && \ mkdir postgis-src && cd postgis-src && tar xzf ../postgis.tar.gz --strip-components=1 -C . && \ - find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\ ./autogen.sh && \ ./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \ make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -220,11 +219,7 @@ RUN case "${PG_VERSION}" in \ cmake -GNinja -DCMAKE_BUILD_TYPE=Release .. && \ ninja -j $(getconf _NPROCESSORS_ONLN) && \ ninja -j $(getconf _NPROCESSORS_ONLN) install && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control && \ - find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\ - cp /usr/local/pgsql/share/extension/pgrouting.control /extensions/postgis && \ - sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \ - comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/postgis.tar.zst -T - + echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control ######################################################################################### # @@ -842,13 +837,8 @@ RUN case "${PG_VERSION}" in "v17") \ wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \ echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9 pg_anon.tar.gz" | sha256sum --check && \ mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . && \ - find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control && \ - find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\ - mkdir -p /extensions/anon && cp /usr/local/pgsql/share/extension/anon.control /extensions/anon && \ - sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \ - comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/anon.tar.zst -T - + echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control ######################################################################################### # From a338aee132497193c3587acd267babac66758d01 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 13 Jan 2025 15:20:46 +0000 Subject: [PATCH 173/583] feat(local_proxy): use ed25519 signatures with pg_session_jwt (#10290) Generally ed25519 seems to be much preferred for cryptographic strength to P256 nowadays, and it is NIST approved finally. 
We should use it where we can as it's also faster than p256. This PR makes the re-signed JWTs between local_proxy and pg_session_jwt use ed25519. This does introduce a new dependency on ed25519, but I do recall some Neon Authorise customers asking for support for ed25519, so I am justifying this dependency addition in the context that we can then introduce support for customer ed25519 keys sources: * https://csrc.nist.gov/pubs/fips/186-5/final subsection 7 (EdDSA) * https://datatracker.ietf.org/doc/html/rfc8037#section-3.1 --- Cargo.lock | 55 +++++++++++++++++++++++++ compute/compute-node.Dockerfile | 4 +- proxy/Cargo.toml | 1 + proxy/src/serverless/backend.rs | 16 ++++--- proxy/src/serverless/local_conn_pool.rs | 28 +++++++------ 5 files changed, 85 insertions(+), 19 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f727741883..08453120c7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1605,6 +1605,32 @@ dependencies = [ "typenum", ] +[[package]] +name = "curve25519-dalek" +version = "4.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97fb8b7c4503de7d6ae7b42ab72a5a59857b4c937ec27a3d4539dba95b5ab2be" +dependencies = [ + "cfg-if", + "cpufeatures", + "curve25519-dalek-derive", + "digest", + "fiat-crypto", + "rustc_version", + "subtle", +] + +[[package]] +name = "curve25519-dalek-derive" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f46882e17999c6cc590af592290432be3bce0428cb0d5f8b6715e4dc7b383eb3" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.90", +] + [[package]] name = "darling" version = "0.20.1" @@ -1875,6 +1901,28 @@ dependencies = [ "spki 0.7.3", ] +[[package]] +name = "ed25519" +version = "2.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "115531babc129696a58c64a4fef0a8bf9e9698629fb97e9e40767d235cfbcd53" +dependencies = [ + "signature 2.2.0", +] + +[[package]] +name = "ed25519-dalek" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a3daa8e81a3963a60642bcc1f90a670680bd4a77535faa384e9d1c79d620871" +dependencies = [ + "curve25519-dalek", + "ed25519", + "rand_core 0.6.4", + "sha2", + "subtle", +] + [[package]] name = "either" version = "1.8.1" @@ -2113,6 +2161,12 @@ dependencies = [ "subtle", ] +[[package]] +name = "fiat-crypto" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28dea519a9695b9977216879a3ebfddf92f1c08c05d984f8996aecd6ecdc811d" + [[package]] name = "filetime" version = "0.2.22" @@ -4745,6 +4799,7 @@ dependencies = [ "consumption_metrics", "dashmap 5.5.0", "ecdsa 0.16.9", + "ed25519-dalek", "env_logger 0.10.2", "fallible-iterator", "flate2", diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index f346f402d4..2d38796d77 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1055,8 +1055,8 @@ ARG PG_VERSION # NOTE: local_proxy depends on the version of pg_session_jwt # Do not update without approve from proxy team # Make sure the version is reflected in proxy/src/serverless/local_conn_pool.rs -RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.1.2-v17.tar.gz -O pg_session_jwt.tar.gz && \ - echo "c8ecbed9cb8c6441bce5134a176002b043018adf9d05a08e457dda233090a86e pg_session_jwt.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.2.0.tar.gz -O pg_session_jwt.tar.gz && \ + echo 
"5ace028e591f2e000ca10afa5b1ca62203ebff014c2907c0ec3b29c36f28a1bb pg_session_jwt.tar.gz" | sha256sum --check && \ mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . && \ sed -i 's/pgrx = "0.12.6"/pgrx = { version = "=0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 2f63ee3acc..f362a45035 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -106,6 +106,7 @@ jose-jwk = { version = "0.1.2", features = ["p256", "p384", "rsa"] } signature = "2" ecdsa = "0.16" p256 = { version = "0.13", features = ["jwk"] } +ed25519-dalek = { version = "2", default-features = false, features = ["rand_core"] } rsa = "0.9" workspace_hack.workspace = true diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index b398c3ddd0..6d5fb13681 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -3,9 +3,9 @@ use std::sync::Arc; use std::time::Duration; use async_trait::async_trait; +use ed25519_dalek::SigningKey; use hyper_util::rt::{TokioExecutor, TokioIo, TokioTimer}; -use p256::ecdsa::SigningKey; -use p256::elliptic_curve::JwkEcKey; +use jose_jwk::jose_b64; use rand::rngs::OsRng; use tokio::net::{lookup_host, TcpStream}; use tracing::field::display; @@ -354,9 +354,15 @@ impl PoolingBackend { } } -fn create_random_jwk() -> (SigningKey, JwkEcKey) { - let key = SigningKey::random(&mut OsRng); - let jwk = p256::PublicKey::from(key.verifying_key()).to_jwk(); +fn create_random_jwk() -> (SigningKey, jose_jwk::Key) { + let key = SigningKey::generate(&mut OsRng); + + let jwk = jose_jwk::Key::Okp(jose_jwk::Okp { + crv: jose_jwk::OkpCurves::Ed25519, + x: jose_b64::serde::Bytes::from(key.verifying_key().to_bytes().to_vec()), + d: None, + }); + (key, jwk) } diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs index c51a2bc9ba..fe33f0ff65 100644 --- a/proxy/src/serverless/local_conn_pool.rs +++ b/proxy/src/serverless/local_conn_pool.rs @@ -16,17 +16,16 @@ use std::sync::Arc; use std::task::{ready, Poll}; use std::time::Duration; +use ed25519_dalek::{Signature, Signer, SigningKey}; use futures::future::poll_fn; use futures::Future; use indexmap::IndexMap; use jose_jwk::jose_b64::base64ct::{Base64UrlUnpadded, Encoding}; -use p256::ecdsa::{Signature, SigningKey}; use parking_lot::RwLock; use postgres_client::tls::NoTlsStream; use postgres_client::types::ToSql; use postgres_client::AsyncMessage; use serde_json::value::RawValue; -use signature::Signer; use tokio::net::TcpStream; use tokio::time::Instant; use tokio_util::sync::CancellationToken; @@ -42,7 +41,7 @@ use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::Metrics; pub(crate) const EXT_NAME: &str = "pg_session_jwt"; -pub(crate) const EXT_VERSION: &str = "0.1.2"; +pub(crate) const EXT_VERSION: &str = "0.2.0"; pub(crate) const EXT_SCHEMA: &str = "auth"; #[derive(Clone)] @@ -339,8 +338,8 @@ fn sign_jwt(sk: &SigningKey, payload: &[u8]) -> String { let cap = jwt.capacity(); // we only need an empty header with the alg specified. 
- // base64url(r#"{"alg":"ES256"}"#) == "eyJhbGciOiJFUzI1NiJ9" - jwt.push_str("eyJhbGciOiJFUzI1NiJ9."); + // base64url(r#"{"alg":"EdDSA"}"#) == "eyJhbGciOiJFZERTQSJ9" + jwt.push_str("eyJhbGciOiJFZERTQSJ9."); // encode the jwt payload in-place base64::encode_config_buf(payload, base64::URL_SAFE_NO_PAD, &mut jwt); @@ -366,14 +365,14 @@ fn sign_jwt(sk: &SigningKey, payload: &[u8]) -> String { #[cfg(test)] #[expect(clippy::unwrap_used)] mod tests { - use p256::ecdsa::SigningKey; + use ed25519_dalek::SigningKey; use typed_json::json; use super::resign_jwt; #[test] fn jwt_token_snapshot() { - let key = SigningKey::from_bytes(&[1; 32].into()).unwrap(); + let key = SigningKey::from_bytes(&[1; 32]); let data = json!({"foo":"bar","jti":"foo\nbar","nested":{"jti":"tricky nesting"}}).to_string(); @@ -381,12 +380,17 @@ mod tests { // To validate the JWT, copy the JWT string and paste it into https://jwt.io/. // In the public-key box, paste the following jwk public key - // `{"kty":"EC","crv":"P-256","x":"b_A7lJJBzh2t1DUZ5pYOCoW0GmmgXDKBA6orzhWUyhY","y":"PE91OlW_AdxT9sCwx-7ni0DG_30lqW4igrmJzvccFEo"}` + // `{"kty":"OKP","crv":"Ed25519","x":"iojj3XQJ8ZX9UtstPLpdcspnCb8dlBIb83SIAbQPb1w"}` + // Note - jwt.io doesn't support EdDSA :( + // https://github.com/jsonwebtoken/jsonwebtoken.github.io/issues/509 - // let pub_key = p256::ecdsa::VerifyingKey::from(&key); - // let pub_key = p256::PublicKey::from(pub_key); - // println!("{}", pub_key.to_jwk_string()); + // let jwk = jose_jwk::Key::Okp(jose_jwk::Okp { + // crv: jose_jwk::OkpCurves::Ed25519, + // x: jose_jwk::jose_b64::serde::Bytes::from(key.verifying_key().to_bytes().to_vec()), + // d: None, + // }); + // println!("{}", serde_json::to_string(&jwk).unwrap()); - assert_eq!(jwt, "eyJhbGciOiJFUzI1NiJ9.eyJmb28iOiJiYXIiLCJqdGkiOjIsIm5lc3RlZCI6eyJqdGkiOiJ0cmlja3kgbmVzdGluZyJ9fQ.pYf0LxoJ8sDgpmsYOgrbNecOSipnPBEGwnZzB-JhW2cONrKlqRsgXwK8_cOsyolGy-hTTe8GXbWTl_UdpF5RyA"); + assert_eq!(jwt, "eyJhbGciOiJFZERTQSJ9.eyJmb28iOiJiYXIiLCJqdGkiOjIsIm5lc3RlZCI6eyJqdGkiOiJ0cmlja3kgbmVzdGluZyJ9fQ.Cvyc2By33KI0f0obystwdy8PN111L3Sc9_Mr2CU3XshtSqSdxuRxNEZGbb_RvyJf2IzheC_s7aBZ-jLeQ9N0Bg"); } } From e9ed53b14f2a7feef862606626ae7d790a1346ce Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Mon, 13 Jan 2025 10:43:01 -0500 Subject: [PATCH 174/583] feat(pageserver): support inherited sparse keyspace (#10313) ## Problem In preparation to https://github.com/neondatabase/neon/issues/9516. We need to store rel size and directory data in the sparse keyspace, but it does not support inheritance yet. ## Summary of changes Add a new type of keyspace "sparse but inherited" into the system. On the read path: we don't remove the key range when we descend into the ancestor. The search will stop when (1) the full key range is covered by image layers (which has already been implemented before), or (2) we reach the end of the ancestor chain. --------- Signed-off-by: Alex Chi Z --- libs/pageserver_api/src/key.rs | 39 ++++++- pageserver/src/tenant.rs | 135 ++++++++++++++++++++++++- pageserver/src/tenant/storage_layer.rs | 4 +- pageserver/src/tenant/timeline.rs | 25 ++++- 4 files changed, 193 insertions(+), 10 deletions(-) diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index f0cd713c38..328dea5dec 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -706,7 +706,7 @@ pub fn repl_origin_key_range() -> Range { /// Non inherited range for vectored get. 
pub const NON_INHERITED_RANGE: Range = AUX_FILES_KEY..AUX_FILES_KEY.next(); /// Sparse keyspace range for vectored get. Missing key error will be ignored for this range. -pub const NON_INHERITED_SPARSE_RANGE: Range = Key::metadata_key_range(); +pub const SPARSE_RANGE: Range = Key::metadata_key_range(); impl Key { // AUX_FILES currently stores only data for logical replication (slots etc), and @@ -714,7 +714,42 @@ impl Key { // switch (and generally it likely should be optional), so ignore these. #[inline(always)] pub fn is_inherited_key(self) -> bool { - !NON_INHERITED_RANGE.contains(&self) && !NON_INHERITED_SPARSE_RANGE.contains(&self) + if self.is_sparse() { + self.is_inherited_sparse_key() + } else { + !NON_INHERITED_RANGE.contains(&self) + } + } + + #[inline(always)] + pub fn is_sparse(self) -> bool { + self.field1 >= METADATA_KEY_BEGIN_PREFIX && self.field1 < METADATA_KEY_END_PREFIX + } + + /// Check if the key belongs to the inherited keyspace. + fn is_inherited_sparse_key(self) -> bool { + debug_assert!(self.is_sparse()); + self.field1 == RELATION_SIZE_PREFIX + } + + pub fn sparse_non_inherited_keyspace() -> Range { + // The two keys are adjacent; if we will have non-adjancent keys in the future, we should return a keyspace + debug_assert_eq!(AUX_KEY_PREFIX + 1, REPL_ORIGIN_KEY_PREFIX); + Key { + field1: AUX_KEY_PREFIX, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, + }..Key { + field1: REPL_ORIGIN_KEY_PREFIX + 1, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, + } } #[inline(always)] diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 2928c435cb..070593b104 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -5682,7 +5682,7 @@ mod tests { use bytes::{Bytes, BytesMut}; use hex_literal::hex; use itertools::Itertools; - use pageserver_api::key::{Key, AUX_KEY_PREFIX, NON_INHERITED_RANGE}; + use pageserver_api::key::{Key, AUX_KEY_PREFIX, NON_INHERITED_RANGE, RELATION_SIZE_PREFIX}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings}; use pageserver_api::value::Value; @@ -7741,7 +7741,18 @@ mod tests { let base_key = Key::from_hex("620000000033333333444444445500000000").unwrap(); let base_key_child = Key::from_hex("620000000033333333444444445500000001").unwrap(); let base_key_nonexist = Key::from_hex("620000000033333333444444445500000002").unwrap(); + let base_key_overwrite = Key::from_hex("620000000033333333444444445500000003").unwrap(); + + let base_inherited_key = Key::from_hex("610000000033333333444444445500000000").unwrap(); + let base_inherited_key_child = + Key::from_hex("610000000033333333444444445500000001").unwrap(); + let base_inherited_key_nonexist = + Key::from_hex("610000000033333333444444445500000002").unwrap(); + let base_inherited_key_overwrite = + Key::from_hex("610000000033333333444444445500000003").unwrap(); + assert_eq!(base_key.field1, AUX_KEY_PREFIX); // in case someone accidentally changed the prefix... 
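    // [Editor's aside, not part of this patch] The assert above and the one below pin the
    // two prefixes this test depends on: the `base_key*` keys live under AUX_KEY_PREFIX
    // (sparse, non-inherited), while the `base_inherited_key*` keys live under
    // RELATION_SIZE_PREFIX, the sparse-but-inherited range introduced by this commit,
    // which is why the child timeline can still read them in the assertions further down.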
+ assert_eq!(base_inherited_key.field1, RELATION_SIZE_PREFIX); let tline = tenant .create_test_timeline_with_layers( @@ -7750,7 +7761,18 @@ mod tests { DEFAULT_PG_VERSION, &ctx, Vec::new(), // delta layers - vec![(Lsn(0x20), vec![(base_key, test_img("metadata key 1"))])], // image layers + vec![( + Lsn(0x20), + vec![ + (base_inherited_key, test_img("metadata inherited key 1")), + ( + base_inherited_key_overwrite, + test_img("metadata key overwrite 1a"), + ), + (base_key, test_img("metadata key 1")), + (base_key_overwrite, test_img("metadata key overwrite 1b")), + ], + )], // image layers Lsn(0x20), // it's fine to not advance LSN to 0x30 while using 0x30 to get below because `get_vectored_impl` does not wait for LSN ) .await?; @@ -7764,7 +7786,18 @@ mod tests { Vec::new(), // delta layers vec![( Lsn(0x30), - vec![(base_key_child, test_img("metadata key 2"))], + vec![ + ( + base_inherited_key_child, + test_img("metadata inherited key 2"), + ), + ( + base_inherited_key_overwrite, + test_img("metadata key overwrite 2a"), + ), + (base_key_child, test_img("metadata key 2")), + (base_key_overwrite, test_img("metadata key overwrite 2b")), + ], )], // image layers Lsn(0x30), ) @@ -7786,6 +7819,26 @@ mod tests { get_vectored_impl_wrapper(&tline, base_key_nonexist, lsn, &ctx).await?, None ); + assert_eq!( + get_vectored_impl_wrapper(&tline, base_key_overwrite, lsn, &ctx).await?, + Some(test_img("metadata key overwrite 1b")) + ); + assert_eq!( + get_vectored_impl_wrapper(&tline, base_inherited_key, lsn, &ctx).await?, + Some(test_img("metadata inherited key 1")) + ); + assert_eq!( + get_vectored_impl_wrapper(&tline, base_inherited_key_child, lsn, &ctx).await?, + None + ); + assert_eq!( + get_vectored_impl_wrapper(&tline, base_inherited_key_nonexist, lsn, &ctx).await?, + None + ); + assert_eq!( + get_vectored_impl_wrapper(&tline, base_inherited_key_overwrite, lsn, &ctx).await?, + Some(test_img("metadata key overwrite 1a")) + ); // test vectored get on child timeline assert_eq!( @@ -7800,6 +7853,82 @@ mod tests { get_vectored_impl_wrapper(&child, base_key_nonexist, lsn, &ctx).await?, None ); + assert_eq!( + get_vectored_impl_wrapper(&child, base_inherited_key, lsn, &ctx).await?, + Some(test_img("metadata inherited key 1")) + ); + assert_eq!( + get_vectored_impl_wrapper(&child, base_inherited_key_child, lsn, &ctx).await?, + Some(test_img("metadata inherited key 2")) + ); + assert_eq!( + get_vectored_impl_wrapper(&child, base_inherited_key_nonexist, lsn, &ctx).await?, + None + ); + assert_eq!( + get_vectored_impl_wrapper(&child, base_key_overwrite, lsn, &ctx).await?, + Some(test_img("metadata key overwrite 2b")) + ); + assert_eq!( + get_vectored_impl_wrapper(&child, base_inherited_key_overwrite, lsn, &ctx).await?, + Some(test_img("metadata key overwrite 2a")) + ); + + // test vectored scan on parent timeline + let mut reconstruct_state = ValuesReconstructState::new(); + let res = tline + .get_vectored_impl( + KeySpace::single(Key::metadata_key_range()), + lsn, + &mut reconstruct_state, + &ctx, + ) + .await?; + + assert_eq!( + res.into_iter() + .map(|(k, v)| (k, v.unwrap())) + .collect::>(), + vec![ + (base_inherited_key, test_img("metadata inherited key 1")), + ( + base_inherited_key_overwrite, + test_img("metadata key overwrite 1a") + ), + (base_key, test_img("metadata key 1")), + (base_key_overwrite, test_img("metadata key overwrite 1b")), + ] + ); + + // test vectored scan on child timeline + let mut reconstruct_state = ValuesReconstructState::new(); + let res = child + .get_vectored_impl( + 
KeySpace::single(Key::metadata_key_range()), + lsn, + &mut reconstruct_state, + &ctx, + ) + .await?; + + assert_eq!( + res.into_iter() + .map(|(k, v)| (k, v.unwrap())) + .collect::>(), + vec![ + (base_inherited_key, test_img("metadata inherited key 1")), + ( + base_inherited_key_child, + test_img("metadata inherited key 2") + ), + ( + base_inherited_key_overwrite, + test_img("metadata key overwrite 2a") + ), + (base_key_child, test_img("metadata key 2")), + (base_key_overwrite, test_img("metadata key overwrite 2b")), + ] + ); Ok(()) } diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index b8206fca5a..3913637ca0 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -12,7 +12,7 @@ pub mod merge_iterator; use crate::context::{AccessStatsBehavior, RequestContext}; use bytes::Bytes; -use pageserver_api::key::{Key, NON_INHERITED_SPARSE_RANGE}; +use pageserver_api::key::Key; use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum}; use pageserver_api::record::NeonWalRecord; use pageserver_api::value::Value; @@ -209,7 +209,7 @@ impl ValuesReconstructState { .keys .entry(*key) .or_insert(Ok(VectoredValueReconstructState::default())); - let is_sparse_key = NON_INHERITED_SPARSE_RANGE.contains(key); + let is_sparse_key = key.is_sparse(); if let Ok(state) = state { let key_done = match state.situation { ValueReconstructSituation::Complete => { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index c1b71262e0..f7227efeba 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -27,7 +27,7 @@ use pageserver_api::{ config::tenant_conf_defaults::DEFAULT_COMPACTION_THRESHOLD, key::{ KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE, - NON_INHERITED_SPARSE_RANGE, + SPARSE_RANGE, }, keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning}, models::{ @@ -3221,7 +3221,7 @@ impl Timeline { // We don't return a blanket [`GetVectoredError::MissingKey`] to avoid // stalling compaction. keyspace.remove_overlapping_with(&KeySpace { - ranges: vec![NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE], + ranges: vec![NON_INHERITED_RANGE, Key::sparse_non_inherited_keyspace()], }); // Keyspace is fully retrieved @@ -3242,7 +3242,11 @@ impl Timeline { // keys from `keyspace`, we expect there to be no overlap between it and the image covered key // space. If that's not the case, we had at least one key encounter a gap in the image layer // and stop the search as a result of that. - let removed = keyspace.remove_overlapping_with(&image_covered_keyspace); + let mut removed = keyspace.remove_overlapping_with(&image_covered_keyspace); + // Do not fire missing key error for sparse keys. + removed.remove_overlapping_with(&KeySpace { + ranges: vec![SPARSE_RANGE], + }); if !removed.is_empty() { break Some(removed); } @@ -3257,6 +3261,21 @@ impl Timeline { timeline = &*timeline_owned; }; + // Remove sparse keys from the keyspace so that it doesn't fire errors. 
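        // [Editor's aside, not part of this patch] Sparse keyspaces may legitimately have
        // gaps (key.rs documents SPARSE_RANGE as "Missing key error will be ignored for
        // this range"), so any sparse keys still unresolved once the ancestor chain has
        // been exhausted are filtered out of `missing_keyspace` here instead of being
        // surfaced as a MissingKey error.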
+ let missing_keyspace = if let Some(missing_keyspace) = missing_keyspace { + let mut missing_keyspace = missing_keyspace; + missing_keyspace.remove_overlapping_with(&KeySpace { + ranges: vec![SPARSE_RANGE], + }); + if missing_keyspace.is_empty() { + None + } else { + Some(missing_keyspace) + } + } else { + None + }; + if let Some(missing_keyspace) = missing_keyspace { return Err(GetVectoredError::MissingKey(MissingKeyError { key: missing_keyspace.start().unwrap(), /* better if we can store the full keyspace */ From fd1368d31e2d159a518ac78dd6d5146a45a6de72 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 13 Jan 2025 19:33:00 +0000 Subject: [PATCH 175/583] storcon: rework scheduler optimisation, prioritize AZ (#9916) ## Problem We want to do a more robust job of scheduling tenants into their home AZ: https://github.com/neondatabase/neon/issues/8264. Closes: https://github.com/neondatabase/neon/issues/8969 ## Summary of changes ### Scope This PR combines prioritizing AZ with a larger rework of how we do optimisation. The rationale is that just bumping AZ in the order of Score attributes is a very tiny change: the interesting part is lining up all the optimisation logic to respect this properly, which means rewriting it to use the same scores as the scheduler, rather than the fragile hand-crafted logic that we had before. Separating these changes out is possible, but would involve doing two rounds of test updates instead of one. ### Scheduling optimisation `TenantShard`'s `optimize_attachment` and `optimize_secondary` methods now both use the scheduler to pick a new "favourite" location. Then there is some refined logic for whether + how to migrate to it: - To decide if a new location is sufficiently "better", we generate scores using some projected ScheduleContexts that exclude the shard under consideration, so that we avoid migrating from a node with AffinityScore(2) to a node with AffinityScore(1), only to migrate back later. - Score types get a `for_optimization` method so that when we compare scores, we will only do an optimisation if the scores differ by their highest-ranking attributes, not just because one pageserver is lower in utilization. Eventually we _will_ want a mode that does this, but doing it here would make scheduling logic unstable and harder to test, and to do this correctly one needs to know the size of the tenant that one is migrating. - When we find a new attached location that we would like to move to, we will create a new secondary location there, even if we already had one on some other node. This handles the case where we have a home AZ A, and want to migrate the attachment between pageservers in that AZ while retaining a secondary location in some other AZ as well. - A unit test is added for https://github.com/neondatabase/neon/issues/8969, which is implicitly fixed by reworking optimisation to use the same scheduling scores as scheduling. 
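
As a hedged illustration of the score comparison described above (illustrative names only, not the controller's actual `NodeAttachmentSchedulingScore`): scores compare lexicographically in field declaration order, and `for_optimization` zeroes the utilization component, so an optimisation is only proposed when a higher-ranked attribute such as AZ match or shard affinity actually differs:

```rust
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
struct SketchScore {
    az_mismatch: bool, // false (node in the preferred AZ) sorts before true
    affinity: u32,     // shards of the same tenant already on the node
    utilization: u64,  // shard-count / disk based utilization
}

impl SketchScore {
    // Sketch of the patch's `for_optimization`: drop the utilization component so that
    // optimisation never migrates a shard purely because utilization fluctuated.
    fn for_optimization(self) -> Self {
        Self { utilization: 0, ..self }
    }
}

fn main() {
    let current = SketchScore { az_mismatch: false, affinity: 1, utilization: 90 };
    let candidate = SketchScore { az_mismatch: false, affinity: 1, utilization: 10 };

    // A fresh scheduling decision would pick the less utilized candidate...
    assert!(candidate < current);
    // ...but for optimisation the two tie, so no migration (and no flapping) is proposed.
    assert_eq!(
        candidate.for_optimization().cmp(&current.for_optimization()),
        std::cmp::Ordering::Equal
    );
    println!("scores tie once utilization is disregarded; the shard stays put");
}
```

(The real `for_optimization` implementations also zero the node-id and shard-count tiebreakers, for the same reason.)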
--- libs/pageserver_api/src/controller_api.rs | 10 + storage_controller/Cargo.toml | 2 +- storage_controller/src/drain_utils.rs | 2 +- storage_controller/src/reconciler.rs | 16 +- storage_controller/src/scheduler.rs | 599 ++++++++--- storage_controller/src/service.rs | 244 ++++- .../src/service/context_iterator.rs | 9 +- storage_controller/src/tenant_shard.rs | 978 +++++++++++++----- .../performance/test_sharding_autosplit.py | 21 +- .../test_storage_controller_scale.py | 115 +- test_runner/regress/test_sharding.py | 32 +- .../regress/test_storage_controller.py | 40 +- 12 files changed, 1598 insertions(+), 470 deletions(-) diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index ba0eb0e4ae..f3aefc6df9 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -367,6 +367,16 @@ pub enum PlacementPolicy { Detached, } +impl PlacementPolicy { + pub fn want_secondaries(&self) -> usize { + match self { + PlacementPolicy::Attached(secondary_count) => *secondary_count, + PlacementPolicy::Secondary => 1, + PlacementPolicy::Detached => 0, + } + } +} + #[derive(Serialize, Deserialize, Debug)] pub struct TenantShardMigrateResponse {} diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index 5f3319512d..caaa22d0a5 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -55,4 +55,4 @@ r2d2 = { version = "0.8.10" } utils = { path = "../libs/utils/" } metrics = { path = "../libs/metrics/" } control_plane = { path = "../control_plane" } -workspace_hack = { version = "0.1", path = "../workspace_hack" } +workspace_hack = { version = "0.1", path = "../workspace_hack" } \ No newline at end of file diff --git a/storage_controller/src/drain_utils.rs b/storage_controller/src/drain_utils.rs index 47f4276ff2..8b7be88078 100644 --- a/storage_controller/src/drain_utils.rs +++ b/storage_controller/src/drain_utils.rs @@ -112,7 +112,7 @@ impl TenantShardDrain { } } - match scheduler.node_preferred(tenant_shard.intent.get_secondary()) { + match tenant_shard.preferred_secondary(scheduler) { Some(node) => Some(node), None => { tracing::warn!( diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 6f584e7267..adced3b77d 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -826,7 +826,21 @@ impl Reconciler { if self.cancel.is_cancelled() { return Err(ReconcileError::Cancel); } - self.location_config(&node, conf, None, false).await?; + // We only try to configure secondary locations if the node is available. This does + // not stop us succeeding with the reconcile, because our core goal is to make the + // shard _available_ (the attached location), and configuring secondary locations + // can be done lazily when the node becomes available (via background reconciliation). + if node.is_available() { + self.location_config(&node, conf, None, false).await?; + } else { + // If the node is unavailable, we skip and consider the reconciliation successful: this + // is a common case where a pageserver is marked unavailable: we demote a location on + // that unavailable pageserver to secondary. + tracing::info!("Skipping configuring secondary location {node}, it is unavailable"); + self.observed + .locations + .insert(node.get_id(), ObservedStateLocation { conf: None }); + } } // The condition below identifies a detach. 
We must have no attached intent and diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs index 51a4cf35be..04a594dcac 100644 --- a/storage_controller/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -32,6 +32,9 @@ pub(crate) struct SchedulerNode { shard_count: usize, /// How many shards are currently attached on this node, via their [`crate::tenant_shard::IntentState`]. attached_shard_count: usize, + /// How many shards have a location on this node (via [`crate::tenant_shard::IntentState`]) _and_ this node + /// is in their preferred AZ (i.e. this is their 'home' location) + home_shard_count: usize, /// Availability zone id in which the node resides az: AvailabilityZone, @@ -47,6 +50,12 @@ pub(crate) trait NodeSchedulingScore: Debug + Ord + Copy + Sized { preferred_az: &Option, context: &ScheduleContext, ) -> Option; + + /// Return a score that drops any components based on node utilization: this is useful + /// for finding scores for scheduling optimisation, when we want to avoid rescheduling + /// shards due to e.g. disk usage, to avoid flapping. + fn for_optimization(&self) -> Self; + fn is_overloaded(&self) -> bool; fn node_id(&self) -> NodeId; } @@ -136,17 +145,13 @@ impl PartialOrd for SecondaryAzMatch { /// Ordering is given by member declaration order (top to bottom). #[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)] pub(crate) struct NodeAttachmentSchedulingScore { - /// The number of shards belonging to the tenant currently being - /// scheduled that are attached to this node. - affinity_score: AffinityScore, /// Flag indicating whether this node matches the preferred AZ /// of the shard. For equal affinity scores, nodes in the matching AZ /// are considered first. az_match: AttachmentAzMatch, - /// Size of [`ScheduleContext::attached_nodes`] for the current node. - /// This normally tracks the number of attached shards belonging to the - /// tenant being scheduled that are already on this node. - attached_shards_in_context: usize, + /// The number of shards belonging to the tenant currently being + /// scheduled that are attached to this node. + affinity_score: AffinityScore, /// Utilisation score that combines shard count and disk utilisation utilization_score: u64, /// Total number of shards attached to this node. 
When nodes have identical utilisation, this @@ -177,13 +182,25 @@ impl NodeSchedulingScore for NodeAttachmentSchedulingScore { .copied() .unwrap_or(AffinityScore::FREE), az_match: AttachmentAzMatch(AzMatch::new(&node.az, preferred_az.as_ref())), - attached_shards_in_context: context.attached_nodes.get(node_id).copied().unwrap_or(0), utilization_score: utilization.cached_score(), total_attached_shard_count: node.attached_shard_count, node_id: *node_id, }) } + /// For use in scheduling optimisation, where we only want to consider the aspects + /// of the score that can only be resolved by moving things (such as inter-shard affinity + /// and AZ affinity), and ignore aspects that reflect the total utilization of a node (which + /// can fluctuate for other reasons) + fn for_optimization(&self) -> Self { + Self { + utilization_score: 0, + total_attached_shard_count: 0, + node_id: NodeId(0), + ..*self + } + } + fn is_overloaded(&self) -> bool { PageserverUtilization::is_overloaded(self.utilization_score) } @@ -208,9 +225,9 @@ pub(crate) struct NodeSecondarySchedulingScore { affinity_score: AffinityScore, /// Utilisation score that combines shard count and disk utilisation utilization_score: u64, - /// Total number of shards attached to this node. When nodes have identical utilisation, this - /// acts as an anti-affinity between attached shards. - total_attached_shard_count: usize, + /// Anti-affinity with other non-home locations: this gives the behavior that secondaries + /// will spread out across the nodes in an AZ. + total_non_home_shard_count: usize, /// Convenience to make selection deterministic in tests and empty systems node_id: NodeId, } @@ -237,11 +254,20 @@ impl NodeSchedulingScore for NodeSecondarySchedulingScore { .copied() .unwrap_or(AffinityScore::FREE), utilization_score: utilization.cached_score(), - total_attached_shard_count: node.attached_shard_count, + total_non_home_shard_count: (node.shard_count - node.home_shard_count), node_id: *node_id, }) } + fn for_optimization(&self) -> Self { + Self { + utilization_score: 0, + total_non_home_shard_count: 0, + node_id: NodeId(0), + ..*self + } + } + fn is_overloaded(&self) -> bool { PageserverUtilization::is_overloaded(self.utilization_score) } @@ -293,6 +319,10 @@ impl AffinityScore { pub(crate) fn inc(&mut self) { self.0 += 1; } + + pub(crate) fn dec(&mut self) { + self.0 -= 1; + } } impl std::ops::Add for AffinityScore { @@ -324,9 +354,6 @@ pub(crate) struct ScheduleContext { /// Sparse map of nodes: omitting a node implicitly makes its affinity [`AffinityScore::FREE`] pub(crate) nodes: HashMap, - /// Specifically how many _attached_ locations are on each node - pub(crate) attached_nodes: HashMap, - pub(crate) mode: ScheduleMode, } @@ -334,7 +361,6 @@ impl ScheduleContext { pub(crate) fn new(mode: ScheduleMode) -> Self { Self { nodes: HashMap::new(), - attached_nodes: HashMap::new(), mode, } } @@ -348,25 +374,31 @@ impl ScheduleContext { } } - pub(crate) fn push_attached(&mut self, node_id: NodeId) { - let entry = self.attached_nodes.entry(node_id).or_default(); - *entry += 1; - } - - pub(crate) fn get_node_affinity(&self, node_id: NodeId) -> AffinityScore { - self.nodes - .get(&node_id) - .copied() - .unwrap_or(AffinityScore::FREE) - } - - pub(crate) fn get_node_attachments(&self, node_id: NodeId) -> usize { - self.attached_nodes.get(&node_id).copied().unwrap_or(0) + /// Remove `shard`'s contributions to this context. This is useful when considering scheduling + /// this shard afresh, where we don't want it to e.g. 
experience anti-affinity to its current location. + pub(crate) fn project_detach(&self, shard: &TenantShard) -> Self { + let mut new_context = self.clone(); + + if let Some(attached) = shard.intent.get_attached() { + if let Some(score) = new_context.nodes.get_mut(attached) { + score.dec(); + } + } + + for secondary in shard.intent.get_secondary() { + if let Some(score) = new_context.nodes.get_mut(secondary) { + score.dec(); + } + } + + new_context } + /// For test, track the sum of AffinityScore values, which is effectively how many + /// attached or secondary locations have been registered with this context. #[cfg(test)] - pub(crate) fn attach_count(&self) -> usize { - self.attached_nodes.values().sum() + pub(crate) fn location_count(&self) -> usize { + self.nodes.values().map(|i| i.0).sum() } } @@ -388,6 +420,7 @@ impl Scheduler { SchedulerNode { shard_count: 0, attached_shard_count: 0, + home_shard_count: 0, may_schedule: node.may_schedule(), az: node.get_availability_zone_id().clone(), }, @@ -415,6 +448,7 @@ impl Scheduler { SchedulerNode { shard_count: 0, attached_shard_count: 0, + home_shard_count: 0, may_schedule: node.may_schedule(), az: node.get_availability_zone_id().clone(), }, @@ -427,6 +461,9 @@ impl Scheduler { Some(node) => { node.shard_count += 1; node.attached_shard_count += 1; + if Some(&node.az) == shard.preferred_az() { + node.home_shard_count += 1; + } } None => anyhow::bail!( "Tenant {} references nonexistent node {}", @@ -438,7 +475,12 @@ impl Scheduler { for node_id in shard.intent.get_secondary() { match expect_nodes.get_mut(node_id) { - Some(node) => node.shard_count += 1, + Some(node) => { + node.shard_count += 1; + if Some(&node.az) == shard.preferred_az() { + node.home_shard_count += 1; + } + } None => anyhow::bail!( "Tenant {} references nonexistent node {}", shard.tenant_shard_id, @@ -482,13 +524,20 @@ impl Scheduler { /// /// It is an error to call this for a node that is not known to the scheduler (i.e. 
passed into /// [`Self::new`] or [`Self::node_upsert`]) - pub(crate) fn update_node_ref_counts(&mut self, node_id: NodeId, update: RefCountUpdate) { + pub(crate) fn update_node_ref_counts( + &mut self, + node_id: NodeId, + preferred_az: Option<&AvailabilityZone>, + update: RefCountUpdate, + ) { let Some(node) = self.nodes.get_mut(&node_id) else { debug_assert!(false); tracing::error!("Scheduler missing node {node_id}"); return; }; + let is_home_az = Some(&node.az) == preferred_az; + match update { RefCountUpdate::PromoteSecondary => { node.attached_shard_count += 1; @@ -496,19 +545,31 @@ impl Scheduler { RefCountUpdate::Attach => { node.shard_count += 1; node.attached_shard_count += 1; + if is_home_az { + node.home_shard_count += 1; + } } RefCountUpdate::Detach => { node.shard_count -= 1; node.attached_shard_count -= 1; + if is_home_az { + node.home_shard_count -= 1; + } } RefCountUpdate::DemoteAttached => { node.attached_shard_count -= 1; } RefCountUpdate::AddSecondary => { node.shard_count += 1; + if is_home_az { + node.home_shard_count += 1; + } } RefCountUpdate::RemoveSecondary => { node.shard_count -= 1; + if is_home_az { + node.home_shard_count -= 1; + } } } @@ -594,6 +655,7 @@ impl Scheduler { entry.insert(SchedulerNode { shard_count: 0, attached_shard_count: 0, + home_shard_count: 0, may_schedule: node.may_schedule(), az: node.get_availability_zone_id().clone(), }); @@ -607,33 +669,20 @@ impl Scheduler { } } - /// Where we have several nodes to choose from, for example when picking a secondary location - /// to promote to an attached location, this method may be used to pick the best choice based - /// on the scheduler's knowledge of utilization and availability. - /// - /// If the input is empty, or all the nodes are not elegible for scheduling, return None: the - /// caller can pick a node some other way. - pub(crate) fn node_preferred(&self, nodes: &[NodeId]) -> Option { - if nodes.is_empty() { - return None; - } - - // TODO: When the utilization score returned by the pageserver becomes meaningful, - // schedule based on that instead of the shard count. - let node = nodes - .iter() - .map(|node_id| { - let may_schedule = self - .nodes - .get(node_id) - .map(|n| !matches!(n.may_schedule, MaySchedule::No)) - .unwrap_or(false); - (*node_id, may_schedule) - }) - .max_by_key(|(_n, may_schedule)| *may_schedule); - - // If even the preferred node has may_schedule==false, return None - node.and_then(|(node_id, may_schedule)| if may_schedule { Some(node_id) } else { None }) + /// Calculate a single node's score, used in optimizer logic to compare specific + /// nodes' scores. + pub(crate) fn compute_node_score( + &mut self, + node_id: NodeId, + preferred_az: &Option, + context: &ScheduleContext, + ) -> Option + where + Score: NodeSchedulingScore, + { + self.nodes + .get_mut(&node_id) + .and_then(|node| Score::generate(&node_id, node, preferred_az, context)) } /// Compute a schedulling score for each node that the scheduler knows of @@ -727,7 +776,7 @@ impl Scheduler { tracing::info!( "scheduler selected node {node_id} (elegible nodes {:?}, hard exclude: {hard_exclude:?}, soft exclude: {context:?})", scores.iter().map(|i| i.node_id().0).collect::>() - ); + ); } // Note that we do not update shard count here to reflect the scheduling: that @@ -743,47 +792,74 @@ impl Scheduler { } /// For choosing which AZ to schedule a new shard into, use this. It will return the - /// AZ with the lowest median utilization. 
+ /// AZ with the the lowest number of shards currently scheduled in this AZ as their home + /// location. /// /// We use an AZ-wide measure rather than simply selecting the AZ of the least-loaded /// node, because while tenants start out single sharded, when they grow and undergo - /// shard-split, they will occupy space on many nodes within an AZ. + /// shard-split, they will occupy space on many nodes within an AZ. It is important + /// that we pick the AZ in a way that balances this _future_ load. /// - /// We use median rather than total free space or mean utilization, because - /// we wish to avoid preferring AZs that have low-load nodes resulting from - /// recent replacements. - /// - /// The practical result is that we will pick an AZ based on its median node, and - /// then actually _schedule_ the new shard onto the lowest-loaded node in that AZ. + /// Once we've picked an AZ, subsequent scheduling within that AZ will be driven by + /// nodes' utilization scores. pub(crate) fn get_az_for_new_tenant(&self) -> Option { if self.nodes.is_empty() { return None; } - let mut scores_by_az = HashMap::new(); - for (node_id, node) in &self.nodes { - let az_scores = scores_by_az.entry(&node.az).or_insert_with(Vec::new); - let score = match &node.may_schedule { - MaySchedule::Yes(utilization) => utilization.score(), - MaySchedule::No => PageserverUtilization::full().score(), - }; - az_scores.push((node_id, node, score)); + #[derive(Default)] + struct AzScore { + home_shard_count: usize, + scheduleable: bool, } - // Sort by utilization. Also include the node ID to break ties. - for scores in scores_by_az.values_mut() { - scores.sort_by_key(|i| (i.2, i.0)); + let mut azs: HashMap<&AvailabilityZone, AzScore> = HashMap::new(); + for node in self.nodes.values() { + let az = azs.entry(&node.az).or_default(); + az.home_shard_count += node.home_shard_count; + az.scheduleable |= matches!(node.may_schedule, MaySchedule::Yes(_)); } - let mut median_by_az = scores_by_az + // If any AZs are schedulable, then filter out the non-schedulable ones (i.e. AZs where + // all nodes are overloaded or otherwise unschedulable). + if azs.values().any(|i| i.scheduleable) { + azs.retain(|_, i| i.scheduleable); + } + + // Find the AZ with the lowest number of shards currently allocated + Some( + azs.into_iter() + .min_by_key(|i| (i.1.home_shard_count, i.0)) + .unwrap() + .0 + .clone(), + ) + } + + pub(crate) fn get_node_az(&self, node_id: &NodeId) -> Option { + self.nodes.get(node_id).map(|n| n.az.clone()) + } + + /// For use when choosing a preferred secondary location: filter out nodes that are not + /// available, and gather their AZs. + pub(crate) fn filter_usable_nodes( + &self, + nodes: &[NodeId], + ) -> Vec<(NodeId, Option)> { + nodes .iter() - .map(|(az, nodes)| (*az, nodes.get(nodes.len() / 2).unwrap().2)) - .collect::>(); - // Sort by utilization. Also include the AZ to break ties. 
- median_by_az.sort_by_key(|i| (i.1, i.0)); - - // Return the AZ with the lowest median utilization - Some(median_by_az.first().unwrap().0.clone()) + .filter_map(|node_id| { + let node = self + .nodes + .get(node_id) + .expect("Referenced nodes always exist"); + if matches!(node.may_schedule, MaySchedule::Yes(_)) { + Some((*node_id, Some(node.az.clone()))) + } else { + None + } + }) + .collect() } /// Unit test access to internal state @@ -843,7 +919,14 @@ pub(crate) mod test_utils { #[cfg(test)] mod tests { - use pageserver_api::{controller_api::NodeAvailability, models::utilization::test_utilization}; + use pageserver_api::{ + controller_api::NodeAvailability, models::utilization::test_utilization, + shard::ShardIdentity, + }; + use utils::{ + id::TenantId, + shard::{ShardCount, ShardNumber, TenantShardId}, + }; use super::*; @@ -853,8 +936,8 @@ mod tests { let nodes = test_utils::make_test_nodes(2, &[]); let mut scheduler = Scheduler::new(nodes.values()); - let mut t1_intent = IntentState::new(); - let mut t2_intent = IntentState::new(); + let mut t1_intent = IntentState::new(None); + let mut t2_intent = IntentState::new(None); let context = ScheduleContext::default(); @@ -930,7 +1013,7 @@ mod tests { let scheduled = scheduler .schedule_shard::(&[], &None, context) .unwrap(); - let mut intent = IntentState::new(); + let mut intent = IntentState::new(None); intent.set_attached(scheduler, Some(scheduled)); scheduled_intents.push(intent); assert_eq!(scheduled, expect_node); @@ -1063,7 +1146,7 @@ mod tests { let scheduled = scheduler .schedule_shard::(&[], &preferred_az, context) .unwrap(); - let mut intent = IntentState::new(); + let mut intent = IntentState::new(preferred_az.clone()); intent.set_attached(scheduler, Some(scheduled)); scheduled_intents.push(intent); assert_eq!(scheduled, expect_node); @@ -1089,9 +1172,9 @@ mod tests { &mut context, ); - // Node 2 is not in "az-a", but it has the lowest affinity so we prefer that. + // Node 1 and 3 (az-a) have same affinity score, so prefer the lowest node id. assert_scheduler_chooses::( - NodeId(2), + NodeId(1), Some(az_a_tag.clone()), &mut scheduled_intents, &mut scheduler, @@ -1107,26 +1190,6 @@ mod tests { &mut context, ); - // Avoid nodes in "az-b" for the secondary location. - // Nodes 1 and 3 are identically loaded, so prefer the lowest node id. - assert_scheduler_chooses::( - NodeId(1), - Some(az_b_tag.clone()), - &mut scheduled_intents, - &mut scheduler, - &mut context, - ); - - // Avoid nodes in "az-b" for the secondary location. - // Node 3 has lower affinity score than 1, so prefer that. - assert_scheduler_chooses::( - NodeId(3), - Some(az_b_tag.clone()), - &mut scheduled_intents, - &mut scheduler, - &mut context, - ); - for mut intent in scheduled_intents { intent.clear(&mut scheduler); } @@ -1150,34 +1213,292 @@ mod tests { let mut scheduler = Scheduler::new(nodes.values()); - /// Force the utilization of a node in Scheduler's state to a particular - /// number of bytes used. - fn set_utilization(scheduler: &mut Scheduler, node_id: NodeId, shard_count: u32) { - let mut node = Node::new( - node_id, - "".to_string(), - 0, - "".to_string(), - 0, - scheduler.nodes.get(&node_id).unwrap().az.clone(), - ); - node.set_availability(NodeAvailability::Active(test_utilization::simple( - shard_count, - 0, - ))); - scheduler.node_upsert(&node); + /// Force the `home_shard_count` of a node directly: this is the metric used + /// by the scheduler when picking AZs. 
+ fn set_shard_count(scheduler: &mut Scheduler, node_id: NodeId, shard_count: usize) { + let node = scheduler.nodes.get_mut(&node_id).unwrap(); + node.home_shard_count = shard_count; } // Initial empty state. Scores are tied, scheduler prefers lower AZ ID. assert_eq!(scheduler.get_az_for_new_tenant(), Some(az_a_tag.clone())); - // Put some utilization on one node in AZ A: this should change nothing, as the median hasn't changed - set_utilization(&mut scheduler, NodeId(1), 1000000); - assert_eq!(scheduler.get_az_for_new_tenant(), Some(az_a_tag.clone())); - - // Put some utilization on a second node in AZ A: now the median has changed, so the scheduler - // should prefer the other AZ. - set_utilization(&mut scheduler, NodeId(2), 1000000); + // Home shard count is higher in AZ A, so AZ B will be preferred + set_shard_count(&mut scheduler, NodeId(1), 10); assert_eq!(scheduler.get_az_for_new_tenant(), Some(az_b_tag.clone())); + + // Total home shard count is higher in AZ B, so we revert to preferring AZ A + set_shard_count(&mut scheduler, NodeId(4), 6); + set_shard_count(&mut scheduler, NodeId(5), 6); + assert_eq!(scheduler.get_az_for_new_tenant(), Some(az_a_tag.clone())); + } + + /// Test that when selecting AZs for many new tenants, we get the expected balance across nodes + #[test] + fn az_selection_many() { + let az_a_tag = AvailabilityZone("az-a".to_string()); + let az_b_tag = AvailabilityZone("az-b".to_string()); + let az_c_tag = AvailabilityZone("az-c".to_string()); + let nodes = test_utils::make_test_nodes( + 6, + &[ + az_a_tag.clone(), + az_b_tag.clone(), + az_c_tag.clone(), + az_a_tag.clone(), + az_b_tag.clone(), + az_c_tag.clone(), + ], + ); + + let mut scheduler = Scheduler::new(nodes.values()); + + // We should get 1/6th of these on each node, give or take a few... + let total_tenants = 300; + + // ...where the 'few' is the number of AZs, because the scheduling will sometimes overshoot + // on one AZ before correcting itself. This is because we select the 'home' AZ based on + // an AZ-wide metric, but we select the location for secondaries on a purely node-based + // metric (while excluding the home AZ). 
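        // [Editor's aside, not part of this patch] The "AZ-wide metric" mentioned above is
        // the per-AZ sum of `home_shard_count`; `get_az_for_new_tenant` is effectively an
        // argmin over (total home shards, AZ id), with the AZ id acting as a deterministic
        // tie-breaker (see the scheduler hunk earlier in this patch).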
+ let grace = 3; + + let mut scheduled_shards = Vec::new(); + for _i in 0..total_tenants { + let preferred_az = scheduler.get_az_for_new_tenant().unwrap(); + + let mut node_home_counts = scheduler + .nodes + .iter() + .map(|(node_id, node)| (node_id, node.home_shard_count)) + .collect::>(); + node_home_counts.sort_by_key(|i| i.0); + eprintln!("Selected {}, vs nodes {:?}", preferred_az, node_home_counts); + + let tenant_shard_id = TenantShardId { + tenant_id: TenantId::generate(), + shard_number: ShardNumber(0), + shard_count: ShardCount(1), + }; + + let shard_identity = ShardIdentity::new( + tenant_shard_id.shard_number, + tenant_shard_id.shard_count, + pageserver_api::shard::ShardStripeSize(1), + ) + .unwrap(); + let mut shard = TenantShard::new( + tenant_shard_id, + shard_identity, + pageserver_api::controller_api::PlacementPolicy::Attached(1), + Some(preferred_az), + ); + + let mut context = ScheduleContext::default(); + shard.schedule(&mut scheduler, &mut context).unwrap(); + eprintln!("Scheduled shard at {:?}", shard.intent); + + scheduled_shards.push(shard); + } + + for (node_id, node) in &scheduler.nodes { + eprintln!( + "Node {}: {} {} {}", + node_id, node.shard_count, node.attached_shard_count, node.home_shard_count + ); + } + + for node in scheduler.nodes.values() { + assert!((node.home_shard_count as i64 - total_tenants as i64 / 6).abs() < grace); + } + + for mut shard in scheduled_shards { + shard.intent.clear(&mut scheduler); + } + } + + #[test] + /// Make sure that when we have an odd number of nodes and an even number of shards, we still + /// get scheduling stability. + fn odd_nodes_stability() { + let az_a = AvailabilityZone("az-a".to_string()); + let az_b = AvailabilityZone("az-b".to_string()); + + let nodes = test_utils::make_test_nodes( + 10, + &[ + az_a.clone(), + az_a.clone(), + az_a.clone(), + az_a.clone(), + az_a.clone(), + az_b.clone(), + az_b.clone(), + az_b.clone(), + az_b.clone(), + az_b.clone(), + ], + ); + let mut scheduler = Scheduler::new(nodes.values()); + + // Need to keep these alive because they contribute to shard counts via RAII + let mut scheduled_shards = Vec::new(); + + let mut context = ScheduleContext::default(); + + fn schedule_shard( + tenant_shard_id: TenantShardId, + expect_attached: NodeId, + expect_secondary: NodeId, + scheduled_shards: &mut Vec, + scheduler: &mut Scheduler, + preferred_az: Option, + context: &mut ScheduleContext, + ) { + let shard_identity = ShardIdentity::new( + tenant_shard_id.shard_number, + tenant_shard_id.shard_count, + pageserver_api::shard::ShardStripeSize(1), + ) + .unwrap(); + let mut shard = TenantShard::new( + tenant_shard_id, + shard_identity, + pageserver_api::controller_api::PlacementPolicy::Attached(1), + preferred_az, + ); + + shard.schedule(scheduler, context).unwrap(); + + assert_eq!(shard.intent.get_attached().unwrap(), expect_attached); + assert_eq!( + shard.intent.get_secondary().first().unwrap(), + &expect_secondary + ); + + scheduled_shards.push(shard); + } + + let tenant_id = TenantId::generate(); + + schedule_shard( + TenantShardId { + tenant_id, + shard_number: ShardNumber(0), + shard_count: ShardCount(8), + }, + NodeId(1), + NodeId(6), + &mut scheduled_shards, + &mut scheduler, + Some(az_a.clone()), + &mut context, + ); + + schedule_shard( + TenantShardId { + tenant_id, + shard_number: ShardNumber(1), + shard_count: ShardCount(8), + }, + NodeId(2), + NodeId(7), + &mut scheduled_shards, + &mut scheduler, + Some(az_a.clone()), + &mut context, + ); + + schedule_shard( + TenantShardId { + tenant_id, + 
shard_number: ShardNumber(2), + shard_count: ShardCount(8), + }, + NodeId(3), + NodeId(8), + &mut scheduled_shards, + &mut scheduler, + Some(az_a.clone()), + &mut context, + ); + + schedule_shard( + TenantShardId { + tenant_id, + shard_number: ShardNumber(3), + shard_count: ShardCount(8), + }, + NodeId(4), + NodeId(9), + &mut scheduled_shards, + &mut scheduler, + Some(az_a.clone()), + &mut context, + ); + + schedule_shard( + TenantShardId { + tenant_id, + shard_number: ShardNumber(4), + shard_count: ShardCount(8), + }, + NodeId(5), + NodeId(10), + &mut scheduled_shards, + &mut scheduler, + Some(az_a.clone()), + &mut context, + ); + + schedule_shard( + TenantShardId { + tenant_id, + shard_number: ShardNumber(5), + shard_count: ShardCount(8), + }, + NodeId(1), + NodeId(6), + &mut scheduled_shards, + &mut scheduler, + Some(az_a.clone()), + &mut context, + ); + + schedule_shard( + TenantShardId { + tenant_id, + shard_number: ShardNumber(6), + shard_count: ShardCount(8), + }, + NodeId(2), + NodeId(7), + &mut scheduled_shards, + &mut scheduler, + Some(az_a.clone()), + &mut context, + ); + + schedule_shard( + TenantShardId { + tenant_id, + shard_number: ShardNumber(7), + shard_count: ShardCount(8), + }, + NodeId(3), + NodeId(8), + &mut scheduled_shards, + &mut scheduler, + Some(az_a.clone()), + &mut context, + ); + + // Assert that the optimizer suggests nochanges, i.e. our initial scheduling was stable. + for shard in &scheduled_shards { + assert_eq!(shard.optimize_attachment(&mut scheduler, &context), None); + } + + for mut shard in scheduled_shards { + shard.intent.clear(&mut scheduler); + } } } diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 8aa263f0c3..dadcc44cfb 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -1404,7 +1404,11 @@ impl Service { // We will populate intent properly later in [`Self::startup_reconcile`], initially populate // it with what we can infer: the node for which a generation was most recently issued. - let mut intent = IntentState::new(); + let mut intent = IntentState::new( + tsp.preferred_az_id + .as_ref() + .map(|az| AvailabilityZone(az.clone())), + ); if let Some(generation_pageserver) = tsp.generation_pageserver.map(|n| NodeId(n as u64)) { if nodes.contains_key(&generation_pageserver) { @@ -2474,18 +2478,29 @@ impl Service { tenant_id: TenantId, _guard: &TracingExclusiveGuard, ) -> Result<(), ApiError> { - let present_in_memory = { + // Check if the tenant is present in memory, and select an AZ to use when loading + // if we will load it. + let load_in_az = { let locked = self.inner.read().unwrap(); - locked + let existing = locked .tenants .range(TenantShardId::tenant_range(tenant_id)) - .next() - .is_some() - }; + .next(); - if present_in_memory { - return Ok(()); - } + // If the tenant is not present in memory, we expect to load it from database, + // so let's figure out what AZ to load it into while we have self.inner locked. + if existing.is_none() { + locked + .scheduler + .get_az_for_new_tenant() + .ok_or(ApiError::BadRequest(anyhow::anyhow!( + "No AZ with nodes found to load tenant" + )))? + } else { + // We already have this tenant in memory + return Ok(()); + } + }; let tenant_shards = self.persistence.load_tenant(tenant_id).await?; if tenant_shards.is_empty() { @@ -2494,8 +2509,20 @@ impl Service { )); } - // TODO: choose a fresh AZ to use for this tenant when un-detaching: there definitely isn't a running - // compute, so no benefit to making AZ sticky across detaches. 
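        // [Editor's aside, not part of this patch] This removed TODO is exactly what the
        // hunk resolves: when a detached tenant is loaded back into memory, a fresh
        // preferred AZ is now chosen via `get_az_for_new_tenant` (see `load_in_az` above),
        // and that choice is persisted below before the in-memory shards are built.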
+ // Update the persistent shards with the AZ that we are about to apply to in-memory state + self.persistence + .set_tenant_shard_preferred_azs( + tenant_shards + .iter() + .map(|t| { + ( + t.get_tenant_shard_id().expect("Corrupt shard in database"), + load_in_az.clone(), + ) + }) + .collect(), + ) + .await?; let mut locked = self.inner.write().unwrap(); tracing::info!( @@ -2505,7 +2532,7 @@ impl Service { ); locked.tenants.extend(tenant_shards.into_iter().map(|p| { - let intent = IntentState::new(); + let intent = IntentState::new(Some(load_in_az.clone())); let shard = TenantShard::from_persistent(p, intent).expect("Corrupt shard row in database"); @@ -4236,6 +4263,22 @@ impl Service { } tracing::info!("Restoring parent shard {tenant_shard_id}"); + + // Drop any intents that refer to unavailable nodes, to enable this abort to proceed even + // if the original attachment location is offline. + if let Some(node_id) = shard.intent.get_attached() { + if !nodes.get(node_id).unwrap().is_available() { + tracing::info!("Demoting attached intent for {tenant_shard_id} on unavailable node {node_id}"); + shard.intent.demote_attached(scheduler, *node_id); + } + } + for node_id in shard.intent.get_secondary().clone() { + if !nodes.get(&node_id).unwrap().is_available() { + tracing::info!("Dropping secondary intent for {tenant_shard_id} on unavailable node {node_id}"); + shard.intent.remove_secondary(scheduler, node_id); + } + } + shard.splitting = SplitState::Idle; if let Err(e) = shard.schedule(scheduler, &mut ScheduleContext::default()) { // If this shard can't be scheduled now (perhaps due to offline nodes or @@ -4389,15 +4432,13 @@ impl Service { let mut child_state = TenantShard::new(child, child_shard, policy.clone(), preferred_az.clone()); - child_state.intent = IntentState::single(scheduler, Some(pageserver)); + child_state.intent = + IntentState::single(scheduler, Some(pageserver), preferred_az.clone()); child_state.observed = ObservedState { locations: child_observed, }; child_state.generation = Some(generation); child_state.config = config.clone(); - if let Some(preferred_az) = &preferred_az { - child_state.set_preferred_az(preferred_az.clone()); - } // The child's TenantShard::splitting is intentionally left at the default value of Idle, // as at this point in the split process we have succeeded and this part is infallible: @@ -5014,6 +5055,8 @@ impl Service { // If our new attached node was a secondary, it no longer should be. 
shard.intent.remove_secondary(scheduler, migrate_req.node_id); + shard.intent.set_attached(scheduler, Some(migrate_req.node_id)); + // If we were already attached to something, demote that to a secondary if let Some(old_attached) = old_attached { if n > 0 { @@ -5025,8 +5068,6 @@ impl Service { shard.intent.push_secondary(scheduler, old_attached); } } - - shard.intent.set_attached(scheduler, Some(migrate_req.node_id)); } PlacementPolicy::Secondary => { shard.intent.clear(scheduler); @@ -5712,7 +5753,7 @@ impl Service { register_req.listen_http_port, register_req.listen_pg_addr, register_req.listen_pg_port, - register_req.availability_zone_id, + register_req.availability_zone_id.clone(), ); // TODO: idempotency if the node already exists in the database @@ -5732,8 +5773,9 @@ impl Service { .set(locked.nodes.len() as i64); tracing::info!( - "Registered pageserver {}, now have {} pageservers", + "Registered pageserver {} ({}), now have {} pageservers", register_req.node_id, + register_req.availability_zone_id, locked.nodes.len() ); Ok(()) @@ -6467,6 +6509,7 @@ impl Service { // Shard was dropped between planning and execution; continue; }; + tracing::info!("Applying optimization: {optimization:?}"); if shard.apply_optimization(scheduler, optimization) { optimizations_applied += 1; if self.maybe_reconcile_shard(shard, nodes).is_some() { @@ -6497,7 +6540,13 @@ impl Service { let mut work = Vec::new(); let mut locked = self.inner.write().unwrap(); - let (nodes, tenants, scheduler) = locked.parts_mut(); + let (_nodes, tenants, scheduler) = locked.parts_mut(); + + // We are going to plan a bunch of optimisations before applying any of them, so the + // utilisation stats on nodes will be effectively stale for the >1st optimisation we + // generate. To avoid this causing unstable migrations/flapping, it's important that the + // code in TenantShard for finding optimisations uses [`NodeAttachmentSchedulingScore::disregard_utilization`] + // to ignore the utilisation component of the score. for (_tenant_id, schedule_context, shards) in TenantShardContextIterator::new(tenants, ScheduleMode::Speculative) @@ -6528,13 +6577,28 @@ impl Service { continue; } - // TODO: optimization calculations are relatively expensive: create some fast-path for - // the common idle case (avoiding the search on tenants that we have recently checked) + // Fast path: we may quickly identify shards that don't have any possible optimisations + if !shard.maybe_optimizable(scheduler, &schedule_context) { + if cfg!(feature = "testing") { + // Check that maybe_optimizable doesn't disagree with the actual optimization functions. + // Only do this in testing builds because it is not a correctness-critical check, so we shouldn't + // panic in prod if we hit this, or spend cycles on it in prod. + assert!(shard + .optimize_attachment(scheduler, &schedule_context) + .is_none()); + assert!(shard + .optimize_secondary(scheduler, &schedule_context) + .is_none()); + } + continue; + } + if let Some(optimization) = - // If idle, maybe ptimize attachments: if a shard has a secondary location that is preferable to + // If idle, maybe optimize attachments: if a shard has a secondary location that is preferable to // its primary location based on soft constraints, cut it over. 
- shard.optimize_attachment(nodes, &schedule_context) + shard.optimize_attachment(scheduler, &schedule_context) { + tracing::info!(tenant_shard_id=%shard.tenant_shard_id, "Identified optimization for attachment: {optimization:?}"); work.push((shard.tenant_shard_id, optimization)); break; } else if let Some(optimization) = @@ -6544,6 +6608,7 @@ impl Service { // in the same tenant with secondary locations on the node where they originally split. shard.optimize_secondary(scheduler, &schedule_context) { + tracing::info!(tenant_shard_id=%shard.tenant_shard_id, "Identified optimization for secondary: {optimization:?}"); work.push((shard.tenant_shard_id, optimization)); break; } @@ -6592,8 +6657,10 @@ impl Service { } } } - ScheduleOptimizationAction::ReplaceSecondary(_) => { - // No extra checks needed to replace a secondary: this does not interrupt client access + ScheduleOptimizationAction::ReplaceSecondary(_) + | ScheduleOptimizationAction::CreateSecondary(_) + | ScheduleOptimizationAction::RemoveSecondary(_) => { + // No extra checks needed to manage secondaries: this does not interrupt client access validated_work.push((tenant_shard_id, optimization)) } }; @@ -6665,26 +6732,35 @@ impl Service { /// we have this helper to move things along faster. #[cfg(feature = "testing")] async fn kick_secondary_download(&self, tenant_shard_id: TenantShardId) { - let (attached_node, secondary_node) = { + let (attached_node, secondaries) = { let locked = self.inner.read().unwrap(); let Some(shard) = locked.tenants.get(&tenant_shard_id) else { + tracing::warn!( + "Skipping kick of secondary download for {tenant_shard_id}: not found" + ); return; }; - let (Some(attached), Some(secondary)) = ( - shard.intent.get_attached(), - shard.intent.get_secondary().first(), - ) else { + + let Some(attached) = shard.intent.get_attached() else { + tracing::warn!( + "Skipping kick of secondary download for {tenant_shard_id}: no attached" + ); return; }; - ( - locked.nodes.get(attached).unwrap().clone(), - locked.nodes.get(secondary).unwrap().clone(), - ) + + let secondaries = shard + .intent + .get_secondary() + .iter() + .map(|n| locked.nodes.get(n).unwrap().clone()) + .collect::>(); + + (locked.nodes.get(attached).unwrap().clone(), secondaries) }; // Make remote API calls to upload + download heatmaps: we ignore errors because this is just // a 'kick' to let scheduling optimisation run more promptly. 
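The hunk that follows rewrites the heatmap "kick" so that every secondary is contacted (not just the first) and the outcome of each `with_client_retries` call is inspected rather than discarded. Judging by the log messages in that hunk, the call resolves to `None` when it is cancelled and to `Some(Ok(..))` / `Some(Err(..))` otherwise; a generic, purely illustrative handler for that tri-state shape (not the controller's actual helper) is:

```rust
use std::fmt::Display;

/// Illustrative handler for an `Option<Result<T, E>>`-shaped outcome, where
/// `None` means "cancelled before completion" rather than failure.
fn log_kick_outcome<T, E: Display>(outcome: Option<Result<T, E>>, what: &str) {
    match outcome {
        Some(Ok(_)) => println!("Successfully completed {what}"),
        Some(Err(e)) => println!("Failed to complete {what}: {e}"),
        // Cancellation is expected during shutdown, so it is reported at the
        // same low severity as the other outcomes instead of being escalated.
        None => println!("Cancelled while attempting {what}"),
    }
}
```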
- attached_node + match attached_node .with_client_retries( |client| async move { client.tenant_heatmap_upload(tenant_shard_id).await }, &self.config.jwt_token, @@ -6693,22 +6769,57 @@ impl Service { SHORT_RECONCILE_TIMEOUT, &self.cancel, ) - .await; + .await + { + Some(Err(e)) => { + tracing::info!( + "Failed to upload heatmap from {attached_node} for {tenant_shard_id}: {e}" + ); + } + None => { + tracing::info!( + "Cancelled while uploading heatmap from {attached_node} for {tenant_shard_id}" + ); + } + Some(Ok(_)) => { + tracing::info!( + "Successfully uploaded heatmap from {attached_node} for {tenant_shard_id}" + ); + } + } - secondary_node - .with_client_retries( - |client| async move { - client - .tenant_secondary_download(tenant_shard_id, Some(Duration::from_secs(1))) - .await - }, - &self.config.jwt_token, - 3, - 10, - SHORT_RECONCILE_TIMEOUT, - &self.cancel, - ) - .await; + for secondary_node in secondaries { + match secondary_node + .with_client_retries( + |client| async move { + client + .tenant_secondary_download( + tenant_shard_id, + Some(Duration::from_secs(1)), + ) + .await + }, + &self.config.jwt_token, + 3, + 10, + SHORT_RECONCILE_TIMEOUT, + &self.cancel, + ) + .await + { + Some(Err(e)) => { + tracing::info!( + "Failed to download heatmap from {secondary_node} for {tenant_shard_id}: {e}" + ); + } + None => { + tracing::info!("Cancelled while downloading heatmap from {secondary_node} for {tenant_shard_id}"); + } + Some(Ok(progress)) => { + tracing::info!("Successfully downloaded heatmap from {secondary_node} for {tenant_shard_id}: {progress:?}"); + } + } + } } /// Look for shards which are oversized and in need of splitting @@ -7144,9 +7255,15 @@ impl Service { fn fill_node_plan(&self, node_id: NodeId) -> Vec { let mut locked = self.inner.write().unwrap(); let fill_requirement = locked.scheduler.compute_fill_requirement(node_id); + let (nodes, tenants, _scheduler) = locked.parts_mut(); - let mut tids_by_node = locked - .tenants + let node_az = nodes + .get(&node_id) + .expect("Node must exist") + .get_availability_zone_id() + .clone(); + + let mut tids_by_node = tenants .iter_mut() .filter_map(|(tid, tenant_shard)| { if !matches!( @@ -7159,6 +7276,25 @@ impl Service { return None; } + // AZ check: when filling nodes after a restart, our intent is to move _back_ the + // shards which belong on this node, not to promote shards whose scheduling preference + // would be on their currently attached node. So will avoid promoting shards whose + // home AZ doesn't match the AZ of the node we're filling. + match tenant_shard.preferred_az() { + None => { + // Shard doesn't have an AZ preference: it is elegible to be moved. + } + Some(az) if az == &node_az => { + // This shard's home AZ is equal to the node we're filling: it is + // elegible to be moved: fall through; + } + Some(_) => { + // This shard's home AZ is somewhere other than the node we're filling: + // do not include it in the fill plan. 
+ return None; + } + } + if tenant_shard.intent.get_secondary().contains(&node_id) { if let Some(primary) = tenant_shard.intent.get_attached() { return Some((*primary, *tid)); diff --git a/storage_controller/src/service/context_iterator.rs b/storage_controller/src/service/context_iterator.rs index d38010a27e..dd6913e988 100644 --- a/storage_controller/src/service/context_iterator.rs +++ b/storage_controller/src/service/context_iterator.rs @@ -43,9 +43,6 @@ impl<'a> Iterator for TenantShardContextIterator<'a> { // Accumulate the schedule context for all the shards in a tenant schedule_context.avoid(&shard.intent.all_pageservers()); - if let Some(attached) = shard.intent.get_attached() { - schedule_context.push_attached(*attached); - } tenant_shards.push(shard); if tenant_shard_id.shard_number.0 == tenant_shard_id.shard_count.count() - 1 { @@ -115,7 +112,7 @@ mod tests { assert_eq!(tenant_id, t1_id); assert_eq!(shards[0].tenant_shard_id.shard_number, ShardNumber(0)); assert_eq!(shards.len(), 1); - assert_eq!(context.attach_count(), 1); + assert_eq!(context.location_count(), 2); let (tenant_id, context, shards) = iter.next().unwrap(); assert_eq!(tenant_id, t2_id); @@ -124,13 +121,13 @@ mod tests { assert_eq!(shards[2].tenant_shard_id.shard_number, ShardNumber(2)); assert_eq!(shards[3].tenant_shard_id.shard_number, ShardNumber(3)); assert_eq!(shards.len(), 4); - assert_eq!(context.attach_count(), 4); + assert_eq!(context.location_count(), 8); let (tenant_id, context, shards) = iter.next().unwrap(); assert_eq!(tenant_id, t3_id); assert_eq!(shards[0].tenant_shard_id.shard_number, ShardNumber(0)); assert_eq!(shards.len(), 1); - assert_eq!(context.attach_count(), 1); + assert_eq!(context.location_count(), 2); for shard in tenants.values_mut() { shard.intent.clear(&mut scheduler); diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index e0a71b5822..2ba2a57eba 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -11,16 +11,14 @@ use crate::{ persistence::TenantShardPersistence, reconciler::{ReconcileUnits, ReconcilerConfig}, scheduler::{ - AffinityScore, AttachedShardTag, MaySchedule, RefCountUpdate, ScheduleContext, - SecondaryShardTag, + AffinityScore, AttachedShardTag, NodeSchedulingScore, NodeSecondarySchedulingScore, + RefCountUpdate, ScheduleContext, SecondaryShardTag, ShardTag, }, service::ReconcileResultRequest, }; use futures::future::{self, Either}; use itertools::Itertools; -use pageserver_api::controller_api::{ - AvailabilityZone, NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy, -}; +use pageserver_api::controller_api::{AvailabilityZone, PlacementPolicy, ShardSchedulingPolicy}; use pageserver_api::{ models::{LocationConfig, LocationConfigMode, TenantConfig}, shard::{ShardIdentity, TenantShardId}, @@ -33,6 +31,7 @@ use utils::{ generation::Generation, id::NodeId, seqwait::{SeqWait, SeqWaitError}, + shard::ShardCount, sync::gate::GateGuard, }; @@ -147,45 +146,67 @@ pub(crate) struct TenantShard { // Support/debug tool: if something is going wrong or flapping with scheduling, this may // be set to a non-active state to avoid making changes while the issue is fixed. scheduling_policy: ShardSchedulingPolicy, +} + +#[derive(Clone, Debug, Serialize)] +pub(crate) struct IntentState { + attached: Option, + secondary: Vec, // We should attempt to schedule this shard in the provided AZ to // decrease chances of cross-AZ compute. 
preferred_az_id: Option, } -#[derive(Default, Clone, Debug, Serialize)] -pub(crate) struct IntentState { - attached: Option, - secondary: Vec, -} - impl IntentState { - pub(crate) fn new() -> Self { + pub(crate) fn new(preferred_az_id: Option) -> Self { Self { attached: None, secondary: vec![], + preferred_az_id, } } - pub(crate) fn single(scheduler: &mut Scheduler, node_id: Option) -> Self { + pub(crate) fn single( + scheduler: &mut Scheduler, + node_id: Option, + preferred_az_id: Option, + ) -> Self { if let Some(node_id) = node_id { - scheduler.update_node_ref_counts(node_id, RefCountUpdate::Attach); + scheduler.update_node_ref_counts( + node_id, + preferred_az_id.as_ref(), + RefCountUpdate::Attach, + ); } Self { attached: node_id, secondary: vec![], + preferred_az_id, } } pub(crate) fn set_attached(&mut self, scheduler: &mut Scheduler, new_attached: Option) { if self.attached != new_attached { if let Some(old_attached) = self.attached.take() { - scheduler.update_node_ref_counts(old_attached, RefCountUpdate::Detach); + scheduler.update_node_ref_counts( + old_attached, + self.preferred_az_id.as_ref(), + RefCountUpdate::Detach, + ); } if let Some(new_attached) = &new_attached { - scheduler.update_node_ref_counts(*new_attached, RefCountUpdate::Attach); + scheduler.update_node_ref_counts( + *new_attached, + self.preferred_az_id.as_ref(), + RefCountUpdate::Attach, + ); } self.attached = new_attached; } + + if let Some(new_attached) = &new_attached { + assert!(!self.secondary.contains(new_attached)); + } } /// Like set_attached, but the node is from [`Self::secondary`]. This swaps the node from @@ -204,15 +225,28 @@ impl IntentState { let demoted = self.attached; self.attached = Some(promote_secondary); - scheduler.update_node_ref_counts(promote_secondary, RefCountUpdate::PromoteSecondary); + scheduler.update_node_ref_counts( + promote_secondary, + self.preferred_az_id.as_ref(), + RefCountUpdate::PromoteSecondary, + ); if let Some(demoted) = demoted { - scheduler.update_node_ref_counts(demoted, RefCountUpdate::DemoteAttached); + scheduler.update_node_ref_counts( + demoted, + self.preferred_az_id.as_ref(), + RefCountUpdate::DemoteAttached, + ); } } pub(crate) fn push_secondary(&mut self, scheduler: &mut Scheduler, new_secondary: NodeId) { - debug_assert!(!self.secondary.contains(&new_secondary)); - scheduler.update_node_ref_counts(new_secondary, RefCountUpdate::AddSecondary); + assert!(!self.secondary.contains(&new_secondary)); + assert!(self.attached != Some(new_secondary)); + scheduler.update_node_ref_counts( + new_secondary, + self.preferred_az_id.as_ref(), + RefCountUpdate::AddSecondary, + ); self.secondary.push(new_secondary); } @@ -220,27 +254,43 @@ impl IntentState { pub(crate) fn remove_secondary(&mut self, scheduler: &mut Scheduler, node_id: NodeId) { let index = self.secondary.iter().position(|n| *n == node_id); if let Some(index) = index { - scheduler.update_node_ref_counts(node_id, RefCountUpdate::RemoveSecondary); + scheduler.update_node_ref_counts( + node_id, + self.preferred_az_id.as_ref(), + RefCountUpdate::RemoveSecondary, + ); self.secondary.remove(index); } } pub(crate) fn clear_secondary(&mut self, scheduler: &mut Scheduler) { for secondary in self.secondary.drain(..) 
{ - scheduler.update_node_ref_counts(secondary, RefCountUpdate::RemoveSecondary); + scheduler.update_node_ref_counts( + secondary, + self.preferred_az_id.as_ref(), + RefCountUpdate::RemoveSecondary, + ); } } /// Remove the last secondary node from the list of secondaries pub(crate) fn pop_secondary(&mut self, scheduler: &mut Scheduler) { if let Some(node_id) = self.secondary.pop() { - scheduler.update_node_ref_counts(node_id, RefCountUpdate::RemoveSecondary); + scheduler.update_node_ref_counts( + node_id, + self.preferred_az_id.as_ref(), + RefCountUpdate::RemoveSecondary, + ); } } pub(crate) fn clear(&mut self, scheduler: &mut Scheduler) { if let Some(old_attached) = self.attached.take() { - scheduler.update_node_ref_counts(old_attached, RefCountUpdate::Detach); + scheduler.update_node_ref_counts( + old_attached, + self.preferred_az_id.as_ref(), + RefCountUpdate::Detach, + ); } self.clear_secondary(scheduler); @@ -275,7 +325,11 @@ impl IntentState { if self.attached == Some(node_id) { self.attached = None; self.secondary.push(node_id); - scheduler.update_node_ref_counts(node_id, RefCountUpdate::DemoteAttached); + scheduler.update_node_ref_counts( + node_id, + self.preferred_az_id.as_ref(), + RefCountUpdate::DemoteAttached, + ); true } else { false @@ -315,6 +369,7 @@ pub(crate) struct ObservedStateLocation { /// we know that we might have some state on this node. pub(crate) conf: Option, } + pub(crate) struct ReconcilerWaiter { // For observability purposes, remember the ID of the shard we're // waiting for. @@ -360,6 +415,10 @@ pub(crate) enum ScheduleOptimizationAction { ReplaceSecondary(ReplaceSecondary), // Migrate attachment to an existing secondary location MigrateAttachment(MigrateAttachment), + // Create a secondary location, with the intent of later migrating to it + CreateSecondary(NodeId), + // Remove a secondary location that we previously created to facilitate a migration + RemoveSecondary(NodeId), } #[derive(Eq, PartialEq, Debug, Clone)] @@ -486,7 +545,7 @@ impl TenantShard { Self { tenant_shard_id, policy, - intent: IntentState::default(), + intent: IntentState::new(preferred_az_id), generation: Some(Generation::new(0)), shard, observed: ObservedState::default(), @@ -500,7 +559,6 @@ impl TenantShard { last_error: Arc::default(), pending_compute_notification: false, scheduling_policy: ShardSchedulingPolicy::default(), - preferred_az_id, } } @@ -563,7 +621,7 @@ impl TenantShard { return Ok((false, node_id)); } - if let Some(promote_secondary) = scheduler.node_preferred(&self.intent.secondary) { + if let Some(promote_secondary) = self.preferred_secondary(scheduler) { // Promote a secondary tracing::debug!("Promoted secondary {} to attached", promote_secondary); self.intent.promote_attached(scheduler, promote_secondary); @@ -572,7 +630,7 @@ impl TenantShard { // Pick a fresh node: either we had no secondaries or none were schedulable let node_id = scheduler.schedule_shard::( &self.intent.secondary, - &self.preferred_az_id, + &self.intent.preferred_az_id, context, )?; tracing::debug!("Selected {} as attached", node_id); @@ -594,9 +652,6 @@ impl TenantShard { let r = self.do_schedule(scheduler, context); context.avoid(&self.intent.all_pageservers()); - if let Some(attached) = self.intent.get_attached() { - context.push_attached(*attached); - } r } @@ -631,24 +686,7 @@ impl TenantShard { use PlacementPolicy::*; match self.policy { Attached(secondary_count) => { - let retain_secondaries = if self.intent.attached.is_none() - && 
scheduler.node_preferred(&self.intent.secondary).is_some() - { - // If we have no attached, and one of the secondaries is elegible to be promoted, retain - // one more secondary than we usually would, as one of them will become attached futher down this function. - secondary_count + 1 - } else { - secondary_count - }; - - while self.intent.secondary.len() > retain_secondaries { - // We have no particular preference for one secondary location over another: just - // arbitrarily drop from the end - self.intent.pop_secondary(scheduler); - modified = true; - } - - // Should have exactly one attached, and N secondaries + // Should have exactly one attached, and at least N secondaries let (modified_attached, attached_node_id) = self.schedule_attached(scheduler, context)?; modified |= modified_attached; @@ -657,7 +695,7 @@ impl TenantShard { while self.intent.secondary.len() < secondary_count { let node_id = scheduler.schedule_shard::( &used_pageservers, - &self.preferred_az_id, + &self.intent.preferred_az_id, context, )?; self.intent.push_secondary(scheduler, node_id); @@ -674,7 +712,7 @@ impl TenantShard { // Populate secondary by scheduling a fresh node let node_id = scheduler.schedule_shard::( &[], - &self.preferred_az_id, + &self.intent.preferred_az_id, context, )?; self.intent.push_secondary(scheduler, node_id); @@ -718,7 +756,7 @@ impl TenantShard { ) -> Result<(), ScheduleError> { let promote_to = match promote_to { Some(node) => node, - None => match scheduler.node_preferred(self.intent.get_secondary()) { + None => match self.preferred_secondary(scheduler) { Some(node) => node, None => { return Err(ScheduleError::ImpossibleConstraint); @@ -745,90 +783,276 @@ impl TenantShard { Ok(()) } + /// Returns None if the current location's score is unavailable, i.e. cannot draw a conclusion + fn is_better_location( + &self, + scheduler: &mut Scheduler, + schedule_context: &ScheduleContext, + current: NodeId, + candidate: NodeId, + ) -> Option { + let Some(candidate_score) = scheduler.compute_node_score::( + candidate, + &self.intent.preferred_az_id, + schedule_context, + ) else { + // The candidate node is unavailable for scheduling or otherwise couldn't get a score + return None; + }; + + match scheduler.compute_node_score::( + current, + &self.intent.preferred_az_id, + schedule_context, + ) { + Some(current_score) => { + // Ignore utilization components when comparing scores: we don't want to migrate + // because of transient load variations, it risks making the system thrash, and + // migrating for utilization requires a separate high level view of the system to + // e.g. prioritize moving larger or smaller tenants, rather than arbitrarily + // moving things around in the order that we hit this function. + let candidate_score = candidate_score.for_optimization(); + let current_score = current_score.for_optimization(); + + if candidate_score < current_score { + tracing::info!("Found a lower scoring location! {candidate} is better than {current} ({candidate_score:?} is better than {current_score:?})"); + Some(true) + } else { + // The candidate node is no better than our current location, so don't migrate + tracing::debug!( + "Candidate node {candidate} is no better than our current location {current} (candidate {candidate_score:?} vs current {current_score:?})", + ); + Some(false) + } + } + None => { + // The current node is unavailable for scheduling, so we can't make any sensible + // decisions about optimisation. 
This should be a transient state -- if the node + // is offline then it will get evacuated, if is blocked by a scheduling mode + // then we will respect that mode by doing nothing. + tracing::debug!("Current node {current} is unavailable for scheduling"); + None + } + } + } + + fn find_better_location( + &self, + scheduler: &mut Scheduler, + schedule_context: &ScheduleContext, + current: NodeId, + hard_exclude: &[NodeId], + ) -> Option { + // Look for a lower-scoring location to attach to + let Ok(candidate_node) = scheduler.schedule_shard::( + hard_exclude, + &self.intent.preferred_az_id, + schedule_context, + ) else { + // A scheduling error means we have no possible candidate replacements + tracing::debug!("No candidate node found"); + return None; + }; + + if candidate_node == current { + // We're already at the best possible location, so don't migrate + tracing::debug!("Candidate node {candidate_node} is already in use"); + return None; + } + + self.is_better_location::(scheduler, schedule_context, current, candidate_node) + .and_then(|better| if better { Some(candidate_node) } else { None }) + } + + /// This function is an optimization, used to avoid doing large numbers of scheduling operations + /// when looking for optimizations. This function uses knowledge of how scores work to do some + /// fast checks for whether it may to be possible to improve a score. + /// + /// If we return true, it only means that optimization _might_ be possible, not that it necessarily is. If we + /// return no, it definitely means that calling [`Self::optimize_attachment`] or [`Self::optimize_secondary`] would do no + /// work. + pub(crate) fn maybe_optimizable( + &self, + scheduler: &mut Scheduler, + schedule_context: &ScheduleContext, + ) -> bool { + // Sharded tenant: check if any locations have a nonzero affinity score + if self.shard.count >= ShardCount(1) { + let schedule_context = schedule_context.project_detach(self); + for node in self.intent.all_pageservers() { + if let Some(af) = schedule_context.nodes.get(&node) { + if *af > AffinityScore(0) { + return true; + } + } + } + } + + // Attached tenant: check if the attachment is outside the preferred AZ + if let PlacementPolicy::Attached(_) = self.policy { + if let Some(attached) = self.intent.get_attached() { + if scheduler.get_node_az(attached) != self.intent.preferred_az_id { + return true; + } + } + } + + // Tenant with secondary locations: check if any are within the preferred AZ + for secondary in self.intent.get_secondary() { + if scheduler.get_node_az(secondary) == self.intent.preferred_az_id { + return true; + } + } + + // Does the tenant have excess secondaries? + if self.intent.get_secondary().len() > self.policy.want_secondaries() { + return true; + } + + // Fall through: no optimizations possible + false + } + /// Optimize attachments: if a shard has a secondary location that is preferable to /// its primary location based on soft constraints, switch that secondary location /// to be attached. #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] pub(crate) fn optimize_attachment( &self, - nodes: &HashMap, + scheduler: &mut Scheduler, schedule_context: &ScheduleContext, ) -> Option { let attached = (*self.intent.get_attached())?; - if self.intent.secondary.is_empty() { - // We can only do useful work if we have both attached and secondary locations: this - // function doesn't schedule new locations, only swaps between attached and secondaries. 
- return None; - } - let current_affinity_score = schedule_context.get_node_affinity(attached); - let current_attachment_count = schedule_context.get_node_attachments(attached); + let schedule_context = schedule_context.project_detach(self); - // Generate score for each node, dropping any un-schedulable nodes. - let all_pageservers = self.intent.all_pageservers(); - let mut scores = all_pageservers - .iter() - .flat_map(|node_id| { - let node = nodes.get(node_id); - if node.is_none() { - None - } else if matches!( - node.unwrap().get_scheduling(), - NodeSchedulingPolicy::Filling - ) { - // If the node is currently filling, don't count it as a candidate to avoid, - // racing with the background fill. - None - } else if matches!(node.unwrap().may_schedule(), MaySchedule::No) { - None - } else { - let affinity_score = schedule_context.get_node_affinity(*node_id); - let attachment_count = schedule_context.get_node_attachments(*node_id); - Some((*node_id, affinity_score, attachment_count)) - } - }) - .collect::>(); - - // Sort precedence: - // 1st - prefer nodes with the lowest total affinity score - // 2nd - prefer nodes with the lowest number of attachments in this context - // 3rd - if all else is equal, sort by node ID for determinism in tests. - scores.sort_by_key(|i| (i.1, i.2, i.0)); - - if let Some((preferred_node, preferred_affinity_score, preferred_attachment_count)) = - scores.first() - { - if attached != *preferred_node { - // The best alternative must be more than 1 better than us, otherwise we could end - // up flapping back next time we're called (e.g. there's no point migrating from - // a location with score 1 to a score zero, because on next location the situation - // would be the same, but in reverse). - if current_affinity_score > *preferred_affinity_score + AffinityScore(1) - || current_attachment_count > *preferred_attachment_count + 1 - { - tracing::info!( - "Identified optimization: migrate attachment {attached}->{preferred_node} (secondaries {:?})", - self.intent.get_secondary() - ); - return Some(ScheduleOptimization { - sequence: self.sequence, - action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { - old_attached_node_id: attached, - new_attached_node_id: *preferred_node, - }), - }); - } - } else { - tracing::debug!( - "Node {} is already preferred (score {:?})", - preferred_node, - preferred_affinity_score - ); + // If we already have a secondary that is higher-scoring than out current location, + // then simply migrate to it. + for secondary in self.intent.get_secondary() { + if let Some(true) = self.is_better_location::( + scheduler, + &schedule_context, + attached, + *secondary, + ) { + return Some(ScheduleOptimization { + sequence: self.sequence, + action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { + old_attached_node_id: attached, + new_attached_node_id: *secondary, + }), + }); } } - // Fall-through: we didn't find an optimization - None + // Given that none of our current secondaries is a better location than our current + // attached location (checked above), we may trim any secondaries that are not needed + // for the placement policy. + if self.intent.get_secondary().len() > self.policy.want_secondaries() { + // This code path cleans up extra secondaries after migrating, and/or + // trims extra secondaries after a PlacementPolicy::Attached(N) was + // modified to decrease N. 
+ + let secondary_scores = self + .intent + .get_secondary() + .iter() + .map(|node_id| { + ( + *node_id, + scheduler.compute_node_score::( + *node_id, + &self.intent.preferred_az_id, + &schedule_context, + ), + ) + }) + .collect::>(); + + if secondary_scores.iter().any(|score| score.1.is_none()) { + // Don't have full list of scores, so can't make a good decision about which to drop unless + // there is an obvious one in the wrong AZ + for secondary in self.intent.get_secondary() { + if scheduler.get_node_az(secondary) == self.intent.preferred_az_id { + return Some(ScheduleOptimization { + sequence: self.sequence, + action: ScheduleOptimizationAction::RemoveSecondary(*secondary), + }); + } + } + + // Fall through: we didn't identify one to remove. This ought to be rare. + tracing::warn!("Keeping extra secondaries: can't determine which of {:?} to remove (some nodes offline?)", + self.intent.get_secondary() + ); + } else { + let victim = secondary_scores + .iter() + .max_by_key(|score| score.1.unwrap()) + .unwrap() + .0; + return Some(ScheduleOptimization { + sequence: self.sequence, + action: ScheduleOptimizationAction::RemoveSecondary(victim), + }); + } + } + + let replacement = self.find_better_location::( + scheduler, + &schedule_context, + attached, + &[], // Don't exclude secondaries: our preferred attachment location may be a secondary + ); + + // We have found a candidate and confirmed that its score is preferable + // to our current location. See if we have a secondary location in the preferred location already: if not, + // then create one. + if let Some(replacement) = replacement { + // If we are currently in non-preferred AZ, then the scheduler might suggest a location that is better, but still + // not in our preferred AZ. Migration has a cost in resources an impact to the workload, so we want to avoid doing + // multiple hops where we might go to some other AZ before eventually finding a suitable location in our preferred + // AZ: skip this optimization if it is not in our final, preferred AZ. + // + // This should be a transient state, there should always be capacity eventually in our preferred AZ (even if nodes + // there are too overloaded for scheduler to suggest them, more should be provisioned eventually). + if self.intent.preferred_az_id.is_some() + && scheduler.get_node_az(&replacement) != self.intent.preferred_az_id + { + tracing::debug!( + "Candidate node {replacement} is not in preferred AZ {:?}", + self.intent.preferred_az_id + ); + + // This should only happen if our current location is not in the preferred AZ, otherwise + // [`Self::find_better_location`]` should have rejected any other location outside the preferred Az, because + // AZ is the highest priority part of NodeAttachmentSchedulingScore. + debug_assert!(scheduler.get_node_az(&attached) != self.intent.preferred_az_id); + + return None; + } + + if !self.intent.get_secondary().contains(&replacement) { + Some(ScheduleOptimization { + sequence: self.sequence, + action: ScheduleOptimizationAction::CreateSecondary(replacement), + }) + } else { + // We already have a secondary in the preferred location, let's try migrating to it. Our caller + // will check the warmth of the destination before deciding whether to really execute this. 
+ Some(ScheduleOptimization { + sequence: self.sequence, + action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { + old_attached_node_id: attached, + new_attached_node_id: replacement, + }), + }) + } + } else { + // We didn't find somewhere we'd rather be, and we don't have any excess secondaries + // to clean up: no action required. + None + } } #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] @@ -837,50 +1061,40 @@ impl TenantShard { scheduler: &mut Scheduler, schedule_context: &ScheduleContext, ) -> Option { - if self.intent.secondary.is_empty() { - // We can only do useful work if we have both attached and secondary locations: this - // function doesn't schedule new locations, only swaps between attached and secondaries. + if self.intent.get_secondary().len() > self.policy.want_secondaries() { + // We have extra secondaries, perhaps to facilitate a migration of the attached location: + // do nothing, it is up to [`Self::optimize_attachment`] to clean them up. When that's done, + // and we are called again, we will proceed. + tracing::debug!("Too many secondaries: skipping"); return None; } + let schedule_context = schedule_context.project_detach(self); + for secondary in self.intent.get_secondary() { - let Some(affinity_score) = schedule_context.nodes.get(secondary) else { - // We're already on a node unaffected any affinity constraints, - // so we won't change it. - continue; + // Make sure we don't try to migrate a secondary to our attached location: this case happens + // easily in environments without multiple AZs. + let exclude = match self.intent.attached { + Some(attached) => vec![attached], + None => vec![], }; - // Let the scheduler suggest a node, where it would put us if we were scheduling afresh - // This implicitly limits the choice to nodes that are available, and prefers nodes - // with lower utilization. - let Ok(candidate_node) = scheduler.schedule_shard::( - &self.intent.all_pageservers(), - &self.preferred_az_id, - schedule_context, - ) else { - // A scheduling error means we have no possible candidate replacements - continue; - }; - - let candidate_affinity_score = schedule_context - .nodes - .get(&candidate_node) - .unwrap_or(&AffinityScore::FREE); - - // The best alternative must be more than 1 better than us, otherwise we could end - // up flapping back next time we're called. - if *candidate_affinity_score + AffinityScore(1) < *affinity_score { - // If some other node is available and has a lower score than this node, then - // that other node is a good place to migrate to. - tracing::info!( - "Identified optimization: replace secondary {secondary}->{candidate_node} (current secondaries {:?})", - self.intent.get_secondary() - ); + let replacement = self.find_better_location::( + scheduler, + &schedule_context, + *secondary, + &exclude, + ); + assert!(replacement != Some(*secondary)); + if let Some(replacement) = replacement { + // We have found a candidate and confirmed that its score is preferable + // to our current location. See if we have a secondary location in the preferred location already: if not, + // then create one. 
return Some(ScheduleOptimization { sequence: self.sequence, action: ScheduleOptimizationAction::ReplaceSecondary(ReplaceSecondary { old_node_id: *secondary, - new_node_id: candidate_node, + new_node_id: replacement, }), }); } @@ -921,11 +1135,54 @@ impl TenantShard { self.intent.remove_secondary(scheduler, old_node_id); self.intent.push_secondary(scheduler, new_node_id); } + ScheduleOptimizationAction::CreateSecondary(new_node_id) => { + self.intent.push_secondary(scheduler, new_node_id); + } + ScheduleOptimizationAction::RemoveSecondary(old_secondary) => { + self.intent.remove_secondary(scheduler, old_secondary); + } } true } + /// When a shard has several secondary locations, we need to pick one in situations where + /// we promote one of them to an attached location: + /// - When draining a node for restart + /// - When responding to a node failure + /// + /// In this context, 'preferred' does not mean the node with the best scheduling score: instead + /// we want to pick the node which is best for use _temporarily_ while the previous attached location + /// is unavailable (e.g. because it's down or deploying). That means we prefer to use secondary + /// locations in a non-preferred AZ, as they're more likely to have awarm cache than a temporary + /// secondary in the preferred AZ (which are usually only created for migrations, and if they exist + /// they're probably not warmed up yet). The latter behavior is based oni + /// + /// If the input is empty, or all the nodes are not elegible for scheduling, return None: the + /// caller needs to a pick a node some other way. + pub(crate) fn preferred_secondary(&self, scheduler: &Scheduler) -> Option { + let candidates = scheduler.filter_usable_nodes(&self.intent.secondary); + + // We will sort candidates to prefer nodes which are _not_ in our preferred AZ, i.e. we prefer + // to migrate to a long-lived secondary location (which would have been scheduled in a non-preferred AZ), + // rather than a short-lived secondary location being used for optimization/migration (which would have + // been scheduled in our preferred AZ). + let mut candidates = candidates + .iter() + .map(|(node_id, node_az)| { + if node_az == &self.intent.preferred_az_id { + (1, *node_id) + } else { + (0, *node_id) + } + }) + .collect::>(); + + candidates.sort(); + + candidates.first().map(|i| i.1) + } + /// Query whether the tenant's observed state for attached node matches its intent state, and if so, /// yield the node ID. This is appropriate for emitting compute hook notifications: we are checking that /// the node in question is not only where we intend to attach, but that the tenant is indeed already attached there. 
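Taken together, the new `CreateSecondary` / `RemoveSecondary` variants and the reworked `optimize_attachment` mean that moving an attachment into its preferred AZ is no longer a single swap: it is executed as three separate, individually reconciled optimizations, which is exactly what the `optimize_attachment_multistep` test further down asserts. A compact sketch of the expected sequence, using a simplified stand-in for `ScheduleOptimizationAction` rather than the real type:

```rust
/// Simplified stand-in for ScheduleOptimizationAction, for illustration only.
#[derive(Debug, PartialEq, Eq)]
enum Step {
    CreateSecondary(u64),
    MigrateAttachment { old: u64, new: u64 },
    RemoveSecondary(u64),
}

/// Each optimization pass emits at most one action per shard, so moving an
/// attachment from `old` to `new` (where no secondary exists yet at `new`)
/// takes three passes: warm up a secondary, cut over, then drop the leftover.
fn expected_steps(old: u64, new: u64) -> Vec<Step> {
    vec![
        Step::CreateSecondary(new),
        Step::MigrateAttachment { old, new },
        Step::RemoveSecondary(old),
    ]
}
```

Between the second and third pass the shard briefly holds one more secondary than `PlacementPolicy::Attached(N)` asks for, which is why `optimize_secondary` above now returns early when `get_secondary().len() > want_secondaries()` and leaves the cleanup to `optimize_attachment`.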
@@ -1207,7 +1464,7 @@ impl TenantShard { detach, reconciler_config, config: self.config.clone(), - preferred_az: self.preferred_az_id.clone(), + preferred_az: self.intent.preferred_az_id.clone(), observed: self.observed.clone(), original_observed: self.observed.clone(), compute_hook: compute_hook.clone(), @@ -1428,7 +1685,6 @@ impl TenantShard { pending_compute_notification: false, delayed_reconcile: false, scheduling_policy: serde_json::from_str(&tsp.scheduling_policy).unwrap(), - preferred_az_id: tsp.preferred_az_id.map(AvailabilityZone), }) } @@ -1444,16 +1700,16 @@ impl TenantShard { config: serde_json::to_string(&self.config).unwrap(), splitting: SplitState::default(), scheduling_policy: serde_json::to_string(&self.scheduling_policy).unwrap(), - preferred_az_id: self.preferred_az_id.as_ref().map(|az| az.0.clone()), + preferred_az_id: self.intent.preferred_az_id.as_ref().map(|az| az.0.clone()), } } pub(crate) fn preferred_az(&self) -> Option<&AvailabilityZone> { - self.preferred_az_id.as_ref() + self.intent.preferred_az_id.as_ref() } pub(crate) fn set_preferred_az(&mut self, preferred_az_id: AvailabilityZone) { - self.preferred_az_id = Some(preferred_az_id); + self.intent.preferred_az_id = Some(preferred_az_id); } /// Returns all the nodes to which this tenant shard is attached according to the @@ -1756,65 +2012,90 @@ pub(crate) mod tests { } #[test] - fn optimize_attachment() -> anyhow::Result<()> { - let nodes = make_test_nodes(3, &[]); + /// Simple case: moving attachment to somewhere better where we already have a secondary + fn optimize_attachment_simple() -> anyhow::Result<()> { + let nodes = make_test_nodes( + 3, + &[ + AvailabilityZone("az-a".to_string()), + AvailabilityZone("az-b".to_string()), + AvailabilityZone("az-c".to_string()), + ], + ); let mut scheduler = Scheduler::new(nodes.values()); let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1)); + shard_a.intent.preferred_az_id = Some(AvailabilityZone("az-a".to_string())); let mut shard_b = make_test_tenant_shard(PlacementPolicy::Attached(1)); + shard_b.intent.preferred_az_id = Some(AvailabilityZone("az-a".to_string())); // Initially: both nodes attached on shard 1, and both have secondary locations // on different nodes. 
- shard_a.intent.set_attached(&mut scheduler, Some(NodeId(1))); - shard_a.intent.push_secondary(&mut scheduler, NodeId(2)); + shard_a.intent.set_attached(&mut scheduler, Some(NodeId(2))); + shard_a.intent.push_secondary(&mut scheduler, NodeId(1)); shard_b.intent.set_attached(&mut scheduler, Some(NodeId(1))); - shard_b.intent.push_secondary(&mut scheduler, NodeId(3)); + shard_b.intent.push_secondary(&mut scheduler, NodeId(2)); - let mut schedule_context = ScheduleContext::default(); - schedule_context.avoid(&shard_a.intent.all_pageservers()); - schedule_context.push_attached(shard_a.intent.get_attached().unwrap()); - schedule_context.avoid(&shard_b.intent.all_pageservers()); - schedule_context.push_attached(shard_b.intent.get_attached().unwrap()); + fn make_schedule_context(shard_a: &TenantShard, shard_b: &TenantShard) -> ScheduleContext { + let mut schedule_context = ScheduleContext::default(); + schedule_context.avoid(&shard_a.intent.all_pageservers()); + schedule_context.avoid(&shard_b.intent.all_pageservers()); + schedule_context + } - let optimization_a = shard_a.optimize_attachment(&nodes, &schedule_context); - - // Either shard should recognize that it has the option to switch to a secondary location where there - // would be no other shards from the same tenant, and request to do so. + let schedule_context = make_schedule_context(&shard_a, &shard_b); + let optimization_a = shard_a.optimize_attachment(&mut scheduler, &schedule_context); assert_eq!( optimization_a, Some(ScheduleOptimization { sequence: shard_a.sequence, action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { - old_attached_node_id: NodeId(1), - new_attached_node_id: NodeId(2) + old_attached_node_id: NodeId(2), + new_attached_node_id: NodeId(1) }) }) ); - - // Note that these optimizing two shards in the same tenant with the same ScheduleContext is - // mutually exclusive (the optimization of one invalidates the stats) -- it is the responsibility - // of [`Service::optimize_all`] to avoid trying - // to do optimizations for multiple shards in the same tenant at the same time. Generating - // both optimizations is just done for test purposes - let optimization_b = shard_b.optimize_attachment(&nodes, &schedule_context); - assert_eq!( - optimization_b, - Some(ScheduleOptimization { - sequence: shard_b.sequence, - action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { - old_attached_node_id: NodeId(1), - new_attached_node_id: NodeId(3) - }) - }) - ); - - // Applying these optimizations should result in the end state proposed shard_a.apply_optimization(&mut scheduler, optimization_a.unwrap()); - assert_eq!(shard_a.intent.get_attached(), &Some(NodeId(2))); - assert_eq!(shard_a.intent.get_secondary(), &vec![NodeId(1)]); - shard_b.apply_optimization(&mut scheduler, optimization_b.unwrap()); - assert_eq!(shard_b.intent.get_attached(), &Some(NodeId(3))); - assert_eq!(shard_b.intent.get_secondary(), &vec![NodeId(1)]); + + // // Either shard should recognize that it has the option to switch to a secondary location where there + // // would be no other shards from the same tenant, and request to do so. 
+ // assert_eq!( + // optimization_a_prepare, + // Some(ScheduleOptimization { + // sequence: shard_a.sequence, + // action: ScheduleOptimizationAction::CreateSecondary(NodeId(2)) + // }) + // ); + // shard_a.apply_optimization(&mut scheduler, optimization_a_prepare.unwrap()); + + // let schedule_context = make_schedule_context(&shard_a, &shard_b); + // let optimization_a_migrate = shard_a.optimize_attachment(&mut scheduler, &schedule_context); + // assert_eq!( + // optimization_a_migrate, + // Some(ScheduleOptimization { + // sequence: shard_a.sequence, + // action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { + // old_attached_node_id: NodeId(1), + // new_attached_node_id: NodeId(2) + // }) + // }) + // ); + // shard_a.apply_optimization(&mut scheduler, optimization_a_migrate.unwrap()); + + // let schedule_context = make_schedule_context(&shard_a, &shard_b); + // let optimization_a_cleanup = shard_a.optimize_attachment(&mut scheduler, &schedule_context); + // assert_eq!( + // optimization_a_cleanup, + // Some(ScheduleOptimization { + // sequence: shard_a.sequence, + // action: ScheduleOptimizationAction::RemoveSecondary(NodeId(1)) + // }) + // ); + // shard_a.apply_optimization(&mut scheduler, optimization_a_cleanup.unwrap()); + + // // Shard B should not be moved anywhere, since the pressure on node 1 was relieved by moving shard A + // let schedule_context = make_schedule_context(&shard_a, &shard_b); + // assert_eq!(shard_b.optimize_attachment(&mut scheduler, &schedule_context), None); shard_a.intent.clear(&mut scheduler); shard_b.intent.clear(&mut scheduler); @@ -1822,6 +2103,190 @@ pub(crate) mod tests { Ok(()) } + #[test] + /// Complicated case: moving attachment to somewhere better where we do not have a secondary + /// already, creating one as needed. 
+ fn optimize_attachment_multistep() -> anyhow::Result<()> { + let nodes = make_test_nodes( + 3, + &[ + AvailabilityZone("az-a".to_string()), + AvailabilityZone("az-b".to_string()), + AvailabilityZone("az-c".to_string()), + ], + ); + let mut scheduler = Scheduler::new(nodes.values()); + + // Two shards of a tenant that wants to be in AZ A + let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1)); + shard_a.intent.preferred_az_id = Some(AvailabilityZone("az-a".to_string())); + let mut shard_b = make_test_tenant_shard(PlacementPolicy::Attached(1)); + shard_b.intent.preferred_az_id = Some(AvailabilityZone("az-a".to_string())); + + // Both shards are initially attached in non-home AZ _and_ have secondaries in non-home AZs + shard_a.intent.set_attached(&mut scheduler, Some(NodeId(2))); + shard_a.intent.push_secondary(&mut scheduler, NodeId(3)); + shard_b.intent.set_attached(&mut scheduler, Some(NodeId(3))); + shard_b.intent.push_secondary(&mut scheduler, NodeId(2)); + + fn make_schedule_context(shard_a: &TenantShard, shard_b: &TenantShard) -> ScheduleContext { + let mut schedule_context = ScheduleContext::default(); + schedule_context.avoid(&shard_a.intent.all_pageservers()); + schedule_context.avoid(&shard_b.intent.all_pageservers()); + schedule_context + } + + let schedule_context = make_schedule_context(&shard_a, &shard_b); + let optimization_a_prepare = shard_a.optimize_attachment(&mut scheduler, &schedule_context); + assert_eq!( + optimization_a_prepare, + Some(ScheduleOptimization { + sequence: shard_a.sequence, + action: ScheduleOptimizationAction::CreateSecondary(NodeId(1)) + }) + ); + shard_a.apply_optimization(&mut scheduler, optimization_a_prepare.unwrap()); + + let schedule_context = make_schedule_context(&shard_a, &shard_b); + let optimization_a_migrate = shard_a.optimize_attachment(&mut scheduler, &schedule_context); + assert_eq!( + optimization_a_migrate, + Some(ScheduleOptimization { + sequence: shard_a.sequence, + action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { + old_attached_node_id: NodeId(2), + new_attached_node_id: NodeId(1) + }) + }) + ); + shard_a.apply_optimization(&mut scheduler, optimization_a_migrate.unwrap()); + + let schedule_context = make_schedule_context(&shard_a, &shard_b); + let optimization_a_cleanup = shard_a.optimize_attachment(&mut scheduler, &schedule_context); + assert_eq!( + optimization_a_cleanup, + Some(ScheduleOptimization { + sequence: shard_a.sequence, + action: ScheduleOptimizationAction::RemoveSecondary(NodeId(3)) + }) + ); + shard_a.apply_optimization(&mut scheduler, optimization_a_cleanup.unwrap()); + + // // Shard B should not be moved anywhere, since the pressure on node 1 was relieved by moving shard A + // let schedule_context = make_schedule_context(&shard_a, &shard_b); + // assert_eq!(shard_b.optimize_attachment(&mut scheduler, &schedule_context), None); + + shard_a.intent.clear(&mut scheduler); + shard_b.intent.clear(&mut scheduler); + + Ok(()) + } + + #[test] + /// Check that multi-step migration works when moving to somewhere that is only better by + /// 1 AffinityScore -- this ensures that we don't have a bug like the intermediate secondary + /// counting toward the affinity score such that it prevents the rest of the migration from happening. 
+ fn optimize_attachment_marginal() -> anyhow::Result<()> { + let nodes = make_test_nodes(2, &[]); + let mut scheduler = Scheduler::new(nodes.values()); + + // Multi-sharded tenant, we will craft a situation where affinity + // scores differ only slightly + let mut shards = make_test_tenant(PlacementPolicy::Attached(0), ShardCount::new(4), None); + + // 1 attached on node 1 + shards[0] + .intent + .set_attached(&mut scheduler, Some(NodeId(1))); + // 3 attached on node 2 + shards[1] + .intent + .set_attached(&mut scheduler, Some(NodeId(2))); + shards[2] + .intent + .set_attached(&mut scheduler, Some(NodeId(2))); + shards[3] + .intent + .set_attached(&mut scheduler, Some(NodeId(2))); + + // The scheduler should figure out that we need to: + // - Create a secondary for shard 3 on node 1 + // - Migrate shard 3 to node 1 + // - Remove shard 3's location on node 2 + + fn make_schedule_context(shards: &Vec) -> ScheduleContext { + let mut schedule_context = ScheduleContext::default(); + for shard in shards { + schedule_context.avoid(&shard.intent.all_pageservers()); + } + schedule_context + } + + let schedule_context = make_schedule_context(&shards); + let optimization_a_prepare = + shards[1].optimize_attachment(&mut scheduler, &schedule_context); + assert_eq!( + optimization_a_prepare, + Some(ScheduleOptimization { + sequence: shards[1].sequence, + action: ScheduleOptimizationAction::CreateSecondary(NodeId(1)) + }) + ); + shards[1].apply_optimization(&mut scheduler, optimization_a_prepare.unwrap()); + + let schedule_context = make_schedule_context(&shards); + let optimization_a_migrate = + shards[1].optimize_attachment(&mut scheduler, &schedule_context); + assert_eq!( + optimization_a_migrate, + Some(ScheduleOptimization { + sequence: shards[1].sequence, + action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { + old_attached_node_id: NodeId(2), + new_attached_node_id: NodeId(1) + }) + }) + ); + shards[1].apply_optimization(&mut scheduler, optimization_a_migrate.unwrap()); + + let schedule_context = make_schedule_context(&shards); + let optimization_a_cleanup = + shards[1].optimize_attachment(&mut scheduler, &schedule_context); + assert_eq!( + optimization_a_cleanup, + Some(ScheduleOptimization { + sequence: shards[1].sequence, + action: ScheduleOptimizationAction::RemoveSecondary(NodeId(2)) + }) + ); + shards[1].apply_optimization(&mut scheduler, optimization_a_cleanup.unwrap()); + + // Everything should be stable now + let schedule_context = make_schedule_context(&shards); + assert_eq!( + shards[0].optimize_attachment(&mut scheduler, &schedule_context), + None + ); + assert_eq!( + shards[1].optimize_attachment(&mut scheduler, &schedule_context), + None + ); + assert_eq!( + shards[2].optimize_attachment(&mut scheduler, &schedule_context), + None + ); + assert_eq!( + shards[3].optimize_attachment(&mut scheduler, &schedule_context), + None + ); + + for mut shard in shards { + shard.intent.clear(&mut scheduler); + } + + Ok(()) + } + #[test] fn optimize_secondary() -> anyhow::Result<()> { let nodes = make_test_nodes(4, &[]); @@ -1839,9 +2304,7 @@ pub(crate) mod tests { let mut schedule_context = ScheduleContext::default(); schedule_context.avoid(&shard_a.intent.all_pageservers()); - schedule_context.push_attached(shard_a.intent.get_attached().unwrap()); schedule_context.avoid(&shard_b.intent.all_pageservers()); - schedule_context.push_attached(shard_b.intent.get_attached().unwrap()); let optimization_a = shard_a.optimize_secondary(&mut scheduler, &schedule_context); @@ -1872,7 
+2335,6 @@ pub(crate) mod tests { // called repeatedly in the background. // Returns the applied optimizations fn optimize_til_idle( - nodes: &HashMap, scheduler: &mut Scheduler, shards: &mut [TenantShard], ) -> Vec { @@ -1884,14 +2346,18 @@ pub(crate) mod tests { for shard in shards.iter() { schedule_context.avoid(&shard.intent.all_pageservers()); - if let Some(attached) = shard.intent.get_attached() { - schedule_context.push_attached(*attached); - } } for shard in shards.iter_mut() { - let optimization = shard.optimize_attachment(nodes, &schedule_context); + let optimization = shard.optimize_attachment(scheduler, &schedule_context); + tracing::info!( + "optimize_attachment({})={:?}", + shard.tenant_shard_id, + optimization + ); if let Some(optimization) = optimization { + // Check that maybe_optimizable wouldn't have wrongly claimed this optimization didn't exist + assert!(shard.maybe_optimizable(scheduler, &schedule_context)); optimizations.push(optimization.clone()); shard.apply_optimization(scheduler, optimization); any_changed = true; @@ -1899,7 +2365,15 @@ pub(crate) mod tests { } let optimization = shard.optimize_secondary(scheduler, &schedule_context); + tracing::info!( + "optimize_secondary({})={:?}", + shard.tenant_shard_id, + optimization + ); if let Some(optimization) = optimization { + // Check that maybe_optimizable wouldn't have wrongly claimed this optimization didn't exist + assert!(shard.maybe_optimizable(scheduler, &schedule_context)); + optimizations.push(optimization.clone()); shard.apply_optimization(scheduler, optimization); any_changed = true; @@ -1923,14 +2397,34 @@ pub(crate) mod tests { /// that it converges. #[test] fn optimize_add_nodes() -> anyhow::Result<()> { - let nodes = make_test_nodes(4, &[]); + let nodes = make_test_nodes( + 9, + &[ + // Initial 6 nodes + AvailabilityZone("az-a".to_string()), + AvailabilityZone("az-a".to_string()), + AvailabilityZone("az-b".to_string()), + AvailabilityZone("az-b".to_string()), + AvailabilityZone("az-c".to_string()), + AvailabilityZone("az-c".to_string()), + // Three we will add later + AvailabilityZone("az-a".to_string()), + AvailabilityZone("az-b".to_string()), + AvailabilityZone("az-c".to_string()), + ], + ); - // Only show the scheduler a couple of nodes + // Only show the scheduler two nodes in each AZ to start with let mut scheduler = Scheduler::new([].iter()); - scheduler.node_upsert(nodes.get(&NodeId(1)).unwrap()); - scheduler.node_upsert(nodes.get(&NodeId(2)).unwrap()); + for i in 1..=6 { + scheduler.node_upsert(nodes.get(&NodeId(i)).unwrap()); + } - let mut shards = make_test_tenant(PlacementPolicy::Attached(1), ShardCount::new(4), None); + let mut shards = make_test_tenant( + PlacementPolicy::Attached(1), + ShardCount::new(4), + Some(AvailabilityZone("az-a".to_string())), + ); let mut schedule_context = ScheduleContext::default(); for shard in &mut shards { assert!(shard @@ -1938,30 +2432,50 @@ pub(crate) mod tests { .is_ok()); } - // We should see equal number of locations on the two nodes. - assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 4); + // Initial: attached locations land in the tenant's home AZ. 
+ assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 2); assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 2); - - assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 4); + assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 2); assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 2); - // Add another two nodes: we should see the shards spread out when their optimize - // methods are called - scheduler.node_upsert(nodes.get(&NodeId(3)).unwrap()); - scheduler.node_upsert(nodes.get(&NodeId(4)).unwrap()); - optimize_til_idle(&nodes, &mut scheduler, &mut shards); + // Initial: secondary locations in a remote AZ + assert_eq!(scheduler.get_node_shard_count(NodeId(3)), 1); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(3)), 0); + assert_eq!(scheduler.get_node_shard_count(NodeId(4)), 1); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(4)), 0); + assert_eq!(scheduler.get_node_shard_count(NodeId(5)), 1); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(5)), 0); + assert_eq!(scheduler.get_node_shard_count(NodeId(6)), 1); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(6)), 0); - assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 2); + // Add another three nodes: we should see the shards spread out when their optimize + // methods are called + scheduler.node_upsert(nodes.get(&NodeId(7)).unwrap()); + scheduler.node_upsert(nodes.get(&NodeId(8)).unwrap()); + scheduler.node_upsert(nodes.get(&NodeId(9)).unwrap()); + optimize_til_idle(&mut scheduler, &mut shards); + + // We expect one attached location was moved to the new node in the tenant's home AZ + assert_eq!(scheduler.get_node_shard_count(NodeId(7)), 1); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(7)), 1); + // The original node has one less attached shard + assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 1); assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 1); + // One of the original nodes still has two attachments, since there are an odd number of nodes assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 2); - assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 1); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 2); - assert_eq!(scheduler.get_node_shard_count(NodeId(3)), 2); - assert_eq!(scheduler.get_node_attached_shard_count(NodeId(3)), 1); - - assert_eq!(scheduler.get_node_shard_count(NodeId(4)), 2); - assert_eq!(scheduler.get_node_attached_shard_count(NodeId(4)), 1); + // None of our secondaries moved, since we already had enough nodes for those to be + // scheduled perfectly + assert_eq!(scheduler.get_node_shard_count(NodeId(3)), 1); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(3)), 0); + assert_eq!(scheduler.get_node_shard_count(NodeId(4)), 1); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(4)), 0); + assert_eq!(scheduler.get_node_shard_count(NodeId(5)), 1); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(5)), 0); + assert_eq!(scheduler.get_node_shard_count(NodeId(6)), 1); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(6)), 0); for shard in shards.iter_mut() { shard.intent.clear(&mut scheduler); @@ -2001,10 +2515,10 @@ pub(crate) mod tests { shard.schedule(&mut scheduler, context).unwrap(); } - let applied_to_a = optimize_til_idle(&nodes, &mut scheduler, &mut a); + let applied_to_a = optimize_til_idle(&mut scheduler, &mut a); assert_eq!(applied_to_a, vec![]); - let applied_to_b = optimize_til_idle(&nodes, &mut scheduler, &mut b); + let 
applied_to_b = optimize_til_idle(&mut scheduler, &mut b); assert_eq!(applied_to_b, vec![]); for shard in a.iter_mut().chain(b.iter_mut()) { diff --git a/test_runner/performance/test_sharding_autosplit.py b/test_runner/performance/test_sharding_autosplit.py index caa89955e3..76c3ad01a4 100644 --- a/test_runner/performance/test_sharding_autosplit.py +++ b/test_runner/performance/test_sharding_autosplit.py @@ -2,6 +2,7 @@ from __future__ import annotations import concurrent.futures import re +import threading from pathlib import Path import pytest @@ -188,7 +189,20 @@ def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): check_pgbench_output(out_path) - with concurrent.futures.ThreadPoolExecutor(max_workers=tenant_count) as pgbench_threads: + stop_pump = threading.Event() + + def pump_controller(): + # Run a background loop to force the storage controller to run its + # background work faster than it otherwise would: this helps + # us: + # A) to create a test that runs in a shorter time + # B) to create a test that is more intensive by doing the shard migrations + # after splits happen more rapidly. + while not stop_pump.is_set(): + env.storage_controller.reconcile_all() + stop_pump.wait(0.1) + + with concurrent.futures.ThreadPoolExecutor(max_workers=tenant_count + 1) as pgbench_threads: pgbench_futs = [] for tenant_state in tenants.values(): fut = pgbench_threads.submit(run_pgbench_init, tenant_state.endpoint) @@ -198,6 +212,8 @@ def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): for fut in pgbench_futs: fut.result() + pump_fut = pgbench_threads.submit(pump_controller) + pgbench_futs = [] for tenant_state in tenants.values(): fut = pgbench_threads.submit(run_pgbench_main, tenant_state.endpoint) @@ -207,6 +223,9 @@ def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): for fut in pgbench_futs: fut.result() + stop_pump.set() + pump_fut.result() + def assert_all_split(): for tenant_id in tenants.keys(): shards = tenant_get_shards(env, tenant_id) diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py index 49f41483ec..d45db28c78 100644 --- a/test_runner/performance/test_storage_controller_scale.py +++ b/test_runner/performance/test_storage_controller_scale.py @@ -13,11 +13,13 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, + NeonPageserver, PageserverAvailability, PageserverSchedulingPolicy, ) from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient from fixtures.pg_version import PgVersion +from fixtures.utils import wait_until def get_consistent_node_shard_counts(env: NeonEnv, total_shards) -> defaultdict[str, int]: @@ -85,8 +87,12 @@ def test_storage_controller_many_tenants( ) AZS = ["alpha", "bravo", "charlie"] + + def az_selector(node_id): + return f"az-{AZS[(node_id - 1) % len(AZS)]}" + neon_env_builder.pageserver_config_override = lambda ps_cfg: ps_cfg.update( - {"availability_zone": f"az-{AZS[ps_cfg['id'] % len(AZS)]}"} + {"availability_zone": az_selector(ps_cfg["id"])} ) # A small sleep on each call into the notify hook, to simulate the latency of doing a database write @@ -168,6 +174,31 @@ def test_storage_controller_many_tenants( log.info(f"Resident memory: {rss} ({ rss / total_shards} per shard)") assert rss < expect_memory_per_shard * total_shards + def assert_all_tenants_scheduled_in_home_az(): + for tenant_id in tenant_ids: + desc = 
env.storage_controller.tenant_describe(tenant_id) + preferred_az = None + for shard in desc["shards"]: + # All shards in a tenant should have the same preferred AZ + if preferred_az is None: + preferred_az = shard["preferred_az_id"] + else: + assert preferred_az == shard["preferred_az_id"] + + # Attachment should be in the preferred AZ + assert shard["preferred_az_id"] == az_selector( + shard["node_attached"] + ), f"Shard {shard['tenant_shard_id']} not in {shard['preferred_az_id']}" + + # Secondary locations should not be in the preferred AZ + for node_secondary in shard["node_secondary"]: + assert ( + shard["preferred_az_id"] != az_selector(node_secondary) + ), f"Shard {shard['tenant_shard_id']} secondary should be in {shard['preferred_az_id']}" + + # There should only be one secondary location (i.e. no migrations in flight) + assert len(shard["node_secondary"]) == 1 + # Issue more concurrent operations than the storage controller's reconciler concurrency semaphore # permits, to ensure that we are exercising stressing that. api_concurrency = 135 @@ -242,6 +273,22 @@ def test_storage_controller_many_tenants( f"Created {len(tenants_with_timelines)} timelines in {time.time() - t1}, {len(tenants_with_timelines) / (time.time() - t1)}/s" ) + # Check initial scheduling + assert_all_tenants_scheduled_in_home_az() + az_attached_counts: defaultdict[str, int] = defaultdict(int) + az_secondary_counts: defaultdict[str, int] = defaultdict(int) + node_attached_counts: defaultdict[str, int] = defaultdict(int) + for tenant_id in tenants.keys(): + desc = env.storage_controller.tenant_describe(tenant_id) + for shard in desc["shards"]: + az_attached_counts[az_selector(shard["node_attached"])] += 1 + node_attached_counts[shard["node_attached"]] += 1 + for node_secondary in shard["node_secondary"]: + az_secondary_counts[az_selector(node_secondary)] += 1 + + log.info(f"Initial node attached counts: {node_attached_counts}") + log.info(f"Initial AZ shard counts: {az_attached_counts}, {az_secondary_counts}") + # Plan operations: ensure each tenant with a timeline gets at least # one of each operation type. Then add other tenants to make up the # numbers. @@ -450,11 +497,77 @@ def test_storage_controller_many_tenants( env.storage_controller.reconcile_until_idle(max_interval=0.1, timeout_secs=120) env.storage_controller.consistency_check() + # Since we did `reconcile_until_idle` during the above loop, the system should be left in + # an optimally scheduled state. Validate that this includes all the tenants being scheduled + # in their home AZ. + assert_all_tenants_scheduled_in_home_az() + # Consistency check is safe here: restarting pageservers should not have caused any Reconcilers to spawn, # as they were not offline long enough to trigger any scheduling changes. 
env.storage_controller.consistency_check() check_memory() + # Simulate loss of an AZ + victim_az = "az-alpha" + killed_pageservers = [] + for ps in env.pageservers: + if az_selector(ps.id) == victim_az: + ps.stop(immediate=True) + killed_pageservers.append(ps) + log.info(f"Killed pageserver {ps.id}") + + assert killed_pageservers + + # Wait for the controller to notice the pageservers are dead + def assert_pageservers_availability( + pageservers: list[NeonPageserver], expected_availability: PageserverAvailability + ): + nodes = env.storage_controller.nodes() + checked_any = False + node_ids = [ps.id for ps in pageservers] + for node in nodes: + if node["id"] in node_ids: + checked_any = True + assert ( + node["availability"] == expected_availability + ), f"Node {node['id']} is not {expected_availability} yet: {node['availability']}" + + assert checked_any + + wait_until( + lambda: assert_pageservers_availability(killed_pageservers, PageserverAvailability.OFFLINE), + timeout=60, + ) + + # Let the controller finish all its rescheduling + env.storage_controller.reconcile_until_idle(max_interval=0.1, timeout_secs=120) + + # Check that all the tenants are rescheduled to the remaining pageservers + for tenant_id in tenant_ids: + desc = env.storage_controller.tenant_describe(tenant_id) + for shard in desc["shards"]: + # Attachment should be outside the AZ where we killed the pageservers + assert ( + az_selector(shard["node_attached"]) != victim_az + ), f"Shard {shard['tenant_shard_id']} still in {victim_az} (node {shard['node_attached']})" + + # Bring back the pageservers + for ps in killed_pageservers: + ps.start() + + wait_until( + lambda: assert_pageservers_availability(killed_pageservers, PageserverAvailability.ACTIVE), + timeout=60, + ) + + # A very long timeout is required: we will be migrating all the tenants on all the pageservers + # in the region that we just restored. Assume it'll take up to twice as long as it took to fill + # a single node + env.storage_controller.reconcile_until_idle( + max_interval=0.1, timeout_secs=DRAIN_FILL_TIMEOUT * 4 + ) + assert_all_tenants_scheduled_in_home_az() + # Stop the storage controller before tearing down fixtures, because it otherwise might log # errors trying to call our `ComputeReconfigure`. env.storage_controller.stop() diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 673904a1cd..86a6b7428b 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -520,14 +520,18 @@ def test_sharding_split_smoke( shard_count = 2 # Shard count we split into split_shard_count = 4 - # We will have 2 shards per pageserver once done (including secondaries) - neon_env_builder.num_pageservers = split_shard_count + # In preferred AZ & other AZ we will end up with one shard per pageserver + neon_env_builder.num_pageservers = split_shard_count * 2 # Two AZs def assign_az(ps_cfg): az = f"az-{(ps_cfg['id'] - 1) % 2}" ps_cfg["availability_zone"] = az + # We will run more pageservers than tests usually do, so give them tiny page caches + # in case we're on a test node under memory pressure. 
+ ps_cfg["page_cache_size"] = 128 + neon_env_builder.pageserver_config_override = assign_az # 1MiB stripes: enable getting some meaningful data distribution without @@ -679,8 +683,8 @@ def test_sharding_split_smoke( # - shard_count reconciles for the original setup of the tenant # - shard_count reconciles for detaching the original secondary locations during split # - split_shard_count reconciles during shard splitting, for setting up secondaries. - # - split_shard_count/2 of the child shards will need to fail over to their secondaries (since we have 8 shards and 4 pageservers, only 4 will move) - expect_reconciles = shard_count * 2 + split_shard_count + split_shard_count / 2 + # - split_shard_count/2 reconciles to migrate shards to their temporary secondaries + expect_reconciles = shard_count * 2 + split_shard_count + 3 * (split_shard_count / 2) reconcile_ok = env.storage_controller.get_metric_value( "storage_controller_reconcile_complete_total", filter={"status": "ok"} @@ -745,10 +749,14 @@ def test_sharding_split_smoke( # dominated by shard count. log.info(f"total: {total}") assert total == { - 1: 2, - 2: 2, - 3: 2, - 4: 2, + 1: 1, + 2: 1, + 3: 1, + 4: 1, + 5: 1, + 6: 1, + 7: 1, + 8: 1, } # The controller is not required to lay out the attached locations in any particular way, but @@ -1387,13 +1395,7 @@ def test_sharding_split_failures( else: attached_count += 1 - if exclude_ps_id is not None: - # For a node failure case, we expect there to be a secondary location - # scheduled on the offline node, so expect one fewer secondary in total - assert secondary_count == initial_shard_count - 1 - else: - assert secondary_count == initial_shard_count - + assert secondary_count == initial_shard_count assert attached_count == initial_shard_count def assert_split_done(exclude_ps_id: int | None = None) -> None: diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 3a55e75589..8ffb6ba6b2 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -3213,11 +3213,12 @@ def eq_safekeeper_records(a: dict[str, Any], b: dict[str, Any]) -> bool: @run_only_on_default_postgres("this is like a 'unit test' against storcon db") def test_shard_preferred_azs(neon_env_builder: NeonEnvBuilder): def assign_az(ps_cfg): - az = f"az-{ps_cfg['id']}" + az = f"az-{ps_cfg['id'] % 2}" + log.info("Assigned AZ {az}") ps_cfg["availability_zone"] = az neon_env_builder.pageserver_config_override = assign_az - neon_env_builder.num_pageservers = 2 + neon_env_builder.num_pageservers = 4 env = neon_env_builder.init_configs() env.start() @@ -3232,8 +3233,14 @@ def test_shard_preferred_azs(neon_env_builder: NeonEnvBuilder): assert shards[0]["preferred_az_id"] == expected_az + # When all other schedule scoring parameters are equal, tenants should round-robin on AZs + assert env.storage_controller.tenant_describe(tids[0])["shards"][0]["preferred_az_id"] == "az-0" + assert env.storage_controller.tenant_describe(tids[1])["shards"][0]["preferred_az_id"] == "az-1" + assert env.storage_controller.tenant_describe(tids[2])["shards"][0]["preferred_az_id"] == "az-0" + + # Try modifying preferred AZ updated = env.storage_controller.set_preferred_azs( - {TenantShardId(tid, 0, 0): "foo" for tid in tids} + {TenantShardId(tid, 0, 0): "az-0" for tid in tids} ) assert set(updated) == set([TenantShardId(tid, 0, 0) for tid in tids]) @@ -3241,29 +3248,24 @@ def test_shard_preferred_azs(neon_env_builder: NeonEnvBuilder): for tid in tids: 
shards = env.storage_controller.tenant_describe(tid)["shards"] assert len(shards) == 1 - assert shards[0]["preferred_az_id"] == "foo" + assert shards[0]["preferred_az_id"] == "az-0" - # Generate a layer to avoid shard split handling on ps from tripping - # up on debug assert. - timeline_id = TimelineId.generate() - env.create_timeline("bar", tids[0], timeline_id) - - workload = Workload(env, tids[0], timeline_id, branch_name="bar") - workload.init() - workload.write_rows(256) - workload.validate() + # Having modified preferred AZ, we should get moved there + env.storage_controller.reconcile_until_idle(max_interval=0.1) + for tid in tids: + shard = env.storage_controller.tenant_describe(tid)["shards"][0] + attached_to = shard["node_attached"] + attached_in_az = env.get_pageserver(attached_to).az_id + assert shard["preferred_az_id"] == attached_in_az == "az-0" env.storage_controller.tenant_shard_split(tids[0], shard_count=2) + env.storage_controller.reconcile_until_idle(max_interval=0.1) shards = env.storage_controller.tenant_describe(tids[0])["shards"] assert len(shards) == 2 for shard in shards: attached_to = shard["node_attached"] - expected_az = env.get_pageserver(attached_to).az_id - - # The scheduling optimization logic is not yet AZ-aware, so doesn't succeed - # in putting the tenant shards in the preferred AZ. - # To be fixed in https://github.com/neondatabase/neon/pull/9916 - # assert shard["preferred_az_id"] == expected_az + attached_in_az = env.get_pageserver(attached_to).az_id + assert shard["preferred_az_id"] == attached_in_az == "az-0" @run_only_on_default_postgres("Postgres version makes no difference here") From 1783501eaa12519cd94c174114d2e79381ffee2d Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 13 Jan 2025 22:01:03 +0200 Subject: [PATCH 176/583] Increase max connection for replica to prevent test flukyness (#10306) ## Problem See https://github.com/neondatabase/neon/issues/10167 Too small number of `max_connections` (2) can cause failures of test_physical_replication_config_mismatch_too_many_known_xids test ## Summary of changes Increase `max_connections` to 5 Co-authored-by: Konstantin Knizhnik --- test_runner/regress/test_physical_replication.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_runner/regress/test_physical_replication.py b/test_runner/regress/test_physical_replication.py index 6cb11b825d..17819fd367 100644 --- a/test_runner/regress/test_physical_replication.py +++ b/test_runner/regress/test_physical_replication.py @@ -187,7 +187,7 @@ def test_physical_replication_config_mismatch_too_many_known_xids(neon_simple_en origin=primary, endpoint_id="secondary", config_lines=[ - "max_connections=2", + "max_connections=5", "autovacuum_max_workers=1", "max_worker_processes=5", "max_wal_senders=1", From 430b556b3472007341f57af57ebdbafb67e6dc85 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Mon, 13 Jan 2025 18:44:39 -0600 Subject: [PATCH 177/583] Update postgres-exporter and sql_exporter in computes (#10349) The postgres-exporter was much further out of date, but let's just bump both. 
Signed-off-by: Tristan Partin --- build-tools.Dockerfile | 2 +- compute/compute-node.Dockerfile | 4 ++-- test_runner/regress/test_compute_metrics.py | 9 +++------ 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile index cf6439d004..7a2ec9c43e 100644 --- a/build-tools.Dockerfile +++ b/build-tools.Dockerfile @@ -115,7 +115,7 @@ RUN set -e \ # Keep the version the same as in compute/compute-node.Dockerfile and # test_runner/regress/test_compute_metrics.py. -ENV SQL_EXPORTER_VERSION=0.16.0 +ENV SQL_EXPORTER_VERSION=0.17.0 RUN curl -fsSL \ "https://github.com/burningalchemist/sql_exporter/releases/download/${SQL_EXPORTER_VERSION}/sql_exporter-${SQL_EXPORTER_VERSION}.linux-$(case "$(uname -m)" in x86_64) echo amd64;; aarch64) echo arm64;; esac).tar.gz" \ --output sql_exporter.tar.gz \ diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 2d38796d77..89cee6761f 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1274,11 +1274,11 @@ RUN set -e \ # ######################################################################################### -FROM quay.io/prometheuscommunity/postgres-exporter:v0.12.1 AS postgres-exporter +FROM quay.io/prometheuscommunity/postgres-exporter:v0.16.0 AS postgres-exporter # Keep the version the same as in build-tools.Dockerfile and # test_runner/regress/test_compute_metrics.py. -FROM burningalchemist/sql_exporter:0.16.0 AS sql-exporter +FROM burningalchemist/sql_exporter:0.17.0 AS sql-exporter ######################################################################################### # diff --git a/test_runner/regress/test_compute_metrics.py b/test_runner/regress/test_compute_metrics.py index 71963355b7..5dcc93acff 100644 --- a/test_runner/regress/test_compute_metrics.py +++ b/test_runner/regress/test_compute_metrics.py @@ -219,7 +219,7 @@ if SQL_EXPORTER is None: # # The "host" network mode allows sql_exporter to talk to the # endpoint which is running on the host. - super().__init__("docker.io/burningalchemist/sql_exporter:0.16.0", network_mode="host") + super().__init__("docker.io/burningalchemist/sql_exporter:0.17.0", network_mode="host") self.__logs_dir = logs_dir self.__port = port @@ -252,7 +252,7 @@ if SQL_EXPORTER is None: log.info("Waiting for sql_exporter to be ready") wait_for_logs( self, - rf'level=info msg="Listening on" address=\[::\]:{self.__port}', + rf'msg="Listening on" address=\[::\]:{self.__port}', timeout=5, ) @@ -344,10 +344,7 @@ else: time.sleep(0.5) continue - if ( - f'level=info msg="Listening on" address=[::]:{self._sql_exporter_port}' - in line - ): + if f'msg="Listening on" address=[::]:{self._sql_exporter_port}' in line: break @override From a039f8381f83547dec0d774b3598b8e4b75abf3a Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 14 Jan 2025 07:54:30 +0200 Subject: [PATCH 178/583] Optimize vector get last written LSN (#10360) ## Problem See https://github.com/neondatabase/neon/issues/10281 pg17 performs extra lock/unlock operation when fetching LwLSN. ## Summary of changes Perform all lookups under one lock, moving initialization of not found keys to separate loop. 
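For illustration, a minimal sketch of the pattern in Rust (hypothetical names; the real change is C code in the `vendor/postgres-v17` submodule referenced below): perform every lookup under a single lock acquisition, remember which keys were missing, and initialize those in a second loop instead of re-taking the lock per key.

```rust
// Illustrative sketch of the "one lock, two passes" pattern; not the actual
// Postgres change, which lives in the vendor/postgres-v17 submodule.
use std::collections::HashMap;
use std::sync::Mutex;

type Key = u64;
type Lsn = u64;

fn get_last_written_lsn_v(
    cache: &Mutex<HashMap<Key, Lsn>>,
    keys: &[Key],
    fallback_lsn: Lsn,
) -> Vec<Lsn> {
    let mut cache = cache.lock().unwrap();
    let mut result = Vec::with_capacity(keys.len());
    let mut not_found = Vec::new();

    // Pass 1: all lookups under a single lock acquisition instead of
    // locking/unlocking once per key.
    for &key in keys {
        match cache.get(&key) {
            Some(&lsn) => result.push(lsn),
            None => {
                result.push(fallback_lsn);
                not_found.push(key);
            }
        }
    }

    // Pass 2: initialize the keys that were not found, still holding the
    // same lock, in a separate loop.
    for key in not_found {
        cache.insert(key, fallback_lsn);
    }

    result
}
```

The point is simply to avoid one lock/unlock round trip per requested key on the vectored get-last-written-LSN path.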
Related Postgres PR: https://github.com/neondatabase/postgres/pull/553 --------- Co-authored-by: Konstantin Knizhnik --- vendor/postgres-v17 | 2 +- vendor/revisions.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index 9c9e9a78a9..0f8da73ed0 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit 9c9e9a78a93aebec2f6a2f54644442d35ffa245c +Subproject commit 0f8da73ed08d4fc4ee58cccea008c75bfb20baa8 diff --git a/vendor/revisions.json b/vendor/revisions.json index d182b88008..b4d57ab709 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,7 +1,7 @@ { "v17": [ "17.2", - "9c9e9a78a93aebec2f6a2f54644442d35ffa245c" + "0f8da73ed08d4fc4ee58cccea008c75bfb20baa8" ], "v16": [ "16.6", From df4abd8b14ff823f060f66acde34713da013d062 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Tue, 14 Jan 2025 12:53:32 +0000 Subject: [PATCH 179/583] fix: force-refresh azure identity token (#10378) ## Problem Because of https://github.com/Azure/azure-sdk-for-rust/issues/1739, our identity token file was not being refreshed. This caused our uploads to start failing when the storage token expired. ## Summary of changes Drop and recreate the remote storage config every time we upload in order to force reload the identity token file. --- proxy/src/context/parquet.rs | 73 +++++++++++++++++++++++------------- 1 file changed, 46 insertions(+), 27 deletions(-) diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index 5f65b17374..d7ffff0483 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -187,10 +187,6 @@ pub async fn worker( let rx = futures::stream::poll_fn(move |cx| rx.poll_recv(cx)); let rx = rx.map(RequestData::from); - let storage = GenericRemoteStorage::from_config(&remote_storage_config) - .await - .context("remote storage init")?; - let properties = WriterProperties::builder() .set_data_page_size_limit(config.parquet_upload_page_size) .set_compression(config.parquet_upload_compression); @@ -224,18 +220,18 @@ pub async fn worker( let rx_disconnect = futures::stream::poll_fn(move |cx| rx_disconnect.poll_recv(cx)); let rx_disconnect = rx_disconnect.map(RequestData::from); - let storage_disconnect = - GenericRemoteStorage::from_config(&disconnect_events_storage_config) - .await - .context("remote storage for disconnect events init")?; let parquet_config_disconnect = parquet_config.clone(); tokio::try_join!( - worker_inner(storage, rx, parquet_config), - worker_inner(storage_disconnect, rx_disconnect, parquet_config_disconnect) + worker_inner(remote_storage_config, rx, parquet_config), + worker_inner( + disconnect_events_storage_config, + rx_disconnect, + parquet_config_disconnect + ) ) .map(|_| ()) } else { - worker_inner(storage, rx, parquet_config).await + worker_inner(remote_storage_config, rx, parquet_config).await } } @@ -251,18 +247,32 @@ struct ParquetConfig { test_remote_failures: u64, } +impl ParquetConfig { + async fn storage( + &self, + storage_config: &RemoteStorageConfig, + ) -> anyhow::Result { + let storage = GenericRemoteStorage::from_config(storage_config) + .await + .context("remote storage init")?; + + #[cfg(any(test, feature = "testing"))] + if self.test_remote_failures > 0 { + return Ok(GenericRemoteStorage::unreliable_wrapper( + storage, + self.test_remote_failures, + )); + } + + Ok(storage) + } +} + async fn worker_inner( - storage: GenericRemoteStorage, + storage_config: RemoteStorageConfig, rx: impl Stream, config: ParquetConfig, 
) -> anyhow::Result<()> { - #[cfg(any(test, feature = "testing"))] - let storage = if config.test_remote_failures > 0 { - GenericRemoteStorage::unreliable_wrapper(storage, config.test_remote_failures) - } else { - storage - }; - let mut rx = std::pin::pin!(rx); let mut rows = Vec::with_capacity(config.rows_per_group); @@ -285,7 +295,7 @@ async fn worker_inner( } if len > config.file_size || force { last_upload = time::Instant::now(); - let file = upload_parquet(w, len, &storage).await?; + let file = upload_parquet(w, len, &storage_config, &config).await?; w = SerializedFileWriter::new(file, schema.clone(), config.propeties.clone())?; len = 0; } @@ -298,7 +308,7 @@ async fn worker_inner( } if !w.flushed_row_groups().is_empty() { - let _rtchk: Writer = upload_parquet(w, len, &storage).await?; + let _rtchk: Writer = upload_parquet(w, len, &storage_config, &config).await?; } Ok(()) @@ -340,7 +350,8 @@ where async fn upload_parquet( mut w: SerializedFileWriter>, len: i64, - storage: &GenericRemoteStorage, + storage_config: &RemoteStorageConfig, + config: &ParquetConfig, ) -> anyhow::Result> { let len_uncompressed = w .flushed_row_groups() @@ -377,6 +388,15 @@ async fn upload_parquet( size, compression, "uploading request parquet file" ); + // A bug in azure-sdk means that the identity-token-file that expires after + // 1 hour is not refreshed. This identity-token is used to fetch the actual azure storage + // tokens that last for 24 hours. After this 24 hour period, azure-sdk tries to refresh + // the storage token, but the identity token has now expired. + // + // + // To work around this, we recreate the storage every time. + let storage = config.storage(storage_config).await?; + let year = now.year(); let month = now.month(); let day = now.day(); @@ -431,8 +451,8 @@ mod tests { use rand::rngs::StdRng; use rand::{Rng, SeedableRng}; use remote_storage::{ - GenericRemoteStorage, RemoteStorageConfig, RemoteStorageKind, S3Config, - DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT, + RemoteStorageConfig, RemoteStorageKind, S3Config, DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, + DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT, }; use tokio::sync::mpsc; use tokio::time; @@ -559,12 +579,11 @@ mod tests { timeout: std::time::Duration::from_secs(120), small_timeout: std::time::Duration::from_secs(30), }; - let storage = GenericRemoteStorage::from_config(&remote_storage_config) + + worker_inner(remote_storage_config, rx, config) .await .unwrap(); - worker_inner(storage, rx, config).await.unwrap(); - let mut files = WalkDir::new(tmpdir.as_std_path()) .into_iter() .filter_map(|entry| entry.ok()) From 9bdb14c1c0b3ef1082ce68f1c54d4547393da362 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Tue, 14 Jan 2025 10:27:48 -0500 Subject: [PATCH 180/583] fix(pageserver): ensure initial image layers have correct key ranges (#10374) ## Problem Discovered during the relation dir refactor work. If we do not create images as in this patch, we would get two set of image layers: ``` 0000...METADATA_KEYS 0000...REL_KEYS ``` They overlap at the same LSN and would cause data loss for relation keys. This doesn't happen in prod because initial image layer generation is never called, but better to be fixed to avoid future issues with the reldir refactors. ## Summary of changes * Consolidate create_image_layers call into a single one. 
--------- Signed-off-by: Alex Chi Z --- pageserver/src/tenant/timeline.rs | 43 +++++++++++++++---------------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index f7227efeba..741b214a73 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3781,36 +3781,35 @@ impl Timeline { return Err(FlushLayerError::Cancelled); } - let mut layers_to_upload = Vec::new(); - layers_to_upload.extend( - self.create_image_layers( - &rel_partition, - self.initdb_lsn, - ImageLayerCreationMode::Initial, - ctx, - ) - .await?, - ); + // Ensure that we have a single call to `create_image_layers` with a combined dense keyspace. + // So that the key ranges don't overlap. + let mut partitions = KeyPartitioning::default(); + partitions.parts.extend(rel_partition.parts); if !metadata_partition.parts.is_empty() { assert_eq!( metadata_partition.parts.len(), 1, "currently sparse keyspace should only contain a single metadata keyspace" ); - layers_to_upload.extend( - self.create_image_layers( - // Safety: create_image_layers treat sparse keyspaces differently that it does not scan - // every single key within the keyspace, and therefore, it's safe to force converting it - // into a dense keyspace before calling this function. - &metadata_partition.into_dense(), - self.initdb_lsn, - ImageLayerCreationMode::Initial, - ctx, - ) - .await?, - ); + // Safety: create_image_layers treat sparse keyspaces differently that it does not scan + // every single key within the keyspace, and therefore, it's safe to force converting it + // into a dense keyspace before calling this function. + partitions + .parts + .extend(metadata_partition.into_dense().parts); } + let mut layers_to_upload = Vec::new(); + layers_to_upload.extend( + self.create_image_layers( + &partitions, + self.initdb_lsn, + ImageLayerCreationMode::Initial, + ctx, + ) + .await?, + ); + (layers_to_upload, None) } else { // Normal case, write out a L0 delta layer file. From 2466a2f97763aff5b84253a242909fc19bc995a2 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 14 Jan 2025 16:28:01 +0100 Subject: [PATCH 181/583] page_service: throttle individual requests instead of the batched request (#10353) ## Problem Before this PR, the pagestream throttle was applied weighted on a per-batch basis. This had several problems: 1. The throttle occurence counters were only bumped by `1` instead of `batch_size`. 2. The throttle wait time aggregator metric only counted one wait time, irrespective of `batch_size`. That makes sense in some ways of looking at it but not in others. 3. If the last request in the batch runs into the throttle, the other requests in the batch are also throttled, i.e., over-throttling happens (theoretical, didn't measure it in practice). ## Solution It occured to me that we can simply push the throttling upwards into `pagestream_read_message`. This has the added benefit that in pipeline mode, the `executor` stage will, if it is idle, steal whatever requests already made it into the `spsc_fold` and execute them; before this change, that was not the case - the throttling happened in the `executor` stage instead of the `batcher` stage. ## Code Changes There are two changes in this PR: 1. Lifting up the throttling into the `pagestream_read_message` method. 2. Move the throttling metrics out of the `Throttle` type into `SmgrOpMetrics`. Unlike the other smgr metrics, throttling is per-tenant, hence the Arc. 3. 
Refactor the `SmgrOpTimer` implementation to account for the new observation states, and simplify its design. 4. Drive-by-fix flush time metrics. It was using the same `now` in the `observe_guard` every time. The `SmgrOpTimer` is now a state machine. Each observation point moves the state machine forward. If a timer object is dropped early some "pair"-like metrics still require an increment or observation. That's done in the Drop implementation, by driving the state machine to completion. --- pageserver/src/metrics.rs | 367 +++++++++++------------ pageserver/src/page_service.rs | 139 +++++---- pageserver/src/tenant.rs | 14 +- pageserver/src/tenant/throttle.rs | 45 +-- pageserver/src/tenant/timeline.rs | 8 +- pageserver/src/tenant/timeline/delete.rs | 1 + 6 files changed, 284 insertions(+), 290 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 5b8419fda9..5b1cbbad63 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1224,117 +1224,189 @@ pub(crate) struct SmgrOpTimerInner { global_flush_in_progress_micros: IntCounter, per_timeline_flush_in_progress_micros: IntCounter, + throttling: Arc, + timings: SmgrOpTimerState, } +/// The stages of request processing are represented by the enum variants. +/// Used as part of [`SmgrOpTimerInner::timings`]. +/// +/// Request processing calls into the `SmgrOpTimer::observe_*` methods at the +/// transition points. +/// These methods bump relevant counters and then update [`SmgrOpTimerInner::timings`] +/// to the next state. +/// +/// Each request goes through every stage, in all configurations. +/// #[derive(Debug)] enum SmgrOpTimerState { Received { + // In the future, we may want to track the full time the request spent + // inside pageserver process (time spent in kernel buffers can't be tracked). + // `received_at` would be used for that. + #[allow(dead_code)] received_at: Instant, }, - ThrottleDoneExecutionStarting { - received_at: Instant, + Throttling { throttle_started_at: Instant, - started_execution_at: Instant, }, + Batching { + throttle_done_at: Instant, + }, + Executing { + execution_started_at: Instant, + }, + Flushing, + // NB: when adding observation points, remember to update the Drop impl. } +// NB: when adding observation points, remember to update the Drop impl. +impl SmgrOpTimer { + /// See [`SmgrOpTimerState`] for more context. + pub(crate) fn observe_throttle_start(&mut self, at: Instant) { + let Some(inner) = self.0.as_mut() else { + return; + }; + let SmgrOpTimerState::Received { received_at: _ } = &mut inner.timings else { + return; + }; + inner.throttling.count_accounted_start.inc(); + inner.timings = SmgrOpTimerState::Throttling { + throttle_started_at: at, + }; + } + + /// See [`SmgrOpTimerState`] for more context. 
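+    /// Transitions `Throttling` -> `Batching`; if the request was actually throttled, the throttle count and wait-time metrics are bumped as well.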
+ pub(crate) fn observe_throttle_done(&mut self, throttle: ThrottleResult) { + let Some(inner) = self.0.as_mut() else { + return; + }; + let SmgrOpTimerState::Throttling { + throttle_started_at, + } = &inner.timings + else { + return; + }; + inner.throttling.count_accounted_finish.inc(); + match throttle { + ThrottleResult::NotThrottled { end } => { + inner.timings = SmgrOpTimerState::Batching { + throttle_done_at: end, + }; + } + ThrottleResult::Throttled { end } => { + // update metrics + inner.throttling.count_throttled.inc(); + inner + .throttling + .wait_time + .inc_by((end - *throttle_started_at).as_micros().try_into().unwrap()); + // state transition + inner.timings = SmgrOpTimerState::Batching { + throttle_done_at: end, + }; + } + } + } + + /// See [`SmgrOpTimerState`] for more context. + pub(crate) fn observe_execution_start(&mut self, at: Instant) { + let Some(inner) = self.0.as_mut() else { + return; + }; + let SmgrOpTimerState::Batching { throttle_done_at } = &inner.timings else { + return; + }; + // update metrics + let batch = at - *throttle_done_at; + inner.global_batch_wait_time.observe(batch.as_secs_f64()); + inner + .per_timeline_batch_wait_time + .observe(batch.as_secs_f64()); + // state transition + inner.timings = SmgrOpTimerState::Executing { + execution_started_at: at, + } + } + + /// For all but the first caller, this is a no-op. + /// The first callers receives Some, subsequent ones None. + /// + /// See [`SmgrOpTimerState`] for more context. + pub(crate) fn observe_execution_end_flush_start( + &mut self, + at: Instant, + ) -> Option { + // NB: unlike the other observe_* methods, this one take()s. + #[allow(clippy::question_mark)] // maintain similar code pattern. + let Some(mut inner) = self.0.take() else { + return None; + }; + let SmgrOpTimerState::Executing { + execution_started_at, + } = &inner.timings + else { + return None; + }; + // update metrics + let execution = at - *execution_started_at; + inner + .global_execution_latency_histo + .observe(execution.as_secs_f64()); + if let Some(per_timeline_execution_latency_histo) = + &inner.per_timeline_execution_latency_histo + { + per_timeline_execution_latency_histo.observe(execution.as_secs_f64()); + } + + // state transition + inner.timings = SmgrOpTimerState::Flushing; + + // return the flush in progress object which + // will do the remaining metrics updates + let SmgrOpTimerInner { + global_flush_in_progress_micros, + per_timeline_flush_in_progress_micros, + .. + } = inner; + Some(SmgrOpFlushInProgress { + flush_started_at: at, + global_micros: global_flush_in_progress_micros, + per_timeline_micros: per_timeline_flush_in_progress_micros, + }) + } +} + +/// The last stage of request processing is serializing and flushing the request +/// into the TCP connection. We want to make slow flushes observable +/// _while they are occuring_, so this struct provides a wrapper method [`Self::measure`] +/// to periodically bump the metric. +/// +/// If in the future we decide that we're not interested in live updates, we can +/// add another `observe_*` method to [`SmgrOpTimer`], follow the existing pattern there, +/// and remove this struct from the code base. 
pub(crate) struct SmgrOpFlushInProgress { flush_started_at: Instant, global_micros: IntCounter, per_timeline_micros: IntCounter, } -impl SmgrOpTimer { - pub(crate) fn observe_throttle_done_execution_starting(&mut self, throttle: &ThrottleResult) { - let inner = self.0.as_mut().expect("other public methods consume self"); - match (&mut inner.timings, throttle) { - (SmgrOpTimerState::Received { received_at }, throttle) => match throttle { - ThrottleResult::NotThrottled { start } => { - inner.timings = SmgrOpTimerState::ThrottleDoneExecutionStarting { - received_at: *received_at, - throttle_started_at: *start, - started_execution_at: *start, - }; - } - ThrottleResult::Throttled { start, end } => { - inner.timings = SmgrOpTimerState::ThrottleDoneExecutionStarting { - received_at: *start, - throttle_started_at: *start, - started_execution_at: *end, - }; - } - }, - (x, _) => panic!("called in unexpected state: {x:?}"), - } - } - - pub(crate) fn observe_smgr_op_completion_and_start_flushing(mut self) -> SmgrOpFlushInProgress { - let (flush_start, inner) = self - .smgr_op_end() - .expect("this method consume self, and the only other caller is drop handler"); - let SmgrOpTimerInner { - global_flush_in_progress_micros, - per_timeline_flush_in_progress_micros, - .. - } = inner; - SmgrOpFlushInProgress { - flush_started_at: flush_start, - global_micros: global_flush_in_progress_micros, - per_timeline_micros: per_timeline_flush_in_progress_micros, - } - } - - /// Returns `None`` if this method has already been called, `Some` otherwise. - fn smgr_op_end(&mut self) -> Option<(Instant, SmgrOpTimerInner)> { - let inner = self.0.take()?; - - let now = Instant::now(); - - let batch; - let execution; - let throttle; - match inner.timings { - SmgrOpTimerState::Received { received_at } => { - batch = (now - received_at).as_secs_f64(); - // TODO: use label for dropped requests. - // This is quite rare in practice, only during tenant/pageservers shutdown. - throttle = Duration::ZERO; - execution = Duration::ZERO.as_secs_f64(); - } - SmgrOpTimerState::ThrottleDoneExecutionStarting { - received_at, - throttle_started_at, - started_execution_at, - } => { - batch = (throttle_started_at - received_at).as_secs_f64(); - throttle = started_execution_at - throttle_started_at; - execution = (now - started_execution_at).as_secs_f64(); - } - } - - // update time spent in batching - inner.global_batch_wait_time.observe(batch); - inner.per_timeline_batch_wait_time.observe(batch); - - // time spent in throttle metric is updated by throttle impl - let _ = throttle; - - // update metrics for execution latency - inner.global_execution_latency_histo.observe(execution); - if let Some(per_timeline_execution_latency_histo) = - &inner.per_timeline_execution_latency_histo - { - per_timeline_execution_latency_histo.observe(execution); - } - - Some((now, inner)) - } -} - impl Drop for SmgrOpTimer { fn drop(&mut self) { - self.smgr_op_end(); + // In case of early drop, update any of the remaining metrics with + // observations so that (started,finished) counter pairs balance out + // and all counters on the latency path have the the same number of + // observations. + // It's technically lying and it would be better if each metric had + // a separate label or similar for cancelled requests. + // But we don't have that right now and counter pairs balancing + // out is useful when using the metrics in panels and whatnot. 
+ let now = Instant::now(); + self.observe_throttle_start(now); + self.observe_throttle_done(ThrottleResult::NotThrottled { end: now }); + self.observe_execution_start(now); + self.observe_execution_end_flush_start(now); } } @@ -1345,12 +1417,12 @@ impl SmgrOpFlushInProgress { { let mut fut = std::pin::pin!(fut); - let now = Instant::now(); // Whenever observe_guard gets called, or dropped, // it adds the time elapsed since its last call to metrics. // Last call is tracked in `now`. let mut observe_guard = scopeguard::guard( || { + let now = Instant::now(); let elapsed = now - self.flush_started_at; self.global_micros .inc_by(u64::try_from(elapsed.as_micros()).unwrap()); @@ -1393,7 +1465,6 @@ pub enum SmgrQueryType { GetSlruSegment, } -#[derive(Debug)] pub(crate) struct SmgrQueryTimePerTimeline { global_started: [IntCounter; SmgrQueryType::COUNT], global_latency: [Histogram; SmgrQueryType::COUNT], @@ -1405,6 +1476,7 @@ pub(crate) struct SmgrQueryTimePerTimeline { per_timeline_flush_in_progress_micros: IntCounter, global_batch_wait_time: Histogram, per_timeline_batch_wait_time: Histogram, + throttling: Arc, } static SMGR_QUERY_STARTED_GLOBAL: Lazy = Lazy::new(|| { @@ -1610,7 +1682,11 @@ static PAGE_SERVICE_SMGR_BATCH_WAIT_TIME_GLOBAL: Lazy = Lazy::new(|| }); impl SmgrQueryTimePerTimeline { - pub(crate) fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self { + pub(crate) fn new( + tenant_shard_id: &TenantShardId, + timeline_id: &TimelineId, + pagestream_throttle_metrics: Arc, + ) -> Self { let tenant_id = tenant_shard_id.tenant_id.to_string(); let shard_slug = format!("{}", tenant_shard_id.shard_slug()); let timeline_id = timeline_id.to_string(); @@ -1671,6 +1747,7 @@ impl SmgrQueryTimePerTimeline { per_timeline_flush_in_progress_micros, global_batch_wait_time, per_timeline_batch_wait_time, + throttling: pagestream_throttle_metrics, } } pub(crate) fn start_smgr_op(&self, op: SmgrQueryType, received_at: Instant) -> SmgrOpTimer { @@ -1686,88 +1763,24 @@ impl SmgrQueryTimePerTimeline { SmgrOpTimer(Some(SmgrOpTimerInner { global_execution_latency_histo: self.global_latency[op as usize].clone(), per_timeline_execution_latency_histo: per_timeline_latency_histo, - timings: SmgrOpTimerState::Received { received_at }, global_flush_in_progress_micros: self.global_flush_in_progress_micros.clone(), per_timeline_flush_in_progress_micros: self .per_timeline_flush_in_progress_micros .clone(), global_batch_wait_time: self.global_batch_wait_time.clone(), per_timeline_batch_wait_time: self.per_timeline_batch_wait_time.clone(), + throttling: self.throttling.clone(), + timings: SmgrOpTimerState::Received { received_at }, })) } + /// TODO: do something about this? seems odd, we have a similar call on SmgrOpTimer pub(crate) fn observe_getpage_batch_start(&self, batch_size: usize) { self.global_batch_size.observe(batch_size as f64); self.per_timeline_batch_size.observe(batch_size as f64); } } -#[cfg(test)] -mod smgr_query_time_tests { - use std::time::Instant; - - use pageserver_api::shard::TenantShardId; - use strum::IntoEnumIterator; - use utils::id::{TenantId, TimelineId}; - - // Regression test, we used hard-coded string constants before using an enum. 
- #[test] - fn op_label_name() { - use super::SmgrQueryType::*; - let expect: [(super::SmgrQueryType, &'static str); 5] = [ - (GetRelExists, "get_rel_exists"), - (GetRelSize, "get_rel_size"), - (GetPageAtLsn, "get_page_at_lsn"), - (GetDbSize, "get_db_size"), - (GetSlruSegment, "get_slru_segment"), - ]; - for (op, expect) in expect { - let actual: &'static str = op.into(); - assert_eq!(actual, expect); - } - } - - #[test] - fn basic() { - let ops: Vec<_> = super::SmgrQueryType::iter().collect(); - - for op in &ops { - let tenant_id = TenantId::generate(); - let timeline_id = TimelineId::generate(); - let metrics = super::SmgrQueryTimePerTimeline::new( - &TenantShardId::unsharded(tenant_id), - &timeline_id, - ); - - let get_counts = || { - let global: u64 = ops - .iter() - .map(|op| metrics.global_latency[*op as usize].get_sample_count()) - .sum(); - ( - global, - metrics.per_timeline_getpage_latency.get_sample_count(), - ) - }; - - let (pre_global, pre_per_tenant_timeline) = get_counts(); - assert_eq!(pre_per_tenant_timeline, 0); - - let timer = metrics.start_smgr_op(*op, Instant::now()); - drop(timer); - - let (post_global, post_per_tenant_timeline) = get_counts(); - if matches!(op, super::SmgrQueryType::GetPageAtLsn) { - // getpage ops are tracked per-timeline, others aren't - assert_eq!(post_per_tenant_timeline, 1); - } else { - assert_eq!(post_per_tenant_timeline, 0); - } - assert!(post_global > pre_global); - } - } -} - // keep in sync with control plane Go code so that we can validate // compute's basebackup_ms metric with our perspective in the context of SLI/SLO. static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| { @@ -3563,9 +3576,7 @@ pub(crate) mod tenant_throttling { use once_cell::sync::Lazy; use utils::shard::TenantShardId; - use crate::tenant::{self}; - - struct GlobalAndPerTenantIntCounter { + pub(crate) struct GlobalAndPerTenantIntCounter { global: IntCounter, per_tenant: IntCounter, } @@ -3583,10 +3594,10 @@ pub(crate) mod tenant_throttling { } pub(crate) struct Metrics { - count_accounted_start: GlobalAndPerTenantIntCounter, - count_accounted_finish: GlobalAndPerTenantIntCounter, - wait_time: GlobalAndPerTenantIntCounter, - count_throttled: GlobalAndPerTenantIntCounter, + pub(super) count_accounted_start: GlobalAndPerTenantIntCounter, + pub(super) count_accounted_finish: GlobalAndPerTenantIntCounter, + pub(super) wait_time: GlobalAndPerTenantIntCounter, + pub(super) count_throttled: GlobalAndPerTenantIntCounter, } static COUNT_ACCOUNTED_START: Lazy = Lazy::new(|| { @@ -3721,26 +3732,6 @@ pub(crate) mod tenant_throttling { } } } - - impl tenant::throttle::Metric for Metrics { - #[inline(always)] - fn accounting_start(&self) { - self.count_accounted_start.inc(); - } - #[inline(always)] - fn accounting_finish(&self) { - self.count_accounted_finish.inc(); - } - #[inline(always)] - fn observe_throttling( - &self, - tenant::throttle::Observation { wait_time }: &tenant::throttle::Observation, - ) { - let val = u64::try_from(wait_time.as_micros()).unwrap(); - self.wait_time.inc_by(val); - self.count_throttled.inc(); - } - } } pub(crate) mod disk_usage_based_eviction { diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index f6504bd3b5..b3e18fed99 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -592,43 +592,21 @@ enum BatchedFeMessage { } impl BatchedFeMessage { - async fn throttle_and_record_start_processing( - &mut self, - cancel: &CancellationToken, - ) -> Result<(), QueryError> { - let (shard, tokens, 
timers) = match self { - BatchedFeMessage::Exists { shard, timer, .. } - | BatchedFeMessage::Nblocks { shard, timer, .. } - | BatchedFeMessage::DbSize { shard, timer, .. } - | BatchedFeMessage::GetSlruSegment { shard, timer, .. } => { - ( - shard, - // 1 token is probably under-estimating because these - // request handlers typically do several Timeline::get calls. - 1, - itertools::Either::Left(std::iter::once(timer)), - ) + fn observe_execution_start(&mut self, at: Instant) { + match self { + BatchedFeMessage::Exists { timer, .. } + | BatchedFeMessage::Nblocks { timer, .. } + | BatchedFeMessage::DbSize { timer, .. } + | BatchedFeMessage::GetSlruSegment { timer, .. } => { + timer.observe_execution_start(at); } - BatchedFeMessage::GetPage { shard, pages, .. } => ( - shard, - pages.len(), - itertools::Either::Right(pages.iter_mut().map(|p| &mut p.timer)), - ), - BatchedFeMessage::RespondError { .. } => return Ok(()), - }; - let throttled = tokio::select! { - throttled = shard.pagestream_throttle.throttle(tokens) => { throttled } - _ = shard.cancel.cancelled() => { - return Err(QueryError::Shutdown); + BatchedFeMessage::GetPage { pages, .. } => { + for page in pages { + page.timer.observe_execution_start(at); + } } - _ = cancel.cancelled() => { - return Err(QueryError::Shutdown); - } - }; - for timer in timers { - timer.observe_throttle_done_execution_starting(&throttled); + BatchedFeMessage::RespondError { .. } => {} } - Ok(()) } } @@ -720,6 +698,26 @@ impl PageServerHandler { let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader(), protocol_version)?; + // TODO: turn in to async closure once available to avoid repeating received_at + async fn record_op_start_and_throttle( + shard: &timeline::handle::Handle, + op: metrics::SmgrQueryType, + received_at: Instant, + ) -> Result { + // It's important to start the smgr op metric recorder as early as possible + // so that the _started counters are incremented before we do + // any serious waiting, e.g., for throttle, batching, or actual request handling. + let mut timer = shard.query_metrics.start_smgr_op(op, received_at); + let now = Instant::now(); + timer.observe_throttle_start(now); + let throttled = tokio::select! 
{ + res = shard.pagestream_throttle.throttle(1, now) => res, + _ = shard.cancel.cancelled() => return Err(QueryError::Shutdown), + }; + timer.observe_throttle_done(throttled); + Ok(timer) + } + let batched_msg = match neon_fe_msg { PagestreamFeMessage::Exists(req) => { let span = tracing::info_span!(parent: parent_span, "handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn); @@ -727,9 +725,12 @@ impl PageServerHandler { .get(tenant_id, timeline_id, ShardSelector::Zero) .instrument(span.clone()) // sets `shard_id` field .await?; - let timer = shard - .query_metrics - .start_smgr_op(metrics::SmgrQueryType::GetRelExists, received_at); + let timer = record_op_start_and_throttle( + &shard, + metrics::SmgrQueryType::GetRelExists, + received_at, + ) + .await?; BatchedFeMessage::Exists { span, timer, @@ -743,9 +744,12 @@ impl PageServerHandler { .get(tenant_id, timeline_id, ShardSelector::Zero) .instrument(span.clone()) // sets `shard_id` field .await?; - let timer = shard - .query_metrics - .start_smgr_op(metrics::SmgrQueryType::GetRelSize, received_at); + let timer = record_op_start_and_throttle( + &shard, + metrics::SmgrQueryType::GetRelSize, + received_at, + ) + .await?; BatchedFeMessage::Nblocks { span, timer, @@ -759,9 +763,12 @@ impl PageServerHandler { .get(tenant_id, timeline_id, ShardSelector::Zero) .instrument(span.clone()) // sets `shard_id` field .await?; - let timer = shard - .query_metrics - .start_smgr_op(metrics::SmgrQueryType::GetDbSize, received_at); + let timer = record_op_start_and_throttle( + &shard, + metrics::SmgrQueryType::GetDbSize, + received_at, + ) + .await?; BatchedFeMessage::DbSize { span, timer, @@ -775,9 +782,12 @@ impl PageServerHandler { .get(tenant_id, timeline_id, ShardSelector::Zero) .instrument(span.clone()) // sets `shard_id` field .await?; - let timer = shard - .query_metrics - .start_smgr_op(metrics::SmgrQueryType::GetSlruSegment, received_at); + let timer = record_op_start_and_throttle( + &shard, + metrics::SmgrQueryType::GetSlruSegment, + received_at, + ) + .await?; BatchedFeMessage::GetSlruSegment { span, timer, @@ -826,12 +836,12 @@ impl PageServerHandler { } }; - // It's important to start the timer before waiting for the LSN - // so that the _started counters are incremented before we do - // any serious waiting, e.g., for LSNs. - let timer = shard - .query_metrics - .start_smgr_op(metrics::SmgrQueryType::GetPageAtLsn, received_at); + let timer = record_op_start_and_throttle( + &shard, + metrics::SmgrQueryType::GetPageAtLsn, + received_at, + ) + .await?; let effective_request_lsn = match Self::wait_or_get_last_lsn( &shard, @@ -937,6 +947,13 @@ impl PageServerHandler { where IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, { + let started_at = Instant::now(); + let batch = { + let mut batch = batch; + batch.observe_execution_start(started_at); + batch + }; + // invoke handler function let (handler_results, span): ( Vec>, @@ -1103,8 +1120,11 @@ impl PageServerHandler { // The timer's underlying metric is used for a storage-internal latency SLO and // we don't want to include latency in it that we can't control. // And as pointed out above, in this case, we don't control the time that flush will take. 
- let flushing_timer = - timer.map(|timer| timer.observe_smgr_op_completion_and_start_flushing()); + let flushing_timer = timer.map(|mut timer| { + timer + .observe_execution_end_flush_start(Instant::now()) + .expect("we are the first caller") + }); // what we want to do let flush_fut = pgb_writer.flush(); @@ -1258,7 +1278,7 @@ impl PageServerHandler { Ok(msg) => msg, Err(e) => break e, }; - let mut msg = match msg { + let msg = match msg { Some(msg) => msg, None => { debug!("pagestream subprotocol end observed"); @@ -1266,10 +1286,6 @@ impl PageServerHandler { } }; - if let Err(cancelled) = msg.throttle_and_record_start_processing(&self.cancel).await { - break cancelled; - } - let err = self .pagesteam_handle_batched_message(pgb_writer, msg, &cancel, protocol_version, ctx) .await; @@ -1429,15 +1445,12 @@ impl PageServerHandler { return Ok(()); } }; - let mut batch = match batch { + let batch = match batch { Ok(batch) => batch, Err(e) => { return Err(e); } }; - batch - .throttle_and_record_start_processing(&self.cancel) - .await?; self.pagesteam_handle_batched_message( pgb_writer, batch, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 070593b104..f6d758ad22 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -365,8 +365,9 @@ pub struct Tenant { /// Throttle applied at the top of [`Timeline::get`]. /// All [`Tenant::timelines`] of a given [`Tenant`] instance share the same [`throttle::Throttle`] instance. - pub(crate) pagestream_throttle: - Arc>, + pub(crate) pagestream_throttle: Arc, + + pub(crate) pagestream_throttle_metrics: Arc, /// An ongoing timeline detach concurrency limiter. /// @@ -1687,6 +1688,7 @@ impl Tenant { TimelineResources { remote_client, pagestream_throttle: self.pagestream_throttle.clone(), + pagestream_throttle_metrics: self.pagestream_throttle_metrics.clone(), l0_flush_global_state: self.l0_flush_global_state.clone(), }, LoadTimelineCause::Attach, @@ -3992,6 +3994,9 @@ impl Tenant { Ok(timeline) } + /// [`Tenant::shutdown`] must be called before dropping the returned [`Tenant`] object + /// to ensure proper cleanup of background tasks and metrics. + // // Allow too_many_arguments because a constructor's argument list naturally grows with the // number of attributes in the struct: breaking these out into a builder wouldn't be helpful. 
#[allow(clippy::too_many_arguments)] @@ -4100,8 +4105,10 @@ impl Tenant { gate: Gate::default(), pagestream_throttle: Arc::new(throttle::Throttle::new( Tenant::get_pagestream_throttle_config(conf, &attached_conf.tenant_conf), - crate::metrics::tenant_throttling::Metrics::new(&tenant_shard_id), )), + pagestream_throttle_metrics: Arc::new( + crate::metrics::tenant_throttling::Pagestream::new(&tenant_shard_id), + ), tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)), ongoing_timeline_detach: std::sync::Mutex::default(), gc_block: Default::default(), @@ -5008,6 +5015,7 @@ impl Tenant { TimelineResources { remote_client: self.build_timeline_remote_client(timeline_id), pagestream_throttle: self.pagestream_throttle.clone(), + pagestream_throttle_metrics: self.pagestream_throttle_metrics.clone(), l0_flush_global_state: self.l0_flush_global_state.clone(), } } diff --git a/pageserver/src/tenant/throttle.rs b/pageserver/src/tenant/throttle.rs index 8ab6a0e060..300d779125 100644 --- a/pageserver/src/tenant/throttle.rs +++ b/pageserver/src/tenant/throttle.rs @@ -3,7 +3,7 @@ use std::{ atomic::{AtomicU64, Ordering}, Arc, }, - time::{Duration, Instant}, + time::Instant, }; use arc_swap::ArcSwap; @@ -16,9 +16,8 @@ use utils::leaky_bucket::{LeakyBucketConfig, RateLimiter}; /// To share a throttle among multiple entities, wrap it in an [`Arc`]. /// /// The intial use case for this is tenant-wide throttling of getpage@lsn requests. -pub struct Throttle { +pub struct Throttle { inner: ArcSwap, - metric: M, /// will be turned into [`Stats::count_accounted_start`] count_accounted_start: AtomicU64, /// will be turned into [`Stats::count_accounted_finish`] @@ -36,15 +35,6 @@ pub struct Inner { pub type Config = pageserver_api::models::ThrottleConfig; -pub struct Observation { - pub wait_time: Duration, -} -pub trait Metric { - fn accounting_start(&self); - fn accounting_finish(&self); - fn observe_throttling(&self, observation: &Observation); -} - /// See [`Throttle::reset_stats`]. pub struct Stats { /// Number of requests that started [`Throttle::throttle`] calls. @@ -59,18 +49,14 @@ pub struct Stats { } pub enum ThrottleResult { - NotThrottled { start: Instant }, - Throttled { start: Instant, end: Instant }, + NotThrottled { end: Instant }, + Throttled { end: Instant }, } -impl Throttle -where - M: Metric, -{ - pub fn new(config: Config, metric: M) -> Self { +impl Throttle { + pub fn new(config: Config) -> Self { Self { inner: ArcSwap::new(Arc::new(Self::new_inner(config))), - metric, count_accounted_start: AtomicU64::new(0), count_accounted_finish: AtomicU64::new(0), count_throttled: AtomicU64::new(0), @@ -127,32 +113,27 @@ where self.inner.load().rate_limiter.steady_rps() } - pub async fn throttle(&self, key_count: usize) -> ThrottleResult { + /// `start` must be [`Instant::now`] or earlier. 
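+    /// The returned [`ThrottleResult`] carries the instant at which the wait ended; `end` equals `start` when the request was not throttled.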
+ pub async fn throttle(&self, key_count: usize, start: Instant) -> ThrottleResult { let inner = self.inner.load_full(); // clones the `Inner` Arc - let start = std::time::Instant::now(); - if !inner.enabled { - return ThrottleResult::NotThrottled { start }; + return ThrottleResult::NotThrottled { end: start }; } - self.metric.accounting_start(); self.count_accounted_start.fetch_add(1, Ordering::Relaxed); let did_throttle = inner.rate_limiter.acquire(key_count).await; self.count_accounted_finish.fetch_add(1, Ordering::Relaxed); - self.metric.accounting_finish(); if did_throttle { self.count_throttled.fetch_add(1, Ordering::Relaxed); - let now = Instant::now(); - let wait_time = now - start; + let end = Instant::now(); + let wait_time = end - start; self.sum_throttled_usecs .fetch_add(wait_time.as_micros() as u64, Ordering::Relaxed); - let observation = Observation { wait_time }; - self.metric.observe_throttling(&observation); - ThrottleResult::Throttled { start, end: now } + ThrottleResult::Throttled { end } } else { - ThrottleResult::NotThrottled { start } + ThrottleResult::NotThrottled { end: start } } } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 741b214a73..4aa6b7a05a 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -208,8 +208,8 @@ fn drop_wlock(rlock: tokio::sync::RwLockWriteGuard<'_, T>) { /// The outward-facing resources required to build a Timeline pub struct TimelineResources { pub remote_client: RemoteTimelineClient, - pub pagestream_throttle: - Arc>, + pub pagestream_throttle: Arc, + pub pagestream_throttle_metrics: Arc, pub l0_flush_global_state: l0_flush::L0FlushGlobalState, } @@ -412,8 +412,7 @@ pub struct Timeline { gc_lock: tokio::sync::Mutex<()>, /// Cloned from [`super::Tenant::pagestream_throttle`] on construction. - pub(crate) pagestream_throttle: - Arc>, + pub(crate) pagestream_throttle: Arc, /// Size estimator for aux file v2 pub(crate) aux_file_size_estimator: AuxFileSizeEstimator, @@ -2310,6 +2309,7 @@ impl Timeline { query_metrics: crate::metrics::SmgrQueryTimePerTimeline::new( &tenant_shard_id, &timeline_id, + resources.pagestream_throttle_metrics, ), directory_metrics: array::from_fn(|_| AtomicU64::new(0)), diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index ae44af3fad..bdc315d985 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -301,6 +301,7 @@ impl DeleteTimelineFlow { TimelineResources { remote_client, pagestream_throttle: tenant.pagestream_throttle.clone(), + pagestream_throttle_metrics: tenant.pagestream_throttle_metrics.clone(), l0_flush_global_state: tenant.l0_flush_global_state.clone(), }, // Important. We dont pass ancestor above because it can be missing. From aa7323a384dd60b8c176ba6bed6a3c2e5f9bd95e Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 14 Jan 2025 15:30:43 +0000 Subject: [PATCH 182/583] storage controller: quality of life improvements for AZ handling (#10379) ## Problem Since https://github.com/neondatabase/neon/pull/9916, the preferred AZ of a tenant is much more impactful, and we would like to make it more visible in tooling. 
## Summary of changes - Include AZ in node describe API - Include AZ info in node & tenant outputs in CLI - Add metrics for per-node shard counts, labelled by AZ - Add a CLI for setting preferred AZ on a tenant - Extend AZ-setting API+CLI to handle None for clearing preferred AZ --- control_plane/storcon_cli/src/main.rs | 120 ++++++++++++++++++++-- libs/pageserver_api/src/controller_api.rs | 4 +- storage_controller/src/metrics.rs | 19 ++++ storage_controller/src/node.rs | 1 + storage_controller/src/persistence.rs | 7 +- storage_controller/src/scheduler.rs | 29 +++++- storage_controller/src/service.rs | 9 +- storage_controller/src/tenant_shard.rs | 4 +- 8 files changed, 175 insertions(+), 18 deletions(-) diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 1653b3c845..9d133e4af1 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -1,12 +1,16 @@ use futures::StreamExt; -use std::{str::FromStr, time::Duration}; +use std::{ + collections::{HashMap, HashSet}, + str::FromStr, + time::Duration, +}; use clap::{Parser, Subcommand}; use pageserver_api::{ controller_api::{ AvailabilityZone, NodeAvailabilityWrapper, NodeDescribeResponse, NodeShardResponse, - SafekeeperDescribeResponse, ShardSchedulingPolicy, TenantCreateRequest, - TenantDescribeResponse, TenantPolicyRequest, + SafekeeperDescribeResponse, ShardSchedulingPolicy, ShardsPreferredAzsRequest, + TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest, }, models::{ EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary, @@ -153,6 +157,12 @@ enum Command { #[arg(long)] tenant_id: TenantId, }, + TenantSetPreferredAz { + #[arg(long)] + tenant_id: TenantId, + #[arg(long)] + preferred_az: Option, + }, /// Uncleanly drop a tenant from the storage controller: this doesn't delete anything from pageservers. Appropriate /// if you e.g. used `tenant-warmup` by mistake on a tenant ID that doesn't really exist, or is in some other region. 
TenantDrop { @@ -402,11 +412,12 @@ async fn main() -> anyhow::Result<()> { resp.sort_by(|a, b| a.listen_http_addr.cmp(&b.listen_http_addr)); let mut table = comfy_table::Table::new(); - table.set_header(["Id", "Hostname", "Scheduling", "Availability"]); + table.set_header(["Id", "Hostname", "AZ", "Scheduling", "Availability"]); for node in resp { table.add_row([ format!("{}", node.id), node.listen_http_addr, + node.availability_zone_id, format!("{:?}", node.scheduling), format!("{:?}", node.availability), ]); @@ -479,6 +490,7 @@ async fn main() -> anyhow::Result<()> { let mut table = comfy_table::Table::new(); table.set_header([ "TenantId", + "Preferred AZ", "ShardCount", "StripeSize", "Placement", @@ -488,6 +500,11 @@ async fn main() -> anyhow::Result<()> { let shard_zero = tenant.shards.into_iter().next().unwrap(); table.add_row([ format!("{}", tenant.tenant_id), + shard_zero + .preferred_az_id + .as_ref() + .cloned() + .unwrap_or("".to_string()), format!("{}", shard_zero.tenant_shard_id.shard_count.literal()), format!("{:?}", tenant.stripe_size), format!("{:?}", tenant.policy), @@ -614,6 +631,19 @@ async fn main() -> anyhow::Result<()> { None, ) .await?; + + let nodes = storcon_client + .dispatch::<(), Vec>( + Method::GET, + "control/v1/node".to_string(), + None, + ) + .await?; + let nodes = nodes + .into_iter() + .map(|n| (n.id, n)) + .collect::>(); + println!("Tenant {tenant_id}"); let mut table = comfy_table::Table::new(); table.add_row(["Policy", &format!("{:?}", policy)]); @@ -622,7 +652,14 @@ async fn main() -> anyhow::Result<()> { println!("{table}"); println!("Shards:"); let mut table = comfy_table::Table::new(); - table.set_header(["Shard", "Attached", "Secondary", "Last error", "status"]); + table.set_header([ + "Shard", + "Attached", + "Attached AZ", + "Secondary", + "Last error", + "status", + ]); for shard in shards { let secondary = shard .node_secondary @@ -645,11 +682,18 @@ async fn main() -> anyhow::Result<()> { } let status = status_parts.join(","); + let attached_node = shard + .node_attached + .as_ref() + .map(|id| nodes.get(id).expect("Shard references nonexistent node")); + table.add_row([ format!("{}", shard.tenant_shard_id), - shard - .node_attached - .map(|n| format!("{}", n)) + attached_node + .map(|n| format!("{} ({})", n.listen_http_addr, n.id)) + .unwrap_or(String::new()), + attached_node + .map(|n| n.availability_zone_id.clone()) .unwrap_or(String::new()), secondary, shard.last_error, @@ -658,6 +702,66 @@ async fn main() -> anyhow::Result<()> { } println!("{table}"); } + Command::TenantSetPreferredAz { + tenant_id, + preferred_az, + } => { + // First learn about the tenant's shards + let describe_response = storcon_client + .dispatch::<(), TenantDescribeResponse>( + Method::GET, + format!("control/v1/tenant/{tenant_id}"), + None, + ) + .await?; + + // Learn about nodes to validate the AZ ID + let nodes = storcon_client + .dispatch::<(), Vec>( + Method::GET, + "control/v1/node".to_string(), + None, + ) + .await?; + + if let Some(preferred_az) = &preferred_az { + let azs = nodes + .into_iter() + .map(|n| (n.availability_zone_id)) + .collect::>(); + if !azs.contains(preferred_az) { + anyhow::bail!( + "AZ {} not found on any node: known AZs are: {:?}", + preferred_az, + azs + ); + } + } else { + // Make it obvious to the user that since they've omitted an AZ, we're clearing it + eprintln!("Clearing preferred AZ for tenant {}", tenant_id); + } + + // Construct a request that modifies all the tenant's shards + let req = ShardsPreferredAzsRequest { + 
preferred_az_ids: describe_response + .shards + .into_iter() + .map(|s| { + ( + s.tenant_shard_id, + preferred_az.clone().map(AvailabilityZone), + ) + }) + .collect(), + }; + storcon_client + .dispatch::( + Method::PUT, + "control/v1/preferred_azs".to_string(), + Some(req), + ) + .await?; + } Command::TenantWarmup { tenant_id } => { let describe_response = storcon_client .dispatch::<(), TenantDescribeResponse>( diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index f3aefc6df9..f3880cb766 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -87,7 +87,7 @@ impl Display for AvailabilityZone { #[derive(Serialize, Deserialize)] pub struct ShardsPreferredAzsRequest { #[serde(flatten)] - pub preferred_az_ids: HashMap, + pub preferred_az_ids: HashMap>, } #[derive(Serialize, Deserialize)] @@ -144,6 +144,8 @@ pub struct NodeDescribeResponse { pub availability: NodeAvailabilityWrapper, pub scheduling: NodeSchedulingPolicy, + pub availability_zone_id: String, + pub listen_http_addr: String, pub listen_http_port: u16, diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs index 6d5885eba6..4164e3dc2b 100644 --- a/storage_controller/src/metrics.rs +++ b/storage_controller/src/metrics.rs @@ -53,6 +53,16 @@ pub(crate) struct StorageControllerMetricGroup { /// How many shards are not scheduled into their preferred AZ pub(crate) storage_controller_schedule_az_violation: measured::Gauge, + /// How many shard locations (secondary or attached) on each node + pub(crate) storage_controller_node_shards: measured::GaugeVec, + + /// How many _attached_ shard locations on each node + pub(crate) storage_controller_node_attached_shards: measured::GaugeVec, + + /// How many _home_ shard locations on each node (i.e. 
the node's AZ matches the shard's + /// preferred AZ) + pub(crate) storage_controller_node_home_shards: measured::GaugeVec, + /// How many shards would like to reconcile but were blocked by concurrency limits pub(crate) storage_controller_pending_reconciles: measured::Gauge, @@ -132,6 +142,15 @@ impl Default for StorageControllerMetrics { } } +#[derive(measured::LabelGroup, Clone)] +#[label(set = NodeLabelGroupSet)] +pub(crate) struct NodeLabelGroup<'a> { + #[label(dynamic_with = lasso::ThreadedRodeo, default)] + pub(crate) az: &'a str, + #[label(dynamic_with = lasso::ThreadedRodeo, default)] + pub(crate) node_id: &'a str, +} + #[derive(measured::LabelGroup)] #[label(set = ReconcileCompleteLabelGroupSet)] pub(crate) struct ReconcileCompleteLabelGroup { diff --git a/storage_controller/src/node.rs b/storage_controller/src/node.rs index 4cc9b0070d..f5c2d329e0 100644 --- a/storage_controller/src/node.rs +++ b/storage_controller/src/node.rs @@ -299,6 +299,7 @@ impl Node { id: self.id, availability: self.availability.clone().into(), scheduling: self.scheduling, + availability_zone_id: self.availability_zone_id.0.clone(), listen_http_addr: self.listen_http_addr.clone(), listen_http_port: self.listen_http_port, listen_pg_addr: self.listen_pg_addr.clone(), diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index beb014f0a8..eb0bfc879e 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -708,10 +708,11 @@ impl Persistence { Ok(()) } + /// Note that passing None for a shard clears the preferred AZ (rather than leaving it unmodified) pub(crate) async fn set_tenant_shard_preferred_azs( &self, - preferred_azs: Vec<(TenantShardId, AvailabilityZone)>, - ) -> DatabaseResult> { + preferred_azs: Vec<(TenantShardId, Option)>, + ) -> DatabaseResult)>> { use crate::schema::tenant_shards::dsl::*; self.with_measured_conn(DatabaseOperation::SetPreferredAzs, move |conn| { @@ -722,7 +723,7 @@ impl Persistence { .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) - .set(preferred_az_id.eq(preferred_az.0.clone())) + .set(preferred_az_id.eq(preferred_az.as_ref().map(|az| az.0.clone()))) .execute(conn)?; if updated == 1 { diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs index 04a594dcac..f5cab9dd57 100644 --- a/storage_controller/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -1,4 +1,4 @@ -use crate::{node::Node, tenant_shard::TenantShard}; +use crate::{metrics::NodeLabelGroup, node::Node, tenant_shard::TenantShard}; use itertools::Itertools; use pageserver_api::{controller_api::AvailabilityZone, models::PageserverUtilization}; use serde::Serialize; @@ -872,6 +872,33 @@ impl Scheduler { pub(crate) fn get_node_attached_shard_count(&self, node_id: NodeId) -> usize { self.nodes.get(&node_id).unwrap().attached_shard_count } + + /// Some metrics that we only calculate periodically: this is simpler than + /// rigorously updating them on every change. 
+ pub(crate) fn update_metrics(&self) { + for (node_id, node) in &self.nodes { + let node_id_str = format!("{}", node_id); + let label_group = NodeLabelGroup { + az: &node.az.0, + node_id: &node_id_str, + }; + + crate::metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_node_shards + .set(label_group.clone(), node.shard_count as i64); + + crate::metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_node_attached_shards + .set(label_group.clone(), node.attached_shard_count as i64); + + crate::metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_node_home_shards + .set(label_group.clone(), node.home_shard_count as i64); + } + } } #[cfg(test)] diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index dadcc44cfb..cbb9103880 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -2517,7 +2517,7 @@ impl Service { .map(|t| { ( t.get_tenant_shard_id().expect("Corrupt shard in database"), - load_in_az.clone(), + Some(load_in_az.clone()), ) }) .collect(), @@ -6390,7 +6390,7 @@ impl Service { /// available. A return value of 0 indicates that everything is fully reconciled already. fn reconcile_all(&self) -> usize { let mut locked = self.inner.write().unwrap(); - let (nodes, tenants, _scheduler) = locked.parts_mut(); + let (nodes, tenants, scheduler) = locked.parts_mut(); let pageservers = nodes.clone(); // This function is an efficient place to update lazy statistics, since we are walking @@ -6451,6 +6451,9 @@ impl Service { } } + // Some metrics are calculated from SchedulerNode state, update these periodically + scheduler.update_metrics(); + // Process any deferred tenant drops for (tenant_id, guard) in drop_detached_tenants { self.maybe_drop_tenant(tenant_id, &mut locked, &guard); @@ -6509,7 +6512,7 @@ impl Service { // Shard was dropped between planning and execution; continue; }; - tracing::info!("Applying optimization: {optimization:?}"); + tracing::info!(tenant_shard_id=%tenant_shard_id, "Applying optimization: {optimization:?}"); if shard.apply_optimization(scheduler, optimization) { optimizations_applied += 1; if self.maybe_reconcile_shard(shard, nodes).is_some() { diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index 2ba2a57eba..79ed628c25 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -1708,8 +1708,8 @@ impl TenantShard { self.intent.preferred_az_id.as_ref() } - pub(crate) fn set_preferred_az(&mut self, preferred_az_id: AvailabilityZone) { - self.intent.preferred_az_id = Some(preferred_az_id); + pub(crate) fn set_preferred_az(&mut self, preferred_az_id: Option) { + self.intent.preferred_az_id = preferred_az_id; } /// Returns all the nodes to which this tenant shard is attached according to the From ffaa52ff5d3f36becaf70be2b6053c7423deba61 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 14 Jan 2025 17:31:59 +0100 Subject: [PATCH 183/583] pageserver: reorder upload queue when possible (#10218) ## Problem The upload queue currently sees significant head-of-line blocking. For example, index uploads act as upload barriers, and for every layer flush we schedule a layer and index upload, which effectively serializes layer uploads. Resolves #10096. ## Summary of changes Allow upload queue operations to bypass the queue if they don't conflict with preceding operations, increasing parallelism. 
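To illustrate the idea, here is a minimal, self-contained sketch of conflict-based reordering. The `Op`, `can_bypass`, and `next_ready` names below are simplified stand-ins for this explanation, not the pageserver's actual `UploadOp`/`UploadQueue` types: an operation may jump ahead only if it conflicts with nothing in progress and nothing queued ahead of it.

```rust
use std::collections::VecDeque;

/// Simplified stand-in for a queued operation: an upload or delete of a named
/// file, or a barrier that nothing may pass.
#[derive(Debug, PartialEq)]
enum Op {
    Upload(&'static str),
    Delete(&'static str),
    Barrier,
}

impl Op {
    /// True if `self` may run while (or before) `other` is still outstanding.
    fn can_bypass(&self, other: &Op) -> bool {
        match (self, other) {
            // Nothing passes a barrier, and a barrier passes nothing.
            (Op::Barrier, _) | (_, Op::Barrier) => false,
            // Operations on different files never conflict.
            (Op::Upload(a) | Op::Delete(a), Op::Upload(b) | Op::Delete(b)) => a != b,
        }
    }
}

/// Removes and returns the first queued op that conflicts with no in-progress
/// op and no queued op ahead of it, or None if nothing is ready yet.
fn next_ready(in_progress: &[Op], queue: &mut VecDeque<Op>) -> Option<Op> {
    let pos = (0..queue.len()).find(|&i| {
        in_progress
            .iter()
            .chain(queue.iter().take(i))
            .all(|ahead| queue[i].can_bypass(ahead))
    })?;
    queue.remove(pos)
}

fn main() {
    // One upload is already running; the queue holds a conflicting delete and
    // an unrelated upload.
    let in_progress = vec![Op::Upload("layer-a")];
    let mut queue = VecDeque::from(vec![Op::Delete("layer-a"), Op::Upload("layer-b")]);

    // The delete must wait for the in-progress upload of the same file, but
    // the unrelated upload may jump the queue.
    assert_eq!(next_ready(&in_progress, &mut queue), Some(Op::Upload("layer-b")));
    assert_eq!(next_ready(&in_progress, &mut queue), None);

    // Once the conflicting upload finishes, the delete becomes ready.
    assert_eq!(next_ready(&[], &mut queue), Some(Op::Delete("layer-a")));
    println!("reordering sketch ok");
}
```

The real implementation below additionally threads the active `IndexPart` through the scan so that uploads and deletes cannot bypass an index that still references the affected layer, and, like this sketch, it accepts a quadratic scan on the assumption that queues stay small.
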
NB: the upload queue currently schedules an explicit barrier after every layer flush as well (see #8550). This must be removed to enable parallelism. This will require a better mechanism for compaction backpressure, see e.g. #8390 or #5415. --- Cargo.lock | 1 + pageserver/Cargo.toml | 5 + pageserver/benches/upload_queue.rs | 86 ++ pageserver/src/tenant/metadata.rs | 1 - .../src/tenant/remote_timeline_client.rs | 101 +- .../tenant/remote_timeline_client/index.rs | 2 +- pageserver/src/tenant/storage_layer/layer.rs | 2 +- pageserver/src/tenant/upload_queue.rs | 957 +++++++++++++++++- 8 files changed, 1029 insertions(+), 126 deletions(-) create mode 100644 pageserver/benches/upload_queue.rs diff --git a/Cargo.lock b/Cargo.lock index 08453120c7..1e29f4fc08 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4044,6 +4044,7 @@ dependencies = [ "postgres_connection", "postgres_ffi", "postgres_initdb", + "pprof", "pq_proto", "procfs", "rand 0.8.5", diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 140b287ccc..8547746d94 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -44,6 +44,7 @@ postgres_backend.workspace = true postgres-protocol.workspace = true postgres-types.workspace = true postgres_initdb.workspace = true +pprof.workspace = true rand.workspace = true range-set-blaze = { version = "0.1.16", features = ["alloc"] } regex.workspace = true @@ -108,3 +109,7 @@ harness = false [[bench]] name = "bench_ingest" harness = false + +[[bench]] +name = "upload_queue" +harness = false diff --git a/pageserver/benches/upload_queue.rs b/pageserver/benches/upload_queue.rs new file mode 100644 index 0000000000..528b3d5490 --- /dev/null +++ b/pageserver/benches/upload_queue.rs @@ -0,0 +1,86 @@ +//! Upload queue benchmarks. + +use std::str::FromStr as _; +use std::sync::atomic::AtomicU32; +use std::sync::Arc; + +use criterion::{criterion_group, criterion_main, Bencher, Criterion}; +use pageserver::tenant::metadata::TimelineMetadata; +use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; +use pageserver::tenant::storage_layer::LayerName; +use pageserver::tenant::upload_queue::{Delete, UploadOp, UploadQueue, UploadTask}; +use pageserver::tenant::IndexPart; +use pprof::criterion::{Output, PProfProfiler}; +use utils::generation::Generation; +use utils::shard::{ShardCount, ShardIndex, ShardNumber}; + +// Register benchmarks with Criterion. +criterion_group!( + name = benches; + config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None))); + targets = bench_upload_queue_next_ready, +); +criterion_main!(benches); + +/// Benchmarks the cost of UploadQueue::next_ready() with the given number of in-progress tasks +/// (which is equivalent to tasks ahead of it in the queue). This has linear cost, and the upload +/// queue as a whole is thus quadratic. +/// +/// UploadOp::UploadLayer requires an entire tenant and timeline to construct, so we just test +/// Delete and UploadMetadata instead. This is incidentally the most expensive case. +fn bench_upload_queue_next_ready(c: &mut Criterion) { + let mut g = c.benchmark_group("upload_queue_next_ready"); + for inprogress in [0, 1, 10, 100, 1_000, 10_000, 100_000, 1_000_000] { + g.bench_function(format!("inprogress={inprogress}"), |b| { + run_bench(b, inprogress).unwrap() + }); + } + + fn run_bench(b: &mut Bencher, inprogress: usize) -> anyhow::Result<()> { + // Construct two layers. layer0 is in the indexes, layer1 will be deleted. 
+ let layer0 = LayerName::from_str("000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51").expect("invalid name"); + let layer1 = LayerName::from_str("100000000000000000000000000000000001-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51").expect("invalid name"); + + let metadata = LayerFileMetadata { + shard: ShardIndex::new(ShardNumber(1), ShardCount(2)), + generation: Generation::Valid(1), + file_size: 0, + }; + + // Construct the (initial and uploaded) index with layer0. + let mut index = IndexPart::empty(TimelineMetadata::example()); + index.layer_metadata.insert(layer0, metadata.clone()); + + // Construct the queue. + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_with_current_remote_index_part(&index)?; + + // Populate inprogress_tasks with a bunch of layer1 deletions. + let delete = UploadOp::Delete(Delete { + layers: vec![(layer1, metadata)], + }); + + for task_id in 0..(inprogress as u64) { + queue.inprogress_tasks.insert( + task_id, + Arc::new(UploadTask { + task_id, + retries: AtomicU32::new(0), + op: delete.clone(), + }), + ); + } + + // Benchmark index upload scheduling. + let index_upload = UploadOp::UploadMetadata { + uploaded: Box::new(index), + }; + + b.iter(|| { + queue.queued_operations.push_front(index_upload.clone()); + assert!(queue.next_ready().is_some()); + }); + + Ok(()) + } +} diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index 24440d4b35..d281eb305f 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -320,7 +320,6 @@ impl TimelineMetadata { // Checksums make it awkward to build a valid instance by hand. This helper // provides a TimelineMetadata with a valid checksum in its header. - #[cfg(test)] pub fn example() -> Self { let instance = Self::new( "0/16960E8".parse::().unwrap(), diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 813111245d..75e8da496d 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -63,22 +63,18 @@ //! The contract between client and its user is that the user is responsible of //! scheduling operations in an order that keeps the remote consistent as //! described above. +//! //! From the user's perspective, the operations are executed sequentially. //! Internally, the client knows which operations can be performed in parallel, //! and which operations act like a "barrier" that require preceding operations //! to finish. The calling code just needs to call the schedule-functions in the //! correct order, and the client will parallelize the operations in a way that -//! is safe. -//! -//! The caller should be careful with deletion, though. They should not delete -//! local files that have been scheduled for upload but not yet finished uploading. -//! Otherwise the upload will fail. To wait for an upload to finish, use -//! the 'wait_completion' function (more on that later.) +//! is safe. For more details, see `UploadOp::can_bypass`. //! //! All of this relies on the following invariants: //! //! - We rely on read-after write consistency in the remote storage. -//! - Layer files are immutable +//! - Layer files are immutable. //! //! NB: Pageserver assumes that it has exclusive write access to the tenant in remote //! storage. 
Different tenants can be attached to different pageservers, but if the @@ -1855,57 +1851,17 @@ impl RemoteTimelineClient { Ok(()) } - /// /// Pick next tasks from the queue, and start as many of them as possible without violating /// the ordering constraints. /// - /// The caller needs to already hold the `upload_queue` lock. + /// TODO: consider limiting the number of in-progress tasks, beyond what remote_storage does. + /// This can launch an unbounded number of queued tasks. `UploadQueue::next_ready()` also has + /// worst-case quadratic cost in the number of tasks, and may struggle beyond 10,000 tasks. fn launch_queued_tasks(self: &Arc, upload_queue: &mut UploadQueueInitialized) { - while let Some(next_op) = upload_queue.queued_operations.front() { - // Can we run this task now? - let can_run_now = match next_op { - UploadOp::UploadLayer(..) => { - // Can always be scheduled. - true - } - UploadOp::UploadMetadata { .. } => { - // These can only be performed after all the preceding operations - // have finished. - upload_queue.inprogress_tasks.is_empty() - } - UploadOp::Delete(..) => { - // Wait for preceding uploads to finish. Concurrent deletions are OK, though. - upload_queue.num_inprogress_deletions == upload_queue.inprogress_tasks.len() - } + while let Some(mut next_op) = upload_queue.next_ready() { + debug!("starting op: {next_op}"); - UploadOp::Barrier(_) | UploadOp::Shutdown => { - upload_queue.inprogress_tasks.is_empty() - } - }; - - // If we cannot launch this task, don't look any further. - // - // In some cases, we could let some non-frontmost tasks to "jump the queue" and launch - // them now, but we don't try to do that currently. For example, if the frontmost task - // is an index-file upload that cannot proceed until preceding uploads have finished, we - // could still start layer uploads that were scheduled later. - if !can_run_now { - break; - } - - if let UploadOp::Shutdown = next_op { - // leave the op in the queue but do not start more tasks; it will be dropped when - // the stop is called. - upload_queue.shutdown_ready.close(); - break; - } - - // We can launch this task. Remove it from the queue first. - let mut next_op = upload_queue.queued_operations.pop_front().unwrap(); - - debug!("starting op: {}", next_op); - - // Update the counters and prepare + // Prepare upload. match &mut next_op { UploadOp::UploadLayer(layer, meta, mode) => { if upload_queue @@ -1916,18 +1872,14 @@ impl RemoteTimelineClient { } else { *mode = Some(OpType::MayReorder) } - upload_queue.num_inprogress_layer_uploads += 1; - } - UploadOp::UploadMetadata { .. } => { - upload_queue.num_inprogress_metadata_uploads += 1; } + UploadOp::UploadMetadata { .. } => {} UploadOp::Delete(Delete { layers }) => { for (name, meta) in layers { upload_queue .recently_deleted .insert((name.clone(), meta.generation)); } - upload_queue.num_inprogress_deletions += 1; } UploadOp::Barrier(sender) => { sender.send_replace(()); @@ -2027,6 +1979,8 @@ impl RemoteTimelineClient { let upload_result: anyhow::Result<()> = match &task.op { UploadOp::UploadLayer(ref layer, ref layer_metadata, mode) => { + // TODO: check if this mechanism can be removed now that can_bypass() performs + // conflict checks during scheduling. if let Some(OpType::FlushDeletion) = mode { if self.config.read().unwrap().block_deletions { // Of course, this is not efficient... but usually the queue should be empty. 
@@ -2249,13 +2203,8 @@ impl RemoteTimelineClient { upload_queue.inprogress_tasks.remove(&task.task_id); let lsn_update = match task.op { - UploadOp::UploadLayer(_, _, _) => { - upload_queue.num_inprogress_layer_uploads -= 1; - None - } + UploadOp::UploadLayer(_, _, _) => None, UploadOp::UploadMetadata { ref uploaded } => { - upload_queue.num_inprogress_metadata_uploads -= 1; - // the task id is reused as a monotonicity check for storing the "clean" // IndexPart. let last_updater = upload_queue.clean.1; @@ -2289,10 +2238,7 @@ impl RemoteTimelineClient { None } } - UploadOp::Delete(_) => { - upload_queue.num_inprogress_deletions -= 1; - None - } + UploadOp::Delete(_) => None, UploadOp::Barrier(..) | UploadOp::Shutdown => unreachable!(), }; @@ -2416,9 +2362,6 @@ impl RemoteTimelineClient { visible_remote_consistent_lsn: initialized .visible_remote_consistent_lsn .clone(), - num_inprogress_layer_uploads: 0, - num_inprogress_metadata_uploads: 0, - num_inprogress_deletions: 0, inprogress_tasks: HashMap::default(), queued_operations: VecDeque::default(), #[cfg(feature = "testing")] @@ -2445,14 +2388,6 @@ impl RemoteTimelineClient { } }; - // consistency check - assert_eq!( - qi.num_inprogress_layer_uploads - + qi.num_inprogress_metadata_uploads - + qi.num_inprogress_deletions, - qi.inprogress_tasks.len() - ); - // We don't need to do anything here for in-progress tasks. They will finish // on their own, decrement the unfinished-task counter themselves, and observe // that the queue is Stopped. @@ -2899,8 +2834,8 @@ mod tests { let mut guard = client.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut().unwrap(); assert!(upload_queue.queued_operations.is_empty()); - assert!(upload_queue.inprogress_tasks.len() == 2); - assert!(upload_queue.num_inprogress_layer_uploads == 2); + assert_eq!(upload_queue.inprogress_tasks.len(), 2); + assert_eq!(upload_queue.num_inprogress_layer_uploads(), 2); // also check that `latest_file_changes` was updated assert!(upload_queue.latest_files_changes_since_metadata_upload_scheduled == 2); @@ -2970,8 +2905,8 @@ mod tests { // Deletion schedules upload of the index file, and the file deletion itself assert_eq!(upload_queue.queued_operations.len(), 2); assert_eq!(upload_queue.inprogress_tasks.len(), 1); - assert_eq!(upload_queue.num_inprogress_layer_uploads, 1); - assert_eq!(upload_queue.num_inprogress_deletions, 0); + assert_eq!(upload_queue.num_inprogress_layer_uploads(), 1); + assert_eq!(upload_queue.num_inprogress_deletions(), 0); assert_eq!( upload_queue.latest_files_changes_since_metadata_upload_scheduled, 0 diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 51f093cb87..244be5bbb7 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -104,7 +104,7 @@ impl IndexPart { pub const FILE_NAME: &'static str = "index_part.json"; - pub(crate) fn empty(metadata: TimelineMetadata) -> Self { + pub fn empty(metadata: TimelineMetadata) -> Self { IndexPart { version: Self::LATEST_VERSION, layer_metadata: Default::default(), diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 8933e8ceb1..2b06c88e8b 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -1812,7 +1812,7 @@ enum LayerKind { /// Guard for forcing a layer be resident while it exists. 
#[derive(Clone)] -pub(crate) struct ResidentLayer { +pub struct ResidentLayer { owner: Layer, downloaded: Arc, } diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs index ef3aa759f3..bd524e8153 100644 --- a/pageserver/src/tenant/upload_queue.rs +++ b/pageserver/src/tenant/upload_queue.rs @@ -1,28 +1,33 @@ +use std::collections::{HashMap, HashSet, VecDeque}; +use std::fmt::Debug; +use std::sync::atomic::AtomicU32; +use std::sync::Arc; + +use super::remote_timeline_client::is_same_remote_layer_path; +use super::storage_layer::AsLayerDesc as _; use super::storage_layer::LayerName; use super::storage_layer::ResidentLayer; use crate::tenant::metadata::TimelineMetadata; use crate::tenant::remote_timeline_client::index::IndexPart; use crate::tenant::remote_timeline_client::index::LayerFileMetadata; -use std::collections::HashSet; -use std::collections::{HashMap, VecDeque}; -use std::fmt::Debug; +use utils::generation::Generation; +use utils::lsn::{AtomicLsn, Lsn}; use chrono::NaiveDateTime; -use std::sync::Arc; +use once_cell::sync::Lazy; use tracing::info; -use utils::lsn::AtomicLsn; -use std::sync::atomic::AtomicU32; -use utils::lsn::Lsn; - -use utils::generation::Generation; +/// Kill switch for upload queue reordering in case it causes problems. +/// TODO: remove this once we have confidence in it. +static DISABLE_UPLOAD_QUEUE_REORDERING: Lazy = + Lazy::new(|| std::env::var("DISABLE_UPLOAD_QUEUE_REORDERING").as_deref() == Ok("true")); // clippy warns that Uninitialized is much smaller than Initialized, which wastes // memory for Uninitialized variants. Doesn't matter in practice, there are not // that many upload queues in a running pageserver, and most of them are initialized // anyway. #[allow(clippy::large_enum_variant)] -pub(super) enum UploadQueue { +pub enum UploadQueue { Uninitialized, Initialized(UploadQueueInitialized), Stopped(UploadQueueStopped), @@ -39,13 +44,13 @@ impl UploadQueue { } #[derive(Copy, Clone, PartialEq, Eq, Hash, Debug)] -pub(crate) enum OpType { +pub enum OpType { MayReorder, FlushDeletion, } /// This keeps track of queued and in-progress tasks. -pub(crate) struct UploadQueueInitialized { +pub struct UploadQueueInitialized { /// Counter to assign task IDs pub(crate) task_counter: u64, @@ -70,21 +75,16 @@ pub(crate) struct UploadQueueInitialized { /// we skip validation) pub(crate) visible_remote_consistent_lsn: Arc, - // Breakdown of different kinds of tasks currently in-progress - pub(crate) num_inprogress_layer_uploads: usize, - pub(crate) num_inprogress_metadata_uploads: usize, - pub(crate) num_inprogress_deletions: usize, - /// Tasks that are currently in-progress. In-progress means that a tokio Task /// has been launched for it. An in-progress task can be busy uploading, but it can /// also be waiting on the `concurrency_limiter` Semaphore in S3Bucket, or it can /// be waiting for retry in `exponential_backoff`. - pub(crate) inprogress_tasks: HashMap>, + pub inprogress_tasks: HashMap>, /// Queued operations that have not been launched yet. They might depend on previous /// tasks to finish. For example, metadata upload cannot be performed before all /// preceding layer file uploads have completed. - pub(crate) queued_operations: VecDeque, + pub queued_operations: VecDeque, /// Files which have been unlinked but not yet had scheduled a deletion for. Only kept around /// for error logging. 
@@ -122,6 +122,129 @@ impl UploadQueueInitialized { let lsn = self.clean.0.metadata.disk_consistent_lsn(); self.clean.1.map(|_| lsn) } + + /// Returns and removes the next ready operation from the queue, if any. This isn't necessarily + /// the first operation in the queue, to avoid head-of-line blocking -- an operation can jump + /// the queue if it doesn't conflict with operations ahead of it. + /// + /// None may be returned even if the queue isn't empty, if no operations are ready yet. + pub fn next_ready(&mut self) -> Option { + // NB: this is quadratic, but queues are expected to be small. + for (i, candidate) in self.queued_operations.iter().enumerate() { + // If this candidate is ready, go for it. Otherwise, try the next one. + if self.is_ready(i) { + // Shutdown operations are left at the head of the queue, to prevent further + // operations from starting. Signal that we're ready to shut down. + if matches!(candidate, UploadOp::Shutdown) { + assert!(self.inprogress_tasks.is_empty(), "shutdown with tasks"); + assert_eq!(i, 0, "shutdown not at head of queue"); + self.shutdown_ready.close(); + return None; + } + + return self.queued_operations.remove(i); + } + + // Nothing can bypass a barrier or shutdown. If it wasn't scheduled above, give up. + if matches!(candidate, UploadOp::Barrier(_) | UploadOp::Shutdown) { + return None; + } + + // If upload queue reordering is disabled, bail out after the first operation. + if *DISABLE_UPLOAD_QUEUE_REORDERING { + return None; + } + } + None + } + + /// Returns true if the queued operation at the given position is ready to be uploaded, i.e. if + /// it doesn't conflict with any in-progress or queued operations ahead of it. Operations are + /// allowed to skip the queue when it's safe to do so, to increase parallelism. + /// + /// The position must be valid for the queue size. + fn is_ready(&self, pos: usize) -> bool { + let candidate = self.queued_operations.get(pos).expect("invalid position"); + self + // Look at in-progress operations, in random order. + .inprogress_tasks + .values() + .map(|task| &task.op) + // Then queued operations ahead of the candidate, front-to-back. + .chain(self.queued_operations.iter().take(pos)) + // Keep track of the active index ahead of each operation. This is used to ensure that + // an upload doesn't skip the queue too far, such that it modifies a layer that's + // referenced by an active index. + // + // It's okay that in-progress operations are emitted in random order above, since at + // most one of them can be an index upload (enforced by can_bypass). + .scan(&self.clean.0, |next_active_index, op| { + let active_index = *next_active_index; + if let UploadOp::UploadMetadata { ref uploaded } = op { + *next_active_index = uploaded; // stash index for next operation after this + } + Some((op, active_index)) + }) + // Check if the candidate can bypass all of them. + .all(|(op, active_index)| candidate.can_bypass(op, active_index)) + } + + /// Returns the number of in-progress deletion operations. + #[cfg(test)] + pub(crate) fn num_inprogress_deletions(&self) -> usize { + self.inprogress_tasks + .iter() + .filter(|(_, t)| matches!(t.op, UploadOp::Delete(_))) + .count() + } + + /// Returns the number of in-progress layer uploads. 
+ #[cfg(test)] + pub(crate) fn num_inprogress_layer_uploads(&self) -> usize { + self.inprogress_tasks + .iter() + .filter(|(_, t)| matches!(t.op, UploadOp::UploadLayer(_, _, _))) + .count() + } + + /// Test helper that schedules all ready operations into inprogress_tasks, and returns + /// references to them. + /// + /// TODO: the corresponding production logic should be moved from RemoteTimelineClient into + /// UploadQueue, so we can use the same code path. + #[cfg(test)] + fn schedule_ready(&mut self) -> Vec> { + let mut tasks = Vec::new(); + // NB: schedule operations one by one, to handle conflicts with inprogress_tasks. + while let Some(op) = self.next_ready() { + self.task_counter += 1; + let task = Arc::new(UploadTask { + task_id: self.task_counter, + op, + retries: 0.into(), + }); + self.inprogress_tasks.insert(task.task_id, task.clone()); + tasks.push(task); + } + tasks + } + + /// Test helper that marks an operation as completed, removing it from inprogress_tasks. + /// + /// TODO: the corresponding production logic should be moved from RemoteTimelineClient into + /// UploadQueue, so we can use the same code path. + #[cfg(test)] + fn complete(&mut self, task_id: u64) { + let Some(task) = self.inprogress_tasks.remove(&task_id) else { + return; + }; + // Update the clean index on uploads. + if let UploadOp::UploadMetadata { ref uploaded } = task.op { + if task.task_id > self.clean.1.unwrap_or_default() { + self.clean = (*uploaded.clone(), Some(task.task_id)); + } + } + } } #[derive(Clone, Copy)] @@ -131,12 +254,12 @@ pub(super) enum SetDeletedFlagProgress { Successful(NaiveDateTime), } -pub(super) struct UploadQueueStoppedDeletable { +pub struct UploadQueueStoppedDeletable { pub(super) upload_queue_for_deletion: UploadQueueInitialized, pub(super) deleted_at: SetDeletedFlagProgress, } -pub(super) enum UploadQueueStopped { +pub enum UploadQueueStopped { Deletable(UploadQueueStoppedDeletable), Uninitialized, } @@ -163,7 +286,7 @@ impl NotInitialized { } impl UploadQueue { - pub(crate) fn initialize_empty_remote( + pub fn initialize_empty_remote( &mut self, metadata: &TimelineMetadata, ) -> anyhow::Result<&mut UploadQueueInitialized> { @@ -185,9 +308,6 @@ impl UploadQueue { visible_remote_consistent_lsn: Arc::new(AtomicLsn::new(0)), // what follows are boring default initializations task_counter: 0, - num_inprogress_layer_uploads: 0, - num_inprogress_metadata_uploads: 0, - num_inprogress_deletions: 0, inprogress_tasks: HashMap::new(), queued_operations: VecDeque::new(), #[cfg(feature = "testing")] @@ -202,7 +322,7 @@ impl UploadQueue { Ok(self.initialized_mut().expect("we just set it")) } - pub(crate) fn initialize_with_current_remote_index_part( + pub fn initialize_with_current_remote_index_part( &mut self, index_part: &IndexPart, ) -> anyhow::Result<&mut UploadQueueInitialized> { @@ -227,9 +347,6 @@ impl UploadQueue { ), // what follows are boring default initializations task_counter: 0, - num_inprogress_layer_uploads: 0, - num_inprogress_metadata_uploads: 0, - num_inprogress_deletions: 0, inprogress_tasks: HashMap::new(), queued_operations: VecDeque::new(), #[cfg(feature = "testing")] @@ -244,9 +361,7 @@ impl UploadQueue { Ok(self.initialized_mut().expect("we just set it")) } - pub(crate) fn initialized_mut( - &mut self, - ) -> Result<&mut UploadQueueInitialized, NotInitialized> { + pub fn initialized_mut(&mut self) -> Result<&mut UploadQueueInitialized, NotInitialized> { use UploadQueue::*; match self { Uninitialized => Err(NotInitialized::Uninitialized), @@ -276,23 +391,23 @@ 
impl UploadQueue { /// An in-progress upload or delete task. #[derive(Debug)] -pub(crate) struct UploadTask { +pub struct UploadTask { /// Unique ID of this task. Used as the key in `inprogress_tasks` above. - pub(crate) task_id: u64, - pub(crate) retries: AtomicU32, + pub task_id: u64, + pub retries: AtomicU32, - pub(crate) op: UploadOp, + pub op: UploadOp, } /// A deletion of some layers within the lifetime of a timeline. This is not used /// for timeline deletion, which skips this queue and goes directly to DeletionQueue. #[derive(Debug, Clone)] -pub(crate) struct Delete { - pub(crate) layers: Vec<(LayerName, LayerFileMetadata)>, +pub struct Delete { + pub layers: Vec<(LayerName, LayerFileMetadata)>, } -#[derive(Debug)] -pub(crate) enum UploadOp { +#[derive(Clone, Debug)] +pub enum UploadOp { /// Upload a layer file. The last field indicates the last operation for thie file. UploadLayer(ResidentLayer, LayerFileMetadata, Option), @@ -338,3 +453,765 @@ impl std::fmt::Display for UploadOp { } } } + +impl UploadOp { + /// Returns true if self can bypass other, i.e. if the operations don't conflict. index is the + /// active index when other would be uploaded -- if we allow self to bypass other, this would + /// be the active index when self is uploaded. + pub fn can_bypass(&self, other: &UploadOp, index: &IndexPart) -> bool { + match (self, other) { + // Nothing can bypass a barrier or shutdown, and it can't bypass anything. + (UploadOp::Barrier(_), _) | (_, UploadOp::Barrier(_)) => false, + (UploadOp::Shutdown, _) | (_, UploadOp::Shutdown) => false, + + // Uploads and deletes can bypass each other unless they're for the same file. + (UploadOp::UploadLayer(a, ameta, _), UploadOp::UploadLayer(b, bmeta, _)) => { + let aname = &a.layer_desc().layer_name(); + let bname = &b.layer_desc().layer_name(); + !is_same_remote_layer_path(aname, ameta, bname, bmeta) + } + (UploadOp::UploadLayer(u, umeta, _), UploadOp::Delete(d)) + | (UploadOp::Delete(d), UploadOp::UploadLayer(u, umeta, _)) => { + d.layers.iter().all(|(dname, dmeta)| { + !is_same_remote_layer_path(&u.layer_desc().layer_name(), umeta, dname, dmeta) + }) + } + + // Deletes are idempotent and can always bypass each other. + (UploadOp::Delete(_), UploadOp::Delete(_)) => true, + + // Uploads and deletes can bypass an index upload as long as neither the uploaded index + // nor the active index below it references the file. A layer can't be modified or + // deleted while referenced by an index. + // + // Similarly, index uploads can bypass uploads and deletes as long as neither the + // uploaded index nor the active index references the file (the latter would be + // incorrect use by the caller). + (UploadOp::UploadLayer(u, umeta, _), UploadOp::UploadMetadata { uploaded: i }) + | (UploadOp::UploadMetadata { uploaded: i }, UploadOp::UploadLayer(u, umeta, _)) => { + let uname = u.layer_desc().layer_name(); + !i.references(&uname, umeta) && !index.references(&uname, umeta) + } + (UploadOp::Delete(d), UploadOp::UploadMetadata { uploaded: i }) + | (UploadOp::UploadMetadata { uploaded: i }, UploadOp::Delete(d)) => { + d.layers.iter().all(|(dname, dmeta)| { + !i.references(dname, dmeta) && !index.references(dname, dmeta) + }) + } + + // Indexes can never bypass each other. + // TODO: we could coalesce them though, by only uploading the newest ready index. This + // is left for later, out of caution. + (UploadOp::UploadMetadata { .. }, UploadOp::UploadMetadata { .. 
}) => false, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::tenant::harness::{TenantHarness, TIMELINE_ID}; + use crate::tenant::storage_layer::layer::local_layer_path; + use crate::tenant::storage_layer::Layer; + use crate::tenant::Timeline; + use crate::DEFAULT_PG_VERSION; + use itertools::Itertools as _; + use std::str::FromStr as _; + use utils::shard::{ShardCount, ShardIndex, ShardNumber}; + + /// Test helper which asserts that two operations are the same, in lieu of UploadOp PartialEq. + #[track_caller] + fn assert_same_op(a: &UploadOp, b: &UploadOp) { + use UploadOp::*; + match (a, b) { + (UploadLayer(a, ameta, atype), UploadLayer(b, bmeta, btype)) => { + assert_eq!(a.layer_desc().layer_name(), b.layer_desc().layer_name()); + assert_eq!(ameta, bmeta); + assert_eq!(atype, btype); + } + (Delete(a), Delete(b)) => assert_eq!(a.layers, b.layers), + (UploadMetadata { uploaded: a }, UploadMetadata { uploaded: b }) => assert_eq!(a, b), + (Barrier(_), Barrier(_)) => {} + (Shutdown, Shutdown) => {} + (a, b) => panic!("{a:?} != {b:?}"), + } + } + + /// Test helper which asserts that two sets of operations are the same. + #[track_caller] + fn assert_same_ops<'a>( + a: impl IntoIterator, + b: impl IntoIterator, + ) { + a.into_iter() + .zip_eq(b) + .for_each(|(a, b)| assert_same_op(a, b)) + } + + /// Test helper to construct a test timeline. + /// + /// TODO: it really shouldn't be necessary to construct an entire tenant and timeline just to + /// test the upload queue -- decouple ResidentLayer from Timeline. + /// + /// TODO: the upload queue uses TimelineMetadata::example() instead, because there's no way to + /// obtain a TimelineMetadata from a Timeline. + fn make_timeline() -> Arc { + // Grab the current test name from the current thread name. + // TODO: TenantHarness shouldn't take a &'static str, but just leak the test name for now. + let test_name = std::thread::current().name().unwrap().to_string(); + let test_name = Box::leak(test_name.into_boxed_str()); + + let runtime = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .expect("failed to create runtime"); + + runtime + .block_on(async { + let harness = TenantHarness::create(test_name).await?; + let (tenant, ctx) = harness.load().await; + tenant + .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx) + .await + }) + .expect("failed to create timeline") + } + + /// Test helper to construct an (empty) resident layer. + fn make_layer(timeline: &Arc, name: &str) -> ResidentLayer { + make_layer_with_size(timeline, name, 0) + } + + /// Test helper to construct a resident layer with the given size. + fn make_layer_with_size(timeline: &Arc, name: &str, size: usize) -> ResidentLayer { + let metadata = LayerFileMetadata { + generation: timeline.generation, + shard: timeline.get_shard_index(), + file_size: size as u64, + }; + make_layer_with_metadata(timeline, name, metadata) + } + + /// Test helper to construct a layer with the given metadata. 
+ fn make_layer_with_metadata( + timeline: &Arc, + name: &str, + metadata: LayerFileMetadata, + ) -> ResidentLayer { + let name = LayerName::from_str(name).expect("invalid name"); + let local_path = local_layer_path( + timeline.conf, + &timeline.tenant_shard_id, + &timeline.timeline_id, + &name, + &metadata.generation, + ); + std::fs::write(&local_path, vec![0; metadata.file_size as usize]) + .expect("failed to write file"); + Layer::for_resident(timeline.conf, timeline, local_path, name, metadata) + } + + /// Test helper to add a layer to an index and return a new index. + fn index_with(index: &IndexPart, layer: &ResidentLayer) -> Box { + let mut index = index.clone(); + index + .layer_metadata + .insert(layer.layer_desc().layer_name(), layer.metadata()); + Box::new(index) + } + + /// Test helper to remove a layer from an index and return a new index. + fn index_without(index: &IndexPart, layer: &ResidentLayer) -> Box { + let mut index = index.clone(); + index + .layer_metadata + .remove(&layer.layer_desc().layer_name()); + Box::new(index) + } + + /// Nothing can bypass a barrier, and it can't bypass inprogress tasks. + #[test] + fn schedule_barrier() -> anyhow::Result<()> { + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_empty_remote(&TimelineMetadata::example())?; + let tli = make_timeline(); + + let index = Box::new(queue.clean.0.clone()); // empty, doesn't matter + let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer3 = make_layer(&tli, "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let (barrier, _) = tokio::sync::watch::channel(()); + + // Enqueue non-conflicting upload, delete, and index before and after a barrier. + let ops = [ + UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None), + UploadOp::Delete(Delete { + layers: vec![(layer1.layer_desc().layer_name(), layer1.metadata())], + }), + UploadOp::UploadMetadata { + uploaded: index.clone(), + }, + UploadOp::Barrier(barrier), + UploadOp::UploadLayer(layer2.clone(), layer2.metadata(), None), + UploadOp::Delete(Delete { + layers: vec![(layer3.layer_desc().layer_name(), layer3.metadata())], + }), + UploadOp::UploadMetadata { + uploaded: index.clone(), + }, + ]; + + queue.queued_operations.extend(ops.clone()); + + // Schedule the initial operations ahead of the barrier. + let tasks = queue.schedule_ready(); + + assert_same_ops(tasks.iter().map(|t| &t.op), &ops[0..3]); + assert!(matches!( + queue.queued_operations.front(), + Some(&UploadOp::Barrier(_)) + )); + + // Complete the initial operations. The barrier isn't scheduled while they're pending. + for task in tasks { + assert!(queue.schedule_ready().is_empty()); + queue.complete(task.task_id); + } + + // Schedule the barrier. The later tasks won't schedule until it completes. + let tasks = queue.schedule_ready(); + + assert_eq!(tasks.len(), 1); + assert!(matches!(tasks[0].op, UploadOp::Barrier(_))); + assert_eq!(queue.queued_operations.len(), 3); + + // Complete the barrier. The rest of the tasks schedule immediately. 
+ queue.complete(tasks[0].task_id); + + let tasks = queue.schedule_ready(); + assert_same_ops(tasks.iter().map(|t| &t.op), &ops[4..]); + assert!(queue.queued_operations.is_empty()); + + Ok(()) + } + + /// Deletes can be scheduled in parallel, even if they're for the same file. + #[test] + fn schedule_delete_parallel() -> anyhow::Result<()> { + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_empty_remote(&TimelineMetadata::example())?; + let tli = make_timeline(); + + // Enqueue a bunch of deletes, some with conflicting names. + let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer3 = make_layer(&tli, "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + + let ops = [ + UploadOp::Delete(Delete { + layers: vec![(layer0.layer_desc().layer_name(), layer0.metadata())], + }), + UploadOp::Delete(Delete { + layers: vec![(layer1.layer_desc().layer_name(), layer1.metadata())], + }), + UploadOp::Delete(Delete { + layers: vec![ + (layer1.layer_desc().layer_name(), layer1.metadata()), + (layer2.layer_desc().layer_name(), layer2.metadata()), + ], + }), + UploadOp::Delete(Delete { + layers: vec![(layer2.layer_desc().layer_name(), layer2.metadata())], + }), + UploadOp::Delete(Delete { + layers: vec![(layer3.layer_desc().layer_name(), layer3.metadata())], + }), + ]; + + queue.queued_operations.extend(ops.clone()); + + // Schedule all ready operations. Since deletes don't conflict, they're all scheduled. + let tasks = queue.schedule_ready(); + + assert_same_ops(tasks.iter().map(|t| &t.op), &ops); + assert!(queue.queued_operations.is_empty()); + + Ok(()) + } + + /// Conflicting uploads are serialized. + #[test] + fn schedule_upload_conflicts() -> anyhow::Result<()> { + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example())?; + let tli = make_timeline(); + + // Enqueue three versions of the same layer, with different file sizes. + let layer0a = make_layer_with_size(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", 1); + let layer0b = make_layer_with_size(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", 2); + let layer0c = make_layer_with_size(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", 3); + + let ops = [ + UploadOp::UploadLayer(layer0a.clone(), layer0a.metadata(), None), + UploadOp::UploadLayer(layer0b.clone(), layer0b.metadata(), None), + UploadOp::UploadLayer(layer0c.clone(), layer0c.metadata(), None), + ]; + + queue.queued_operations.extend(ops.clone()); + + // Only one version should be scheduled and uploaded at a time. + for op in ops { + let tasks = queue.schedule_ready(); + assert_eq!(tasks.len(), 1); + assert_same_op(&tasks[0].op, &op); + queue.complete(tasks[0].task_id); + } + assert!(queue.schedule_ready().is_empty()); + assert!(queue.queued_operations.is_empty()); + + Ok(()) + } + + /// Conflicting uploads and deletes are serialized. 
+ #[test] + fn schedule_upload_delete_conflicts() -> anyhow::Result<()> { + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example())?; + let tli = make_timeline(); + + // Enqueue two layer uploads, with a delete of both layers in between them. These should be + // scheduled one at a time, since deletes can't bypass uploads and vice versa. + let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + + let ops = [ + UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None), + UploadOp::Delete(Delete { + layers: vec![ + (layer0.layer_desc().layer_name(), layer0.metadata()), + (layer1.layer_desc().layer_name(), layer1.metadata()), + ], + }), + UploadOp::UploadLayer(layer1.clone(), layer1.metadata(), None), + ]; + + queue.queued_operations.extend(ops.clone()); + + // Only one version should be scheduled and uploaded at a time. + for op in ops { + let tasks = queue.schedule_ready(); + assert_eq!(tasks.len(), 1); + assert_same_op(&tasks[0].op, &op); + queue.complete(tasks[0].task_id); + } + assert!(queue.schedule_ready().is_empty()); + assert!(queue.queued_operations.is_empty()); + + Ok(()) + } + + /// Non-conflicting uploads and deletes can bypass the queue, avoiding the conflicting + /// delete/upload operations at the head of the queue. + #[test] + fn schedule_upload_delete_conflicts_bypass() -> anyhow::Result<()> { + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example())?; + let tli = make_timeline(); + + // Enqueue two layer uploads, with a delete of both layers in between them. These should be + // scheduled one at a time, since deletes can't bypass uploads and vice versa. + // + // Also enqueue non-conflicting uploads and deletes at the end. These can bypass the queue + // and run immediately. + let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer3 = make_layer(&tli, "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + + let ops = [ + UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None), + UploadOp::Delete(Delete { + layers: vec![ + (layer0.layer_desc().layer_name(), layer0.metadata()), + (layer1.layer_desc().layer_name(), layer1.metadata()), + ], + }), + UploadOp::UploadLayer(layer1.clone(), layer1.metadata(), None), + UploadOp::UploadLayer(layer2.clone(), layer2.metadata(), None), + UploadOp::Delete(Delete { + layers: vec![(layer3.layer_desc().layer_name(), layer3.metadata())], + }), + ]; + + queue.queued_operations.extend(ops.clone()); + + // Operations 0, 3, and 4 are scheduled immediately. + let tasks = queue.schedule_ready(); + assert_same_ops(tasks.iter().map(|t| &t.op), [&ops[0], &ops[3], &ops[4]]); + assert_eq!(queue.queued_operations.len(), 2); + + Ok(()) + } + + /// Non-conflicting uploads are parallelized. 
+ #[test] + fn schedule_upload_parallel() -> anyhow::Result<()> { + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example())?; + let tli = make_timeline(); + + // Enqueue three different layer uploads. + let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + + let ops = [ + UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None), + UploadOp::UploadLayer(layer1.clone(), layer1.metadata(), None), + UploadOp::UploadLayer(layer2.clone(), layer2.metadata(), None), + ]; + + queue.queued_operations.extend(ops.clone()); + + // All uploads should be scheduled concurrently. + let tasks = queue.schedule_ready(); + + assert_same_ops(tasks.iter().map(|t| &t.op), &ops); + assert!(queue.queued_operations.is_empty()); + + Ok(()) + } + + /// Index uploads are serialized. + #[test] + fn schedule_index_serial() -> anyhow::Result<()> { + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example())?; + + // Enqueue three uploads of the current empty index. + let index = Box::new(queue.clean.0.clone()); + + let ops = [ + UploadOp::UploadMetadata { + uploaded: index.clone(), + }, + UploadOp::UploadMetadata { + uploaded: index.clone(), + }, + UploadOp::UploadMetadata { + uploaded: index.clone(), + }, + ]; + + queue.queued_operations.extend(ops.clone()); + + // The uploads should run serially. + for op in ops { + let tasks = queue.schedule_ready(); + assert_eq!(tasks.len(), 1); + assert_same_op(&tasks[0].op, &op); + queue.complete(tasks[0].task_id); + } + + assert!(queue.queued_operations.is_empty()); + + Ok(()) + } + + /// Chains of upload/index operations lead to parallel layer uploads and serial index uploads. + /// This is the common case with layer flushes. + #[test] + fn schedule_index_upload_chain() -> anyhow::Result<()> { + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example())?; + let tli = make_timeline(); + + // Enqueue three uploads of the current empty index. 
+ let index = Box::new(queue.clean.0.clone()); + let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let index0 = index_with(&index, &layer0); + let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let index1 = index_with(&index0, &layer1); + let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let index2 = index_with(&index1, &layer2); + + let ops = [ + UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None), + UploadOp::UploadMetadata { + uploaded: index0.clone(), + }, + UploadOp::UploadLayer(layer1.clone(), layer1.metadata(), None), + UploadOp::UploadMetadata { + uploaded: index1.clone(), + }, + UploadOp::UploadLayer(layer2.clone(), layer2.metadata(), None), + UploadOp::UploadMetadata { + uploaded: index2.clone(), + }, + ]; + + queue.queued_operations.extend(ops.clone()); + + // The layer uploads should be scheduled immediately. The indexes must wait. + let upload_tasks = queue.schedule_ready(); + assert_same_ops( + upload_tasks.iter().map(|t| &t.op), + [&ops[0], &ops[2], &ops[4]], + ); + + // layer2 completes first. None of the indexes can upload yet. + queue.complete(upload_tasks[2].task_id); + assert!(queue.schedule_ready().is_empty()); + + // layer0 completes. index0 can upload. It completes. + queue.complete(upload_tasks[0].task_id); + let index_tasks = queue.schedule_ready(); + assert_eq!(index_tasks.len(), 1); + assert_same_op(&index_tasks[0].op, &ops[1]); + queue.complete(index_tasks[0].task_id); + + // layer 1 completes. This unblocks index 1 then index 2. + queue.complete(upload_tasks[1].task_id); + + let index_tasks = queue.schedule_ready(); + assert_eq!(index_tasks.len(), 1); + assert_same_op(&index_tasks[0].op, &ops[3]); + queue.complete(index_tasks[0].task_id); + + let index_tasks = queue.schedule_ready(); + assert_eq!(index_tasks.len(), 1); + assert_same_op(&index_tasks[0].op, &ops[5]); + queue.complete(index_tasks[0].task_id); + + assert!(queue.queued_operations.is_empty()); + + Ok(()) + } + + /// A delete can't bypass an index upload if an index ahead of it still references it. + #[test] + fn schedule_index_delete_dereferenced() -> anyhow::Result<()> { + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example())?; + let tli = make_timeline(); + + // Create a layer to upload. + let layer = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let index_upload = index_with(&queue.clean.0, &layer); + + // Remove the layer reference in a new index, then delete the layer. + let index_deref = index_without(&index_upload, &layer); + + let ops = [ + // Initial upload. + UploadOp::UploadLayer(layer.clone(), layer.metadata(), None), + UploadOp::UploadMetadata { + uploaded: index_upload.clone(), + }, + // Dereference the layer and delete it. + UploadOp::UploadMetadata { + uploaded: index_deref.clone(), + }, + UploadOp::Delete(Delete { + layers: vec![(layer.layer_desc().layer_name(), layer.metadata())], + }), + ]; + + queue.queued_operations.extend(ops.clone()); + + // Operations are serialized. 
+ for op in ops { + let tasks = queue.schedule_ready(); + assert_eq!(tasks.len(), 1); + assert_same_op(&tasks[0].op, &op); + queue.complete(tasks[0].task_id); + } + assert!(queue.queued_operations.is_empty()); + + Ok(()) + } + + /// An upload with a reused layer name doesn't clobber the previous layer. Specifically, a + /// dereference/upload/reference cycle can't allow the upload to bypass the reference. + #[test] + fn schedule_index_upload_dereferenced() -> anyhow::Result<()> { + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example())?; + let tli = make_timeline(); + + // Create a layer to upload. + let layer = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + + // Upload the layer. Then dereference the layer, and upload/reference it again. + let index_upload = index_with(&queue.clean.0, &layer); + let index_deref = index_without(&index_upload, &layer); + let index_ref = index_with(&index_deref, &layer); + + let ops = [ + // Initial upload. + UploadOp::UploadLayer(layer.clone(), layer.metadata(), None), + UploadOp::UploadMetadata { + uploaded: index_upload.clone(), + }, + // Dereference the layer. + UploadOp::UploadMetadata { + uploaded: index_deref.clone(), + }, + // Replace and reference the layer. + UploadOp::UploadLayer(layer.clone(), layer.metadata(), None), + UploadOp::UploadMetadata { + uploaded: index_ref.clone(), + }, + ]; + + queue.queued_operations.extend(ops.clone()); + + // Operations are serialized. + for op in ops { + let tasks = queue.schedule_ready(); + assert_eq!(tasks.len(), 1); + assert_same_op(&tasks[0].op, &op); + queue.complete(tasks[0].task_id); + } + assert!(queue.queued_operations.is_empty()); + + Ok(()) + } + + /// Nothing can bypass a shutdown, and it waits for inprogress tasks. It's never returned from + /// next_ready(), but is left at the head of the queue. + #[test] + fn schedule_shutdown() -> anyhow::Result<()> { + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_empty_remote(&TimelineMetadata::example())?; + let tli = make_timeline(); + + let index = Box::new(queue.clean.0.clone()); // empty, doesn't matter + let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer3 = make_layer(&tli, "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + + // Enqueue non-conflicting upload, delete, and index before and after a shutdown. + let ops = [ + UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None), + UploadOp::Delete(Delete { + layers: vec![(layer1.layer_desc().layer_name(), layer1.metadata())], + }), + UploadOp::UploadMetadata { + uploaded: index.clone(), + }, + UploadOp::Shutdown, + UploadOp::UploadLayer(layer2.clone(), layer2.metadata(), None), + UploadOp::Delete(Delete { + layers: vec![(layer3.layer_desc().layer_name(), layer3.metadata())], + }), + UploadOp::UploadMetadata { + uploaded: index.clone(), + }, + ]; + + queue.queued_operations.extend(ops.clone()); + + // Schedule the initial operations ahead of the shutdown. 
+ let tasks = queue.schedule_ready(); + + assert_same_ops(tasks.iter().map(|t| &t.op), &ops[0..3]); + assert!(matches!( + queue.queued_operations.front(), + Some(&UploadOp::Shutdown) + )); + + // Complete the initial operations. The shutdown isn't triggered while they're pending. + for task in tasks { + assert!(queue.schedule_ready().is_empty()); + queue.complete(task.task_id); + } + + // The shutdown is triggered the next time we try to pull an operation. It isn't returned, + // but is left in the queue. + assert!(!queue.shutdown_ready.is_closed()); + assert!(queue.next_ready().is_none()); + assert!(queue.shutdown_ready.is_closed()); + + Ok(()) + } + + /// Tests that can_bypass takes name, generation and shard index into account for all operations. + #[test] + fn can_bypass_path() -> anyhow::Result<()> { + let tli = make_timeline(); + + let name0 = &"000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"; + let name1 = &"100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"; + + // Asserts that layers a and b either can or can't bypass each other, for all combinations + // of operations (except Delete and UploadMetadata which are special-cased). + #[track_caller] + fn assert_can_bypass(a: ResidentLayer, b: ResidentLayer, can_bypass: bool) { + let index = IndexPart::empty(TimelineMetadata::example()); + for (a, b) in make_ops(a).into_iter().zip(make_ops(b)) { + match (&a, &b) { + // Deletes can always bypass each other. + (UploadOp::Delete(_), UploadOp::Delete(_)) => assert!(a.can_bypass(&b, &index)), + // Indexes can never bypass each other. + (UploadOp::UploadMetadata { .. }, UploadOp::UploadMetadata { .. }) => { + assert!(!a.can_bypass(&b, &index)) + } + // For other operations, assert as requested. + (a, b) => assert_eq!(a.can_bypass(b, &index), can_bypass), + } + } + } + + fn make_ops(layer: ResidentLayer) -> Vec { + let mut index = IndexPart::empty(TimelineMetadata::example()); + index + .layer_metadata + .insert(layer.layer_desc().layer_name(), layer.metadata()); + vec![ + UploadOp::UploadLayer(layer.clone(), layer.metadata(), None), + UploadOp::Delete(Delete { + layers: vec![(layer.layer_desc().layer_name(), layer.metadata())], + }), + UploadOp::UploadMetadata { + uploaded: Box::new(index), + }, + ] + } + + // Makes a ResidentLayer. + let layer = |name: &'static str, shard: Option, generation: u32| -> ResidentLayer { + let shard = shard + .map(|n| ShardIndex::new(ShardNumber(n), ShardCount(8))) + .unwrap_or(ShardIndex::unsharded()); + let metadata = LayerFileMetadata { + shard, + generation: Generation::Valid(generation), + file_size: 0, + }; + make_layer_with_metadata(&tli, name, metadata) + }; + + // Same name and metadata can't bypass. This goes both for unsharded and sharded, as well as + // 0 or >0 generation. + assert_can_bypass(layer(name0, None, 0), layer(name0, None, 0), false); + assert_can_bypass(layer(name0, Some(0), 0), layer(name0, Some(0), 0), false); + assert_can_bypass(layer(name0, None, 1), layer(name0, None, 1), false); + + // Different names can bypass. + assert_can_bypass(layer(name0, None, 0), layer(name1, None, 0), true); + + // Different shards can bypass. Shard 0 is different from unsharded. + assert_can_bypass(layer(name0, Some(0), 0), layer(name0, Some(1), 0), true); + assert_can_bypass(layer(name0, Some(0), 0), layer(name0, None, 0), true); + + // Different generations can bypass, both sharded and unsharded. 
+ assert_can_bypass(layer(name0, None, 0), layer(name0, None, 1), true); + assert_can_bypass(layer(name0, Some(1), 0), layer(name0, Some(1), 1), true); + + Ok(()) + } +} From d36112d20fdc249c324308f82b2b81bb124065d9 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 14 Jan 2025 19:02:35 +0200 Subject: [PATCH 184/583] Simplify compute dockerfile by setting PATH just once (#10357) By setting PATH in the 'pg-build' layer, all the extension build layers will inherit. No need to pass PG_CONFIG to all the various make invocations either: once pg_config is in PATH, the Makefiles will pick it up from there. --- compute/compute-node.Dockerfile | 168 ++++++++++++-------------------- 1 file changed, 63 insertions(+), 105 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 89cee6761f..299f4444a3 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -104,16 +104,18 @@ RUN cd postgres && \ esac; \ done; +# Set PATH for all the subsequent build steps +ENV PATH="/usr/local/pgsql/bin:$PATH" + ######################################################################################### # # Layer "postgis-build" # Build PostGIS from the upstream PostGIS mirror. # ######################################################################################### -FROM build-deps AS postgis-build +FROM pg-build AS postgis-build ARG DEBIAN_VERSION ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ apt install --no-install-recommends --no-install-suggests -y \ gdal-bin libboost-dev libboost-thread-dev libboost-filesystem-dev \ @@ -151,8 +153,6 @@ RUN case "${DEBIAN_VERSION}" in \ DESTDIR=/sfcgal ninja install -j $(getconf _NPROCESSORS_ONLN) && \ ninja clean && cp -R /sfcgal/* / -ENV PATH="/usr/local/pgsql/bin:$PATH" - # Postgis 3.5.0 supports v17 RUN case "${PG_VERSION}" in \ "v17") \ @@ -227,9 +227,8 @@ RUN case "${PG_VERSION}" in \ # Build plv8 # ######################################################################################### -FROM build-deps AS plv8-build +FROM pg-build AS plv8-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY compute/patches/plv8-3.1.10.patch /plv8-3.1.10.patch @@ -264,7 +263,6 @@ RUN case "${PG_VERSION}" in \ # generate and copy upgrade scripts mkdir -p upgrade && ./generate_upgrade.sh ${PLV8_TAG#v} && \ cp upgrade/* /usr/local/pgsql/share/extension/ && \ - export PATH="/usr/local/pgsql/bin:$PATH" && \ make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \ rm -rf /plv8-* && \ find /usr/local/pgsql/ -name "plv8-*.so" | xargs strip && \ @@ -291,9 +289,8 @@ RUN case "${PG_VERSION}" in \ # Build h3_pg # ######################################################################################### -FROM build-deps AS h3-pg-build +FROM pg-build AS h3-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # not version-specific # last release v4.1.0 - Jan 18, 2023 @@ -314,7 +311,6 @@ RUN mkdir -p /h3/usr/ && \ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \ echo "5c17f09a820859ffe949f847bebf1be98511fb8f1bd86f94932512c00479e324 h3-pg.tar.gz" | sha256sum --check && \ mkdir h3-pg-src && cd h3-pg-src && tar xzf ../h3-pg.tar.gz --strip-components=1 -C . 
&& \ - export PATH="/usr/local/pgsql/bin:$PATH" && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control && \ @@ -326,17 +322,16 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3 # compile unit extension # ######################################################################################### -FROM build-deps AS unit-pg-build +FROM pg-build AS unit-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # not version-specific # last release 7.9 - Sep 15, 2024 RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.9.tar.gz -O postgresql-unit.tar.gz && \ echo "e46de6245dcc8b2c2ecf29873dbd43b2b346773f31dd5ce4b8315895a052b456 postgresql-unit.tar.gz" | sha256sum --check && \ mkdir postgresql-unit-src && cd postgresql-unit-src && tar xzf ../postgresql-unit.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ - make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ # unit extension's "create extension" script relies on absolute install path to fill some reference tables. # We move the extension from '/usr/local/pgsql/' to '/usr/local/' after it is build. So we need to adjust the path. # This one-liner removes pgsql/ part of the path. @@ -350,9 +345,8 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.9.tar.gz - # compile pgvector extension # ######################################################################################### -FROM build-deps AS vector-pg-build +FROM pg-build AS vector-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY compute/patches/pgvector.patch /pgvector.patch @@ -366,8 +360,8 @@ RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.8.0.tar.gz -O echo "867a2c328d4928a5a9d6f052cd3bc78c7d60228a9b914ad32aa3db88e9de27b0 pgvector.tar.gz" | sha256sum --check && \ mkdir pgvector-src && cd pgvector-src && tar xzf ../pgvector.tar.gz --strip-components=1 -C . && \ patch -p1 < /pgvector.patch && \ - make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ - make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" && \ + make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control ######################################################################################### @@ -376,16 +370,15 @@ RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.8.0.tar.gz -O # compile pgjwt extension # ######################################################################################### -FROM build-deps AS pgjwt-pg-build +FROM pg-build AS pgjwt-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # not version-specific # doesn't use releases, last commit f3d82fd - Mar 2, 2023 RUN wget https://github.com/michelp/pgjwt/archive/f3d82fd30151e754e19ce5d6a06c71c20689ce3d.tar.gz -O pgjwt.tar.gz && \ echo "dae8ed99eebb7593b43013f6532d772b12dfecd55548d2673f2dfd0163f6d2b9 pgjwt.tar.gz" | sha256sum --check && \ mkdir pgjwt-src && cd pgjwt-src && tar xzf ../pgjwt.tar.gz --strip-components=1 -C . 
&& \ - make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgjwt.control ######################################################################################### @@ -394,17 +387,16 @@ RUN wget https://github.com/michelp/pgjwt/archive/f3d82fd30151e754e19ce5d6a06c71 # compile hypopg extension # ######################################################################################### -FROM build-deps AS hypopg-pg-build +FROM pg-build AS hypopg-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # HypoPG 1.4.1 supports v17 # last release 1.4.1 - Apr 28, 2024 RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.1.tar.gz -O hypopg.tar.gz && \ echo "9afe6357fd389d8d33fad81703038ce520b09275ec00153c6c89282bcdedd6bc hypopg.tar.gz" | sha256sum --check && \ mkdir hypopg-src && cd hypopg-src && tar xzf ../hypopg.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ - make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/hypopg.control ######################################################################################### @@ -413,17 +405,16 @@ RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.1.tar.gz -O hypo # compile pg_hashids extension # ######################################################################################### -FROM build-deps AS pg-hashids-pg-build +FROM pg-build AS pg-hashids-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # not version-specific # last release v1.2.1 -Jan 12, 2018 RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz -O pg_hashids.tar.gz && \ echo "74576b992d9277c92196dd8d816baa2cc2d8046fe102f3dcd7f3c3febed6822a pg_hashids.tar.gz" | sha256sum --check && \ mkdir pg_hashids-src && cd pg_hashids-src && tar xzf ../pg_hashids.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ - make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ + make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \ + make -j $(getconf _NPROCESSORS_ONLN) install USE_PGXS=1 && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_hashids.control ######################################################################################### @@ -432,9 +423,8 @@ RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz # compile rum extension # ######################################################################################### -FROM build-deps AS rum-pg-build +FROM pg-build AS rum-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY compute/patches/rum.patch /rum.patch @@ -445,8 +435,8 @@ RUN wget https://github.com/postgrespro/rum/archive/cb1edffc57736cd2a4455f8d0fea echo "65e0a752e99f4c3226400c9b899f997049e93503db8bf5c8072efa136d32fd83 rum.tar.gz" | sha256sum --check && \ mkdir rum-src && cd rum-src && tar xzf ../rum.tar.gz --strip-components=1 -C . 
&& \ patch -p1 < /rum.patch && \ - make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ - make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ + make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \ + make -j $(getconf _NPROCESSORS_ONLN) install USE_PGXS=1 && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/rum.control ######################################################################################### @@ -455,17 +445,16 @@ RUN wget https://github.com/postgrespro/rum/archive/cb1edffc57736cd2a4455f8d0fea # compile pgTAP extension # ######################################################################################### -FROM build-deps AS pgtap-pg-build +FROM pg-build AS pgtap-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # pgtap 1.3.3 supports v17 # last release v1.3.3 - Apr 8, 2024 RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.3.3.tar.gz -O pgtap.tar.gz && \ echo "325ea79d0d2515bce96bce43f6823dcd3effbd6c54cb2a4d6c2384fffa3a14c7 pgtap.tar.gz" | sha256sum --check && \ mkdir pgtap-src && cd pgtap-src && tar xzf ../pgtap.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ - make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgtap.control ######################################################################################### @@ -474,17 +463,16 @@ RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.3.3.tar.gz -O pgta # compile ip4r extension # ######################################################################################### -FROM build-deps AS ip4r-pg-build +FROM pg-build AS ip4r-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # not version-specific # last release v2.4.2 - Jul 29, 2023 RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O ip4r.tar.gz && \ echo "0f7b1f159974f49a47842a8ab6751aecca1ed1142b6d5e38d81b064b2ead1b4b ip4r.tar.gz" | sha256sum --check && \ mkdir ip4r-src && cd ip4r-src && tar xzf ../ip4r.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ - make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/ip4r.control ######################################################################################### @@ -493,17 +481,16 @@ RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O i # compile Prefix extension # ######################################################################################### -FROM build-deps AS prefix-pg-build +FROM pg-build AS prefix-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # not version-specific # last release v1.2.10 - Jul 5, 2023 RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O prefix.tar.gz && \ echo "4342f251432a5f6fb05b8597139d3ccde8dcf87e8ca1498e7ee931ca057a8575 prefix.tar.gz" | sha256sum --check && \ mkdir prefix-src && cd prefix-src && tar xzf ../prefix.tar.gz --strip-components=1 -C . 
&& \ - make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ - make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/prefix.control ######################################################################################### @@ -512,17 +499,16 @@ RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O p # compile hll extension # ######################################################################################### -FROM build-deps AS hll-pg-build +FROM pg-build AS hll-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # not version-specific # last release v2.18 - Aug 29, 2023 RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar.gz -O hll.tar.gz && \ echo "e2f55a6f4c4ab95ee4f1b4a2b73280258c5136b161fe9d059559556079694f0e hll.tar.gz" | sha256sum --check && \ mkdir hll-src && cd hll-src && tar xzf ../hll.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ - make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/hll.control ######################################################################################### @@ -531,17 +517,16 @@ RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar # compile plpgsql_check extension # ######################################################################################### -FROM build-deps AS plpgsql-check-pg-build +FROM pg-build AS plpgsql-check-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # plpgsql_check v2.7.11 supports v17 # last release v2.7.11 - Sep 16, 2024 RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.7.11.tar.gz -O plpgsql_check.tar.gz && \ echo "208933f8dbe8e0d2628eb3851e9f52e6892b8e280c63700c0f1ce7883625d172 plpgsql_check.tar.gz" | sha256sum --check && \ mkdir plpgsql_check-src && cd plpgsql_check-src && tar xzf ../plpgsql_check.tar.gz --strip-components=1 -C . 
&& \ - make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ - make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ + make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \ + make -j $(getconf _NPROCESSORS_ONLN) install USE_PGXS=1 && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plpgsql_check.control ######################################################################################### @@ -550,11 +535,8 @@ RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.7.11.tar.gz # compile timescaledb extension # ######################################################################################### -FROM build-deps AS timescaledb-pg-build -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ - +FROM pg-build AS timescaledb-pg-build ARG PG_VERSION -ENV PATH="/usr/local/pgsql/bin:$PATH" RUN case "${PG_VERSION}" in \ "v14" | "v15") \ @@ -585,11 +567,8 @@ RUN case "${PG_VERSION}" in \ # compile pg_hint_plan extension # ######################################################################################### -FROM build-deps AS pg-hint-plan-pg-build -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ - +FROM pg-build AS pg-hint-plan-pg-build ARG PG_VERSION -ENV PATH="/usr/local/pgsql/bin:$PATH" # version-specific, has separate releases for each version RUN case "${PG_VERSION}" in \ @@ -627,14 +606,12 @@ RUN case "${PG_VERSION}" in \ # compile pg_cron extension # ######################################################################################### -FROM build-deps AS pg-cron-pg-build +FROM pg-build AS pg-cron-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # This is an experimental extension that we do not support on prod yet. # !Do not remove! # We set it in shared_preload_libraries and computes will fail to start if library is not found. -ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.4.tar.gz -O pg_cron.tar.gz && \ echo "52d1850ee7beb85a4cb7185731ef4e5a90d1de216709d8988324b0d02e76af61 pg_cron.tar.gz" | sha256sum --check && \ mkdir pg_cron-src && cd pg_cron-src && tar xzf ../pg_cron.tar.gz --strip-components=1 -C . && \ @@ -648,9 +625,8 @@ RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.4.tar.gz -O # compile rdkit extension # ######################################################################################### -FROM build-deps AS rdkit-pg-build +FROM pg-build AS rdkit-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ apt install --no-install-recommends --no-install-suggests -y \ @@ -668,7 +644,13 @@ RUN apt update && \ # Use new version only for v17 # because Release_2024_09_1 has some backward incompatible changes # https://github.com/rdkit/rdkit/releases/tag/Release_2024_09_1 -ENV PATH="/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH" + +# XXX: /usr/local/pgsql/bin is already in PATH, and that should be enough to find +# pg_config. For some reason the rdkit cmake script doesn't work with just that, +# however. By also adding /usr/local/pgsql, it works, which is weird because there +# are no executables in that directory. 
+ENV PATH="/usr/local/pgsql:$PATH" + RUN case "${PG_VERSION}" in \ "v17") \ export RDKIT_VERSION=Release_2024_09_1 \ @@ -721,13 +703,11 @@ RUN case "${PG_VERSION}" in \ # compile pg_uuidv7 extension # ######################################################################################### -FROM build-deps AS pg-uuidv7-pg-build +FROM pg-build AS pg-uuidv7-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # not version-specific # last release v1.6.0 - Oct 9, 2024 -ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.6.0.tar.gz -O pg_uuidv7.tar.gz && \ echo "0fa6c710929d003f6ce276a7de7a864e9d1667b2d78be3dc2c07f2409eb55867 pg_uuidv7.tar.gz" | sha256sum --check && \ mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \ @@ -741,13 +721,11 @@ RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.6.0.tar.gz # compile pg_roaringbitmap extension # ######################################################################################### -FROM build-deps AS pg-roaringbitmap-pg-build +FROM pg-build AS pg-roaringbitmap-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # not version-specific # last release v0.5.4 - Jun 28, 2022 -ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \ echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \ mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \ @@ -761,16 +739,14 @@ RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4 # compile pg_semver extension # ######################################################################################### -FROM build-deps AS pg-semver-pg-build +FROM pg-build AS pg-semver-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # Release 0.40.0 breaks backward compatibility with previous versions # see release note https://github.com/theory/pg-semver/releases/tag/v0.40.0 # Use new version only for v17 # # last release v0.40.0 - Jul 22, 2024 -ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN case "${PG_VERSION}" in \ "v17") \ export SEMVER_VERSION=0.40.0 \ @@ -797,13 +773,11 @@ RUN case "${PG_VERSION}" in \ # compile pg_embedding extension # ######################################################################################### -FROM build-deps AS pg-embedding-pg-build -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +FROM pg-build AS pg-embedding-pg-build # This is our extension, support stopped in favor of pgvector # TODO: deprecate it ARG PG_VERSION -ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN case "${PG_VERSION}" in \ "v14" | "v15") \ export PG_EMBEDDING_VERSION=0.3.5 \ @@ -824,20 +798,18 @@ RUN case "${PG_VERSION}" in \ # compile anon extension # ######################################################################################### -FROM build-deps AS pg-anon-pg-build +FROM pg-build AS pg-anon-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # This is an experimental extension, never got to real production. # !Do not remove! It can be present in shared_preload_libraries and compute will fail to start if library is not found. 
-ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN case "${PG_VERSION}" in "v17") \ echo "postgresql_anonymizer does not yet support PG17" && exit 0;; \ esac && \ wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \ echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9 pg_anon.tar.gz" | sha256sum --check && \ mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control ######################################################################################### @@ -846,9 +818,8 @@ RUN case "${PG_VERSION}" in "v17") \ # This layer is used to build `pgrx` deps # ######################################################################################### -FROM build-deps AS rust-extensions-build +FROM pg-build AS rust-extensions-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ apt install --no-install-recommends --no-install-suggests -y curl libclang-dev && \ @@ -856,7 +827,7 @@ RUN apt update && \ useradd -ms /bin/bash nonroot -b /home ENV HOME=/home/nonroot -ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH" +ENV PATH="/home/nonroot/.cargo/bin:$PATH" USER nonroot WORKDIR /home/nonroot @@ -883,9 +854,8 @@ USER root # and eventually get merged with `rust-extensions-build` # ######################################################################################### -FROM build-deps AS rust-extensions-build-pgrx12 +FROM pg-build AS rust-extensions-build-pgrx12 ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ apt install --no-install-recommends --no-install-suggests -y curl libclang-dev && \ @@ -893,7 +863,7 @@ RUN apt update && \ useradd -ms /bin/bash nonroot -b /home ENV HOME=/home/nonroot -ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH" +ENV PATH="/home/nonroot/.cargo/bin:$PATH" USER nonroot WORKDIR /home/nonroot @@ -1068,13 +1038,11 @@ RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.2.0 # ######################################################################################### -FROM build-deps AS wal2json-pg-build +FROM pg-build AS wal2json-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # wal2json wal2json_2_6 supports v17 # last release wal2json_2_6 - Apr 25, 2024 -ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_6.tar.gz -O wal2json.tar.gz && \ echo "18b4bdec28c74a8fc98a11c72de38378a760327ef8e5e42e975b0029eb96ba0d wal2json.tar.gz" | sha256sum --check && \ mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json.tar.gz --strip-components=1 -C . && \ @@ -1087,13 +1055,11 @@ RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_6.tar. 
# compile pg_ivm extension # ######################################################################################### -FROM build-deps AS pg-ivm-build +FROM pg-build AS pg-ivm-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # pg_ivm v1.9 supports v17 # last release v1.9 - Jul 31 -ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.9.tar.gz -O pg_ivm.tar.gz && \ echo "59e15722939f274650abf637f315dd723c87073496ca77236b044cb205270d8b pg_ivm.tar.gz" | sha256sum --check && \ mkdir pg_ivm-src && cd pg_ivm-src && tar xzf ../pg_ivm.tar.gz --strip-components=1 -C . && \ @@ -1107,13 +1073,11 @@ RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.9.tar.gz -O pg_iv # compile pg_partman extension # ######################################################################################### -FROM build-deps AS pg-partman-build +FROM pg-build AS pg-partman-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # should support v17 https://github.com/pgpartman/pg_partman/discussions/693 # last release 5.1.0 Apr 2, 2024 -ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.1.0.tar.gz -O pg_partman.tar.gz && \ echo "3e3a27d7ff827295d5c55ef72f07a49062d6204b3cb0b9a048645d6db9f3cb9f pg_partman.tar.gz" | sha256sum --check && \ mkdir pg_partman-src && cd pg_partman-src && tar xzf ../pg_partman.tar.gz --strip-components=1 -C . && \ @@ -1129,9 +1093,6 @@ RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.1.0.tar.gz ######################################################################################### FROM rust-extensions-build AS pg-mooncake-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ - -ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/Mooncake-Labs/pg_mooncake/releases/download/v0.1.0/pg_mooncake-0.1.0.tar.gz -O pg_mooncake.tar.gz && \ echo "eafd059b77f541f11525eb8affcd66a176968cbd8fe7c0d436e733f2aa4da59f pg_mooncake.tar.gz" | sha256sum --check && \ @@ -1147,11 +1108,8 @@ RUN wget https://github.com/Mooncake-Labs/pg_mooncake/releases/download/v0.1.0/p # ######################################################################################### -FROM build-deps AS pg-repack-build +FROM pg-build AS pg-repack-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ - -ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/reorg/pg_repack/archive/refs/tags/ver_1.5.2.tar.gz -O pg_repack.tar.gz && \ echo '4516cad42251ed3ad53ff619733004db47d5755acac83f75924cd94d1c4fb681 pg_repack.tar.gz' | sha256sum --check && \ From e58e29e63994373e5645a5123e0be191bf693813 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 14 Jan 2025 19:01:14 +0100 Subject: [PATCH 185/583] pageserver: limit number of upload queue tasks (#10384) ## Problem The upload queue can currently schedule an arbitrary number of tasks. This can both spawn an unbounded number of Tokio tasks, and also significantly slow down upload queue scheduling as it's quadratic in number of operations. Touches #10096. ## Summary of changes Limit the number of inprogress tasks to the remote storage upload concurrency. While this concurrency limit is shared across all tenants, there's certainly no point in scheduling more than this -- we could even consider setting the limit lower, but don't for now to avoid artificially constraining tenants. 
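The gating itself is small. As a rough illustration of the idea only — a simplified sketch with made-up names (`SketchQueue`, `inprogress_limit`, etc.), not the real `UploadQueueInitialized` from `pageserver/src/tenant/upload_queue.rs` — scheduling simply refuses to hand out more work once the in-progress set reaches the limit, and completing a task frees a slot:

```rust
// Minimal sketch of a bounded scheduling queue. All names here are
// illustrative assumptions, not the actual pageserver types.
use std::collections::{HashMap, VecDeque};

struct SketchQueue {
    inprogress_limit: usize,          // 0 means "no limit"
    next_task_id: u64,
    queued: VecDeque<String>,         // queued operations (here: just labels)
    inprogress: HashMap<u64, String>, // task id -> running operation
}

impl SketchQueue {
    /// Pop the next queued operation, unless the in-progress set is already
    /// at the configured limit.
    fn next_ready(&mut self) -> Option<String> {
        if self.inprogress_limit > 0 && self.inprogress.len() >= self.inprogress_limit {
            return None; // back off: no point scheduling past the remote storage concurrency
        }
        self.queued.pop_front()
    }

    /// Schedule everything that is currently allowed to run, returning task IDs.
    fn schedule_ready(&mut self) -> Vec<u64> {
        let mut started = Vec::new();
        while let Some(op) = self.next_ready() {
            self.next_task_id += 1;
            self.inprogress.insert(self.next_task_id, op);
            started.push(self.next_task_id);
        }
        started
    }

    /// Completing a task frees a slot; the caller would then re-run `schedule_ready`.
    fn complete(&mut self, task_id: u64) {
        self.inprogress.remove(&task_id);
    }
}

fn main() {
    let mut queue = SketchQueue {
        inprogress_limit: 2,
        next_task_id: 0,
        queued: (0..4).map(|i| format!("upload layer{i}")).collect(),
        inprogress: HashMap::new(),
    };
    assert_eq!(queue.schedule_ready().len(), 2); // capped at the limit
    queue.complete(1);                           // one slot frees up...
    assert_eq!(queue.schedule_ready().len(), 1); // ...so one more task is scheduled
}
```

The real queue applies the same check at the top of `next_ready()`, so conflict resolution and queue reordering only ever run over a bounded set of candidates.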
--- libs/remote_storage/src/config.rs | 11 +++ pageserver/benches/upload_queue.rs | 2 +- .../src/tenant/remote_timeline_client.rs | 29 +++++++- pageserver/src/tenant/upload_queue.rs | 73 ++++++++++++++++--- 4 files changed, 99 insertions(+), 16 deletions(-) diff --git a/libs/remote_storage/src/config.rs b/libs/remote_storage/src/config.rs index 49b1d9dc87..dae141bf77 100644 --- a/libs/remote_storage/src/config.rs +++ b/libs/remote_storage/src/config.rs @@ -43,6 +43,17 @@ impl RemoteStorageKind { } } +impl RemoteStorageConfig { + /// Helper to fetch the configured concurrency limit. + pub fn concurrency_limit(&self) -> Option { + match &self.storage { + RemoteStorageKind::LocalFs { .. } => None, + RemoteStorageKind::AwsS3(c) => Some(c.concurrency_limit.into()), + RemoteStorageKind::AzureContainer(c) => Some(c.concurrency_limit.into()), + } + } +} + fn default_timeout() -> Duration { RemoteStorageConfig::DEFAULT_TIMEOUT } diff --git a/pageserver/benches/upload_queue.rs b/pageserver/benches/upload_queue.rs index 528b3d5490..ed644b0e3c 100644 --- a/pageserver/benches/upload_queue.rs +++ b/pageserver/benches/upload_queue.rs @@ -53,7 +53,7 @@ fn bench_upload_queue_next_ready(c: &mut Criterion) { // Construct the queue. let mut queue = UploadQueue::Uninitialized; - let queue = queue.initialize_with_current_remote_index_part(&index)?; + let queue = queue.initialize_with_current_remote_index_part(&index, 0)?; // Populate inprogress_tasks with a bunch of layer1 deletions. let delete = UploadOp::Delete(Delete { diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 75e8da496d..1602765585 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -425,8 +425,16 @@ impl RemoteTimelineClient { /// an index file upload, i.e., it's not empty. /// The given `index_part` must be the one on the remote. pub fn init_upload_queue(&self, index_part: &IndexPart) -> anyhow::Result<()> { + // Set the maximum number of inprogress tasks to the remote storage concurrency. There's + // certainly no point in starting more upload tasks than this. + let inprogress_limit = self + .conf + .remote_storage_config + .as_ref() + .and_then(|r| r.concurrency_limit()) + .unwrap_or(0); let mut upload_queue = self.upload_queue.lock().unwrap(); - upload_queue.initialize_with_current_remote_index_part(index_part)?; + upload_queue.initialize_with_current_remote_index_part(index_part, inprogress_limit)?; self.update_remote_physical_size_gauge(Some(index_part)); info!( "initialized upload queue from remote index with {} layer files", @@ -441,8 +449,16 @@ impl RemoteTimelineClient { &self, local_metadata: &TimelineMetadata, ) -> anyhow::Result<()> { + // Set the maximum number of inprogress tasks to the remote storage concurrency. There's + // certainly no point in starting more upload tasks than this. 
+ let inprogress_limit = self + .conf + .remote_storage_config + .as_ref() + .and_then(|r| r.concurrency_limit()) + .unwrap_or(0); let mut upload_queue = self.upload_queue.lock().unwrap(); - upload_queue.initialize_empty_remote(local_metadata)?; + upload_queue.initialize_empty_remote(local_metadata, inprogress_limit)?; self.update_remote_physical_size_gauge(None); info!("initialized upload queue as empty"); Ok(()) @@ -458,9 +474,15 @@ impl RemoteTimelineClient { let deleted_at = index_part.deleted_at.ok_or(anyhow::anyhow!( "bug: it is responsibility of the caller to provide index part from MaybeDeletedIndexPart::Deleted" ))?; + let inprogress_limit = self + .conf + .remote_storage_config + .as_ref() + .and_then(|r| r.concurrency_limit()) + .unwrap_or(0); let mut upload_queue = self.upload_queue.lock().unwrap(); - upload_queue.initialize_with_current_remote_index_part(index_part)?; + upload_queue.initialize_with_current_remote_index_part(index_part, inprogress_limit)?; self.update_remote_physical_size_gauge(Some(index_part)); self.stop_impl(&mut upload_queue); @@ -2355,6 +2377,7 @@ impl RemoteTimelineClient { // but for this use case it doesnt really makes sense to bring unsafe code only for this usage point. // Deletion is not really perf sensitive so there shouldnt be any problems with cloning a fraction of it. let upload_queue_for_deletion = UploadQueueInitialized { + inprogress_limit: initialized.inprogress_limit, task_counter: 0, dirty: initialized.dirty.clone(), clean: initialized.clean.clone(), diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs index bd524e8153..09c8f6ad8c 100644 --- a/pageserver/src/tenant/upload_queue.rs +++ b/pageserver/src/tenant/upload_queue.rs @@ -51,6 +51,9 @@ pub enum OpType { /// This keeps track of queued and in-progress tasks. pub struct UploadQueueInitialized { + /// Maximum number of inprogress tasks to schedule. 0 is no limit. + pub(crate) inprogress_limit: usize, + /// Counter to assign task IDs pub(crate) task_counter: u64, @@ -128,8 +131,14 @@ impl UploadQueueInitialized { /// the queue if it doesn't conflict with operations ahead of it. /// /// None may be returned even if the queue isn't empty, if no operations are ready yet. + /// + /// NB: this is quadratic, but queues are expected to be small, and bounded by inprogress_limit. pub fn next_ready(&mut self) -> Option { - // NB: this is quadratic, but queues are expected to be small. + // If inprogress_tasks is already at limit, don't schedule anything more. + if self.inprogress_limit > 0 && self.inprogress_tasks.len() >= self.inprogress_limit { + return None; + } + for (i, candidate) in self.queued_operations.iter().enumerate() { // If this candidate is ready, go for it. Otherwise, try the next one. 
if self.is_ready(i) { @@ -289,6 +298,7 @@ impl UploadQueue { pub fn initialize_empty_remote( &mut self, metadata: &TimelineMetadata, + inprogress_limit: usize, ) -> anyhow::Result<&mut UploadQueueInitialized> { match self { UploadQueue::Uninitialized => (), @@ -302,6 +312,7 @@ impl UploadQueue { let index_part = IndexPart::empty(metadata.clone()); let state = UploadQueueInitialized { + inprogress_limit, dirty: index_part.clone(), clean: (index_part, None), latest_files_changes_since_metadata_upload_scheduled: 0, @@ -325,6 +336,7 @@ impl UploadQueue { pub fn initialize_with_current_remote_index_part( &mut self, index_part: &IndexPart, + inprogress_limit: usize, ) -> anyhow::Result<&mut UploadQueueInitialized> { match self { UploadQueue::Uninitialized => (), @@ -339,6 +351,7 @@ impl UploadQueue { ); let state = UploadQueueInitialized { + inprogress_limit, dirty: index_part.clone(), clean: (index_part.clone(), None), latest_files_changes_since_metadata_upload_scheduled: 0, @@ -633,7 +646,7 @@ mod tests { #[test] fn schedule_barrier() -> anyhow::Result<()> { let mut queue = UploadQueue::Uninitialized; - let queue = queue.initialize_empty_remote(&TimelineMetadata::example())?; + let queue = queue.initialize_empty_remote(&TimelineMetadata::example(), 0)?; let tli = make_timeline(); let index = Box::new(queue.clean.0.clone()); // empty, doesn't matter @@ -700,7 +713,7 @@ mod tests { #[test] fn schedule_delete_parallel() -> anyhow::Result<()> { let mut queue = UploadQueue::Uninitialized; - let queue = queue.initialize_empty_remote(&TimelineMetadata::example())?; + let queue = queue.initialize_empty_remote(&TimelineMetadata::example(), 0)?; let tli = make_timeline(); // Enqueue a bunch of deletes, some with conflicting names. @@ -745,7 +758,7 @@ mod tests { #[test] fn schedule_upload_conflicts() -> anyhow::Result<()> { let mut queue = UploadQueue::Uninitialized; - let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example())?; + let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?; let tli = make_timeline(); // Enqueue three versions of the same layer, with different file sizes. @@ -778,7 +791,7 @@ mod tests { #[test] fn schedule_upload_delete_conflicts() -> anyhow::Result<()> { let mut queue = UploadQueue::Uninitialized; - let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example())?; + let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?; let tli = make_timeline(); // Enqueue two layer uploads, with a delete of both layers in between them. These should be @@ -817,7 +830,7 @@ mod tests { #[test] fn schedule_upload_delete_conflicts_bypass() -> anyhow::Result<()> { let mut queue = UploadQueue::Uninitialized; - let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example())?; + let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?; let tli = make_timeline(); // Enqueue two layer uploads, with a delete of both layers in between them. These should be @@ -859,7 +872,7 @@ mod tests { #[test] fn schedule_upload_parallel() -> anyhow::Result<()> { let mut queue = UploadQueue::Uninitialized; - let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example())?; + let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?; let tli = make_timeline(); // Enqueue three different layer uploads. 
@@ -888,7 +901,7 @@ mod tests { #[test] fn schedule_index_serial() -> anyhow::Result<()> { let mut queue = UploadQueue::Uninitialized; - let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example())?; + let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?; // Enqueue three uploads of the current empty index. let index = Box::new(queue.clean.0.clone()); @@ -925,7 +938,7 @@ mod tests { #[test] fn schedule_index_upload_chain() -> anyhow::Result<()> { let mut queue = UploadQueue::Uninitialized; - let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example())?; + let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?; let tli = make_timeline(); // Enqueue three uploads of the current empty index. @@ -994,7 +1007,7 @@ mod tests { #[test] fn schedule_index_delete_dereferenced() -> anyhow::Result<()> { let mut queue = UploadQueue::Uninitialized; - let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example())?; + let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?; let tli = make_timeline(); // Create a layer to upload. @@ -1038,7 +1051,7 @@ mod tests { #[test] fn schedule_index_upload_dereferenced() -> anyhow::Result<()> { let mut queue = UploadQueue::Uninitialized; - let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example())?; + let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?; let tli = make_timeline(); // Create a layer to upload. @@ -1085,7 +1098,7 @@ mod tests { #[test] fn schedule_shutdown() -> anyhow::Result<()> { let mut queue = UploadQueue::Uninitialized; - let queue = queue.initialize_empty_remote(&TimelineMetadata::example())?; + let queue = queue.initialize_empty_remote(&TimelineMetadata::example(), 0)?; let tli = make_timeline(); let index = Box::new(queue.clean.0.clone()); // empty, doesn't matter @@ -1139,6 +1152,42 @@ mod tests { Ok(()) } + /// Scheduling respects inprogress_limit. + #[test] + fn schedule_inprogress_limit() -> anyhow::Result<()> { + // Create a queue with inprogress_limit=2. + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_empty_remote(&TimelineMetadata::example(), 2)?; + let tli = make_timeline(); + + // Enqueue a bunch of uploads. + let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer3 = make_layer(&tli, "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + + let ops = [ + UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None), + UploadOp::UploadLayer(layer1.clone(), layer1.metadata(), None), + UploadOp::UploadLayer(layer2.clone(), layer2.metadata(), None), + UploadOp::UploadLayer(layer3.clone(), layer3.metadata(), None), + ]; + + queue.queued_operations.extend(ops.clone()); + + // Schedule all ready operations. Only 2 are scheduled. + let tasks = queue.schedule_ready(); + assert_same_ops(tasks.iter().map(|t| &t.op), &ops[0..2]); + assert!(queue.next_ready().is_none()); + + // When one completes, another is scheduled. 
+ queue.complete(tasks[0].task_id); + let tasks = queue.schedule_ready(); + assert_same_ops(tasks.iter().map(|t| &t.op), &ops[2..3]); + + Ok(()) + } + /// Tests that can_bypass takes name, generation and shard index into account for all operations. #[test] fn can_bypass_path() -> anyhow::Result<()> { From 6debb49b87dcf2dcca53bf17ee005b936eac446c Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 14 Jan 2025 22:10:17 +0100 Subject: [PATCH 186/583] pageserver: coalesce index uploads when possible (#10248) ## Problem With upload queue reordering in #10218, we can easily get into a situation where multiple index uploads are queued back to back, which can't be parallelized. This will happen e.g. when multiple layer flushes enqueue layer/index/layer/index/... and the layers skip the queue and are uploaded in parallel. These index uploads will incur serial S3 roundtrip latencies, and may block later operations. Touches #10096. ## Summary of changes When multiple back-to-back index uploads are ready to upload, only upload the most recent index and drop the rest. --- pageserver/benches/upload_queue.rs | 1 + .../src/tenant/remote_timeline_client.rs | 6 +- pageserver/src/tenant/upload_queue.rs | 86 +++++++++++++------ 3 files changed, 67 insertions(+), 26 deletions(-) diff --git a/pageserver/benches/upload_queue.rs b/pageserver/benches/upload_queue.rs index ed644b0e3c..ed5daa8ae1 100644 --- a/pageserver/benches/upload_queue.rs +++ b/pageserver/benches/upload_queue.rs @@ -67,6 +67,7 @@ fn bench_upload_queue_next_ready(c: &mut Criterion) { task_id, retries: AtomicU32::new(0), op: delete.clone(), + coalesced_ops: Vec::new(), }), ); } diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 1602765585..47c4a8637d 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -1880,7 +1880,7 @@ impl RemoteTimelineClient { /// This can launch an unbounded number of queued tasks. `UploadQueue::next_ready()` also has /// worst-case quadratic cost in the number of tasks, and may struggle beyond 10,000 tasks. fn launch_queued_tasks(self: &Arc, upload_queue: &mut UploadQueueInitialized) { - while let Some(mut next_op) = upload_queue.next_ready() { + while let Some((mut next_op, coalesced_ops)) = upload_queue.next_ready() { debug!("starting op: {next_op}"); // Prepare upload. @@ -1918,6 +1918,7 @@ impl RemoteTimelineClient { let task = Arc::new(UploadTask { task_id: upload_task_id, op: next_op, + coalesced_ops, retries: AtomicU32::new(0), }); upload_queue @@ -2285,6 +2286,9 @@ impl RemoteTimelineClient { } self.metric_end(&task.op); + for coalesced_op in &task.coalesced_ops { + self.metric_end(coalesced_op); + } } fn metric_impl( diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs index 09c8f6ad8c..d302205ffe 100644 --- a/pageserver/src/tenant/upload_queue.rs +++ b/pageserver/src/tenant/upload_queue.rs @@ -22,6 +22,11 @@ use tracing::info; static DISABLE_UPLOAD_QUEUE_REORDERING: Lazy = Lazy::new(|| std::env::var("DISABLE_UPLOAD_QUEUE_REORDERING").as_deref() == Ok("true")); +/// Kill switch for index upload coalescing in case it causes problems. +/// TODO: remove this once we have confidence in it. 
+static DISABLE_UPLOAD_QUEUE_INDEX_COALESCING: Lazy = + Lazy::new(|| std::env::var("DISABLE_UPLOAD_QUEUE_INDEX_COALESCING").as_deref() == Ok("true")); + // clippy warns that Uninitialized is much smaller than Initialized, which wastes // memory for Uninitialized variants. Doesn't matter in practice, there are not // that many upload queues in a running pageserver, and most of them are initialized @@ -130,10 +135,12 @@ impl UploadQueueInitialized { /// the first operation in the queue, to avoid head-of-line blocking -- an operation can jump /// the queue if it doesn't conflict with operations ahead of it. /// + /// Also returns any operations that were coalesced into this one, e.g. multiple index uploads. + /// /// None may be returned even if the queue isn't empty, if no operations are ready yet. /// /// NB: this is quadratic, but queues are expected to be small, and bounded by inprogress_limit. - pub fn next_ready(&mut self) -> Option { + pub fn next_ready(&mut self) -> Option<(UploadOp, Vec)> { // If inprogress_tasks is already at limit, don't schedule anything more. if self.inprogress_limit > 0 && self.inprogress_tasks.len() >= self.inprogress_limit { return None; @@ -151,7 +158,36 @@ impl UploadQueueInitialized { return None; } - return self.queued_operations.remove(i); + let mut op = self.queued_operations.remove(i).expect("i can't disappear"); + + // Coalesce any back-to-back index uploads by only uploading the newest one that's + // ready. This typically happens with layer/index/layer/index/... sequences, where + // the layers bypass the indexes, leaving the indexes queued. + // + // If other operations are interleaved between index uploads we don't try to + // coalesce them, since we may as well update the index concurrently with them. + // This keeps the index fresh and avoids starvation. + // + // NB: we assume that all uploaded indexes have the same remote path. This + // is true at the time of writing: the path only depends on the tenant, + // timeline and generation, all of which are static for a timeline instance. + // Otherwise, we must be careful not to coalesce different paths. + let mut coalesced_ops = Vec::new(); + if matches!(op, UploadOp::UploadMetadata { .. }) { + while let Some(UploadOp::UploadMetadata { .. }) = self.queued_operations.get(i) + { + if *DISABLE_UPLOAD_QUEUE_INDEX_COALESCING { + break; + } + if !self.is_ready(i) { + break; + } + coalesced_ops.push(op); + op = self.queued_operations.remove(i).expect("i can't disappear"); + } + } + + return Some((op, coalesced_ops)); } // Nothing can bypass a barrier or shutdown. If it wasn't scheduled above, give up. @@ -225,11 +261,12 @@ impl UploadQueueInitialized { fn schedule_ready(&mut self) -> Vec> { let mut tasks = Vec::new(); // NB: schedule operations one by one, to handle conflicts with inprogress_tasks. - while let Some(op) = self.next_ready() { + while let Some((op, coalesced_ops)) = self.next_ready() { self.task_counter += 1; let task = Arc::new(UploadTask { task_id: self.task_counter, op, + coalesced_ops, retries: 0.into(), }); self.inprogress_tasks.insert(task.task_id, task.clone()); @@ -407,9 +444,13 @@ impl UploadQueue { pub struct UploadTask { /// Unique ID of this task. Used as the key in `inprogress_tasks` above. pub task_id: u64, + /// Number of task retries. pub retries: AtomicU32, - + /// The upload operation. pub op: UploadOp, + /// Any upload operations that were coalesced into this operation. 
This typically happens with + /// back-to-back index uploads, see `UploadQueueInitialized::next_ready()`. + pub coalesced_ops: Vec, } /// A deletion of some layers within the lifetime of a timeline. This is not used @@ -512,9 +553,8 @@ impl UploadOp { }) } - // Indexes can never bypass each other. - // TODO: we could coalesce them though, by only uploading the newest ready index. This - // is left for later, out of caution. + // Indexes can never bypass each other. They can coalesce though, and + // `UploadQueue::next_ready()` currently does this when possible. (UploadOp::UploadMetadata { .. }, UploadOp::UploadMetadata { .. }) => false, } } @@ -897,9 +937,9 @@ mod tests { Ok(()) } - /// Index uploads are serialized. + /// Index uploads are coalesced. #[test] - fn schedule_index_serial() -> anyhow::Result<()> { + fn schedule_index_coalesce() -> anyhow::Result<()> { let mut queue = UploadQueue::Uninitialized; let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?; @@ -920,13 +960,11 @@ mod tests { queue.queued_operations.extend(ops.clone()); - // The uploads should run serially. - for op in ops { - let tasks = queue.schedule_ready(); - assert_eq!(tasks.len(), 1); - assert_same_op(&tasks[0].op, &op); - queue.complete(tasks[0].task_id); - } + // The index uploads are coalesced into a single operation. + let tasks = queue.schedule_ready(); + assert_eq!(tasks.len(), 1); + assert_same_op(&tasks[0].op, &ops[2]); + assert_same_ops(&tasks[0].coalesced_ops, &ops[0..2]); assert!(queue.queued_operations.is_empty()); @@ -985,18 +1023,14 @@ mod tests { assert_same_op(&index_tasks[0].op, &ops[1]); queue.complete(index_tasks[0].task_id); - // layer 1 completes. This unblocks index 1 then index 2. + // layer 1 completes. This unblocks index 1 and 2, which coalesce into + // a single upload for index 2. queue.complete(upload_tasks[1].task_id); - let index_tasks = queue.schedule_ready(); - assert_eq!(index_tasks.len(), 1); - assert_same_op(&index_tasks[0].op, &ops[3]); - queue.complete(index_tasks[0].task_id); - let index_tasks = queue.schedule_ready(); assert_eq!(index_tasks.len(), 1); assert_same_op(&index_tasks[0].op, &ops[5]); - queue.complete(index_tasks[0].task_id); + assert_same_ops(&index_tasks[0].coalesced_ops, &ops[3..4]); assert!(queue.queued_operations.is_empty()); @@ -1018,11 +1052,12 @@ mod tests { let index_deref = index_without(&index_upload, &layer); let ops = [ - // Initial upload. + // Initial upload, with a barrier to prevent index coalescing. UploadOp::UploadLayer(layer.clone(), layer.metadata(), None), UploadOp::UploadMetadata { uploaded: index_upload.clone(), }, + UploadOp::Barrier(tokio::sync::watch::channel(()).0), // Dereference the layer and delete it. UploadOp::UploadMetadata { uploaded: index_deref.clone(), @@ -1063,11 +1098,12 @@ mod tests { let index_ref = index_with(&index_deref, &layer); let ops = [ - // Initial upload. + // Initial upload, with a barrier to prevent index coalescing. UploadOp::UploadLayer(layer.clone(), layer.metadata(), None), UploadOp::UploadMetadata { uploaded: index_upload.clone(), }, + UploadOp::Barrier(tokio::sync::watch::channel(()).0), // Dereference the layer. 
UploadOp::UploadMetadata { uploaded: index_deref.clone(), From 47c1640accbbf45071de91b28aadc7356775fc43 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 14 Jan 2025 21:37:32 +0000 Subject: [PATCH 187/583] storage controller: pagination for tenant listing API (#10365) ## Problem For large deployments, the `control/v1/tenant` listing API can time out transmitting a monolithic serialized response. ## Summary of changes - Add `limit` and `start_after` parameters to listing API - Update storcon_cli to use these parameters and limit requests to 1000 items at a time --- control_plane/storcon_cli/src/main.rs | 74 +++++++++++++------ storage_controller/src/http.rs | 6 +- storage_controller/src/service.rs | 33 ++++++++- test_runner/fixtures/neon_fixtures.py | 19 ++++- .../regress/test_storage_controller.py | 19 ++++- 5 files changed, 117 insertions(+), 34 deletions(-) diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 9d133e4af1..2ba8f63678 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -477,16 +477,7 @@ async fn main() -> anyhow::Result<()> { println!("{table}"); } Command::Tenants { node_id: None } => { - let mut resp = storcon_client - .dispatch::<(), Vec>( - Method::GET, - "control/v1/tenant".to_string(), - None, - ) - .await?; - - resp.sort_by(|a, b| a.tenant_id.cmp(&b.tenant_id)); - + // Set up output formatting let mut table = comfy_table::Table::new(); table.set_header([ "TenantId", @@ -496,20 +487,55 @@ async fn main() -> anyhow::Result<()> { "Placement", "Scheduling", ]); - for tenant in resp { - let shard_zero = tenant.shards.into_iter().next().unwrap(); - table.add_row([ - format!("{}", tenant.tenant_id), - shard_zero - .preferred_az_id - .as_ref() - .cloned() - .unwrap_or("".to_string()), - format!("{}", shard_zero.tenant_shard_id.shard_count.literal()), - format!("{:?}", tenant.stripe_size), - format!("{:?}", tenant.policy), - format!("{:?}", shard_zero.scheduling_policy), - ]); + + // Pagination loop over listing API + let mut start_after = None; + const LIMIT: usize = 1000; + loop { + let path = match start_after { + None => format!("control/v1/tenant?limit={LIMIT}"), + Some(start_after) => { + format!("control/v1/tenant?limit={LIMIT}&start_after={start_after}") + } + }; + + let resp = storcon_client + .dispatch::<(), Vec>(Method::GET, path, None) + .await?; + + if resp.is_empty() { + // End of data reached + break; + } + + // Give some visual feedback while we're building up the table (comfy_table doesn't have + // streaming output) + if resp.len() >= LIMIT { + eprint!("."); + } + + start_after = Some(resp.last().unwrap().tenant_id); + + for tenant in resp { + let shard_zero = tenant.shards.into_iter().next().unwrap(); + table.add_row([ + format!("{}", tenant.tenant_id), + shard_zero + .preferred_az_id + .as_ref() + .cloned() + .unwrap_or("".to_string()), + format!("{}", shard_zero.tenant_shard_id.shard_count.literal()), + format!("{:?}", tenant.stripe_size), + format!("{:?}", tenant.policy), + format!("{:?}", shard_zero.scheduling_policy), + ]); + } + } + + // Terminate progress dots + if table.row_count() > LIMIT { + eprint!(""); } println!("{table}"); diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index c8df4ffe28..03d8f11992 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -653,6 +653,10 @@ async fn handle_tenant_list( ) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let limit: Option = 
parse_query_param(&req, "limit")?; + let start_after: Option = parse_query_param(&req, "start_after")?; + tracing::info!("start_after: {:?}", start_after); + match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { return res; @@ -660,7 +664,7 @@ async fn handle_tenant_list( ForwardOutcome::NotForwarded(_req) => {} }; - json_response(StatusCode::OK, service.tenant_list()) + json_response(StatusCode::OK, service.tenant_list(limit, start_after)) } async fn handle_node_register(req: Request) -> Result, ApiError> { diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index cbb9103880..57f4cc8463 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -4158,17 +4158,42 @@ impl Service { .ok_or_else(|| ApiError::NotFound(anyhow::anyhow!("Tenant {tenant_id} not found").into())) } - pub(crate) fn tenant_list(&self) -> Vec { + /// limit & offset are pagination parameters. Since we are walking an in-memory HashMap, `offset` does not + /// avoid traversing data, it just avoid returning it. This is suitable for our purposes, since our in memory + /// maps are small enough to traverse fast, our pagination is just to avoid serializing huge JSON responses + /// in our external API. + pub(crate) fn tenant_list( + &self, + limit: Option, + start_after: Option, + ) -> Vec { let locked = self.inner.read().unwrap(); + // Apply start_from parameter + let shard_range = match start_after { + None => locked.tenants.range(..), + Some(tenant_id) => locked.tenants.range( + TenantShardId { + tenant_id, + shard_number: ShardNumber(u8::MAX), + shard_count: ShardCount(u8::MAX), + }.., + ), + }; + let mut result = Vec::new(); - for (_tenant_id, tenant_shards) in - &locked.tenants.iter().group_by(|(id, _shard)| id.tenant_id) - { + for (_tenant_id, tenant_shards) in &shard_range.group_by(|(id, _shard)| id.tenant_id) { result.push( self.tenant_describe_impl(tenant_shards.map(|(_k, v)| v)) .expect("Groups are always non-empty"), ); + + // Enforce `limit` parameter + if let Some(limit) = limit { + if result.len() >= limit { + break; + } + } } result diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index e22e452a52..c47739cd81 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1884,7 +1884,10 @@ class NeonStorageController(MetricsGetter, LogUtils): ) return response.json() - def tenant_list(self): + def tenant_shard_dump(self): + """ + Debug listing API: dumps the internal map of tenant shards + """ response = self.request( "GET", f"{self.api}/debug/v1/tenant", @@ -1892,6 +1895,18 @@ class NeonStorageController(MetricsGetter, LogUtils): ) return response.json() + def tenant_list(self, **kwargs): + """ + Control API tenant listing: a vector of the same content returned by tenant_describe + """ + response = self.request( + "GET", + f"{self.api}/control/v1/tenant", + headers=self.headers(TokenScope.ADMIN), + params=kwargs, + ) + return response.json() + def node_configure(self, node_id, body: dict[str, Any]): log.info(f"node_configure({node_id}, {body})") body["node_id"] = node_id @@ -2238,7 +2253,7 @@ class NeonStorageController(MetricsGetter, LogUtils): """ Get the intent and observed placements of all tenants known to the storage controller. 
""" - tenants = self.tenant_list() + tenants = self.tenant_shard_dump() tenant_placement: defaultdict[str, dict[str, Any]] = defaultdict( lambda: { diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 8ffb6ba6b2..b5d109559f 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -113,6 +113,19 @@ def test_storage_controller_smoke(neon_env_builder: NeonEnvBuilder, combination) for tid in tenant_ids: env.create_tenant(tid, shard_count=shards_per_tenant) + # Tenant listing API should work + listed_tenants = env.storage_controller.tenant_list() + log.info(f"listed_tenants: {listed_tenants}") + assert set(t["tenant_id"] for t in listed_tenants) == set(str(t) for t in tenant_ids) + paged = env.storage_controller.tenant_list(limit=2, start_after=listed_tenants[0]["tenant_id"]) + assert len(paged) == 2 + assert paged[0] == listed_tenants[1] + assert paged[1] == listed_tenants[2] + paged = env.storage_controller.tenant_list( + limit=1000, start_after="ffffffffffffffffffffffffffffffff" + ) + assert paged == [] + # Validate high level metrics assert ( env.storage_controller.get_metric_value("storage_controller_tenant_shards") @@ -1506,7 +1519,7 @@ class PageserverFailpoint(Failure): def build_node_to_tenants_map(env: NeonEnv) -> dict[int, list[TenantId]]: - tenants = env.storage_controller.tenant_list() + tenants = env.storage_controller.tenant_shard_dump() node_to_tenants: dict[int, list[TenantId]] = {} for t in tenants: @@ -2631,7 +2644,7 @@ def test_storage_controller_step_down(neon_env_builder: NeonEnvBuilder): # Validate that the storcon attempts to forward the request, but stops. # when it realises it is still the current leader. with pytest.raises(StorageControllerApiException, match="Leader is stepped down instance"): - env.storage_controller.tenant_list() + env.storage_controller.tenant_shard_dump() # Validate that we can step down multiple times and the observed state # doesn't change. @@ -2781,7 +2794,7 @@ def test_storage_controller_leadership_transfer( # Check that the stepped down instance forwards requests # to the new leader while it's still running. storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_1_port}") - env.storage_controller.tenant_list() + env.storage_controller.tenant_shard_dump() env.storage_controller.node_configure(env.pageservers[0].id, {"scheduling": "Pause"}) status = env.storage_controller.node_status(env.pageservers[0].id) assert status["scheduling"] == "Pause" From c98cbbeac143837eb99dfea1b5246bad21647b22 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Wed, 15 Jan 2025 12:41:49 +0300 Subject: [PATCH 188/583] Add migration details to safekeeper membership RFC. (#10272) ## Problem https://github.com/neondatabase/neon/pull/8455 wasn't specific enough on migration from current situation to enabling generations. ## Summary of changes Describe the missing parts, including control plane pushing generation to compute, which also defines whether generations are enabled -- non zero value does it. 
--- ...35-safekeeper-dynamic-membership-change.md | 188 ++++++++++++------ 1 file changed, 122 insertions(+), 66 deletions(-) diff --git a/docs/rfcs/035-safekeeper-dynamic-membership-change.md b/docs/rfcs/035-safekeeper-dynamic-membership-change.md index 239ec58186..cea9af34ab 100644 --- a/docs/rfcs/035-safekeeper-dynamic-membership-change.md +++ b/docs/rfcs/035-safekeeper-dynamic-membership-change.md @@ -81,7 +81,7 @@ configuration generation in them is less than its current one. Namely, it refuses to vote, to truncate WAL in `handle_elected` and to accept WAL. In response it sends its current configuration generation to let walproposer know. -Safekeeper gets `PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/configuration` +Safekeeper gets `PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/configuration` accepting `Configuration`. Safekeeper switches to the given conf it is higher than its current one and ignores it otherwise. In any case it replies with ``` @@ -103,7 +103,7 @@ currently and tries to communicate with all of them. However, the list does not define consensus members. Instead, on start walproposer tracks highest configuration it receives from `AcceptorGreeting`s. Once it assembles greetings from majority of `sk_set` and majority of `new_sk_set` (if it is present), it -establishes this configuration as its own and moves to voting. +establishes this configuration as its own and moves to voting. It should stop talking to safekeepers not listed in the configuration at this point, though it is not unsafe to continue doing so. @@ -119,7 +119,7 @@ refusal to accept due to configuration change) it simply restarts. The following algorithm can be executed anywhere having access to configuration storage and safekeepers. It is safe to interrupt / restart it and run multiple instances of it concurrently, though likely one of them won't make -progress then. It accepts `desired_set: Vec` as input. +progress then. It accepts `desired_set: Vec` as input. Algorithm will refuse to make the change if it encounters previous interrupted change attempt, but in this case it will try to finish it. @@ -140,7 +140,7 @@ storage are reachable. safe. Failed CAS aborts the procedure. 4) Call `PUT` `configuration` on safekeepers from the current set, delivering them `joint_conf`. Collecting responses from majority is required - to proceed. If any response returned generation higher than + to proceed. If any response returned generation higher than `joint_conf.generation`, abort (another switch raced us). Otherwise, choose max `` among responses and establish it as (in memory) `sync_position`. Also choose max `term` and establish it as (in @@ -149,49 +149,49 @@ storage are reachable. without ack from the new set. Similarly, we'll bump term on new majority to `sync_term` so that two computes with the same term are never elected. 4) Initialize timeline on safekeeper(s) from `new_sk_set` where it - doesn't exist yet by doing `pull_timeline` from the majority of the + doesn't exist yet by doing `pull_timeline` from the majority of the current set. Doing that on majority of `new_sk_set` is enough to proceed, but it is reasonable to ensure that all `new_sk_set` members are initialized -- if some of them are down why are we migrating there? -5) Call `POST` `bump_term(sync_term)` on safekeepers from the new set. +5) Call `POST` `bump_term(sync_term)` on safekeepers from the new set. Success on majority is enough. 
6) Repeatedly call `PUT` `configuration` on safekeepers from the new set, delivering them `joint_conf` and collecting their positions. This will - switch them to the `joint_conf` which generally won't be needed + switch them to the `joint_conf` which generally won't be needed because `pull_timeline` already includes it and plus additionally would be broadcast by compute. More importantly, we may proceed to the next step - only when `` on the majority of the new set reached - `sync_position`. Similarly, on the happy path no waiting is not needed because + only when `` on the majority of the new set reached + `sync_position`. Similarly, on the happy path no waiting is not needed because `pull_timeline` already includes it. However, we should double check to be safe. For example, timeline could have been created earlier e.g. - manually or after try-to-migrate, abort, try-to-migrate-again sequence. -7) Create `new_conf: Configuration` incrementing `join_conf` generation and having new - safekeeper set as `sk_set` and None `new_sk_set`. Write it to configuration + manually or after try-to-migrate, abort, try-to-migrate-again sequence. +7) Create `new_conf: Configuration` incrementing `join_conf` generation and having new + safekeeper set as `sk_set` and None `new_sk_set`. Write it to configuration storage under one more CAS. 8) Call `PUT` `configuration` on safekeepers from the new set, - delivering them `new_conf`. It is enough to deliver it to the majority + delivering them `new_conf`. It is enough to deliver it to the majority of the new set; the rest can be updated by compute. I haven't put huge effort to make the description above very precise, because it is natural language prone to interpretations anyway. Instead I'd like to make TLA+ spec of it. -Description above focuses on safety. To make the flow practical and live, here a few more +Description above focuses on safety. To make the flow practical and live, here a few more considerations. -1) It makes sense to ping new set to ensure it we are migrating to live node(s) before +1) It makes sense to ping new set to ensure it we are migrating to live node(s) before step 3. -2) If e.g. accidentally wrong new sk set has been specified, before CAS in step `6` is completed +2) If e.g. accidentally wrong new sk set has been specified, before CAS in step `6` is completed it is safe to rollback to the old conf with one more CAS. -3) On step 4 timeline might be already created on members of the new set for various reasons; +3) On step 4 timeline might be already created on members of the new set for various reasons; the simplest is the procedure restart. There are more complicated scenarious like mentioned - in step 5. Deleting and re-doing `pull_timeline` is generally unsafe without involving - generations, so seems simpler to treat existing timeline as success. However, this also + in step 5. Deleting and re-doing `pull_timeline` is generally unsafe without involving + generations, so seems simpler to treat existing timeline as success. However, this also has a disadvantage: you might imagine an surpassingly unlikely schedule where condition in the step 5 is never reached until compute is (re)awaken up to synchronize new member(s). I don't think we'll observe this in practice, but can add waking up compute if needed. 4) In the end timeline should be locally deleted on the safekeeper(s) which are in the old set but not in the new one, unless they are unreachable. 
To be - safe this also should be done under generation number (deletion proceeds only if + safe this also should be done under generation number (deletion proceeds only if current configuration is <= than one in request and safekeeper is not memeber of it). 5) If current conf fetched on step 1 is already not joint and members equal to `desired_set`, jump to step 7, using it as `new_conf`. @@ -202,47 +202,87 @@ The procedure ought to be driven from somewhere. Obvious candidates are control plane and storage_controller; and as each of them already has db we don't want yet another storage. I propose to manage safekeepers in storage_controller because 1) since it is in rust it simplifies simulation testing (more on this -below) 2) it already manages pageservers. +below) 2) it already manages pageservers. This assumes that migration will be fully usable only after we migrate all tenants/timelines to storage_controller. It is discussible whether we want also to manage pageserver attachments for all of these, but likely we do. -This requires us to define storcon <-> cplane interface. +This requires us to define storcon <-> cplane interface and changes. -### storage_controller <-> control plane interface +### storage_controller <-> control plane interface and changes First of all, control plane should [change](https://neondb.slack.com/archives/C03438W3FLZ/p1719226543199829) storing safekeepers per timeline instead of per tenant because we can't migrate -tenants atomically. +tenants atomically. The important question is how updated configuration is delivered from storage_controller to control plane to provide it to computes. As always, there are two options, pull and push. Let's do it the same push as with pageserver `/notify-attach` because 1) it keeps storage_controller out of critical compute -start path 2) provides easier upgrade: there won't be such a thing as 'timeline -managed by control plane / storcon', cplane just takes the value out of its db -when needed 3) uniformity. It makes storage_controller responsible for retrying notifying -control plane until it succeeds. +start path 2) uniformity. It makes storage_controller responsible for retrying +notifying control plane until it succeeds. -So, cplane `/notify-safekeepers` for the timeline accepts `Configuration` and -updates it in the db if the provided conf generation is higher (the cplane db -should also store generations for this). Similarly to [`/notify-attach`](https://www.notion.so/neondatabase/Storage-Controller-Control-Plane-interface-6de56dd310a043bfa5c2f5564fa98365), it -should update db which makes the call successful, and then try to schedule -`apply_config` if possible, it is ok if not. storage_controller -should rate limit calling the endpoint, but likely this won't be needed, as migration +It is not needed for the control plane to fully know the `Configuration`. It is +enough for it to only to be aware of the list of safekeepers in the latest +configuration to supply it to compute, plus associated generation number to +protect from stale update requests and to also pass it to compute. + +So, cplane `/notify-safekeepers` for the timeline can accept JSON like +``` +{ + tenant_id: String, + timeline_id: String, + generation: u32, + safekeepers: Vec, +} +``` +where `SafekeeperId` is +``` +{ + node_id: u64, + host: String +} +``` +In principle `host` is redundant, but may be useful for observability. 
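For concreteness, the request body above could be modelled with serde types along the following lines. This is only a sketch assuming serde and serde_json are available; the struct and field names are illustrative, not code from this series.

```
use serde::{Deserialize, Serialize};

#[derive(Debug, Serialize, Deserialize)]
struct SafekeeperInfo {
    node_id: u64,
    // Redundant given node_id, but useful for observability, as noted above.
    host: String,
}

#[derive(Debug, Serialize, Deserialize)]
struct NotifySafekeepersRequest {
    tenant_id: String,
    timeline_id: String,
    // Control plane applies the update only if this generation is higher
    // than the one stored in its database.
    generation: u32,
    safekeepers: Vec<SafekeeperInfo>,
}

fn main() {
    let body = NotifySafekeepersRequest {
        tenant_id: "cf0480929707ee75372337efaa5ecf96".to_string(),
        timeline_id: "112ded66422aa5e953e5440fa5427ac4".to_string(),
        generation: 42,
        safekeepers: vec![SafekeeperInfo {
            node_id: 1,
            host: "safekeeper-1.eu-central-1.aws.neon.tech".to_string(),
        }],
    };
    println!("{}", serde_json::to_string_pretty(&body).unwrap());
}
```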
+ +The request updates list of safekeepers in the db if the provided conf +generation is higher (the cplane db should also store generations for this). +Similarly to +[`/notify-attach`](https://www.notion.so/neondatabase/Storage-Controller-Control-Plane-interface-6de56dd310a043bfa5c2f5564fa98365), +it should update db which makes the call successful, and then try to schedule +`apply_config` if possible, it is ok if not. storage_controller should rate +limit calling the endpoint, but likely this won't be needed, as migration throughput is limited by `pull_timeline`. Timeline (branch) creation in cplane should call storage_controller POST `tenant/:tenant_id/timeline` like it currently does for sharded tenants. -Response should be augmented with `safekeeper_conf: Configuration`. The call -should be retried until succeeds. +Response should be augmented with `safekeepers_generation` and `safekeepers` +fields like described in `/notify-safekeepers` above. Initially (currently) +these fields may be absent; in this case cplane chooses safekeepers on its own +like it currently does. The call should be retried until succeeds. Timeline deletion and tenant deletion in cplane should call appropriate storage_controller endpoints like it currently does for sharded tenants. The calls should be retried until they succeed. +When compute receives safekeepers list from control plane it needs to know the +generation to checked whether it should be updated (note that compute may get +safekeeper list from either cplane or safekeepers). Currently `neon.safekeepers` +GUC is just a comma separates list of `host:port`. Let's prefix it with +`g#:` to this end, so it will look like +``` +g#42:safekeeper-0.eu-central-1.aws.neon.tech:6401,safekeeper-2.eu-central-1.aws.neon.tech:6401,safekeeper-1.eu-central-1.aws.neon.tech:6401 +``` + +To summarize, list of cplane changes: +- per tenant -> per timeline safekeepers management and addition of int `safekeeper_generation` field. +- `/notify-safekeepers` endpoint. +- Branch creation call may return list of safekeepers and when it is + present cplane should adopt it instead of choosing on its own like it does currently. +- `neon.safekeepers` GUC should be prefixed with `g#:`. + ### storage_controller implementation Current 'load everything on startup and keep in memory' easy design is fine. @@ -360,10 +400,10 @@ source safekeeper might fail, which is not a problem if we are going to decomission the node but leaves garbage otherwise. I'd propose in the first version 1) Don't attempt deletion at all if node status is `offline`. 2) If it failed, just issue warning. -And add PUT `/control/v1/safekeepers/:node_id/scrub` endpoint which would find and -remove garbage timelines for manual use. It will 1) list all timelines on the -safekeeper 2) compare each one against configuration storage: if timeline -doesn't exist at all (had been deleted), it can be deleted. Otherwise, it can +And add PUT `/control/v1/safekeepers/:node_id/scrub` endpoint which would find and +remove garbage timelines for manual use. It will 1) list all timelines on the +safekeeper 2) compare each one against configuration storage: if timeline +doesn't exist at all (had been deleted), it can be deleted. Otherwise, it can be deleted under generation number if node is not member of current generation. 
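A minimal sketch of that per-timeline scrub decision follows, with member sets reduced to plain node-id lists for brevity; the function name and signature are assumptions, not part of this RFC.

```
// Hypothetical sketch of the scrub rule described above.
fn can_delete_local_timeline(
    // (sk_set, new_sk_set) from configuration storage; None if the timeline
    // no longer exists there (i.e. it has been deleted).
    stored_members: Option<(&[u64], Option<&[u64]>)>,
    this_node: u64,
) -> bool {
    match stored_members {
        // Timeline is gone from configuration storage: local data is garbage.
        None => true,
        // Otherwise delete only if this safekeeper is in neither member set;
        // the deletion itself must still be guarded by the generation number.
        Some((sk_set, new_sk_set)) => {
            !sk_set.contains(&this_node)
                && !new_sk_set.map_or(false, |s| s.contains(&this_node))
        }
    }
}

fn main() {
    let sk_set: &[u64] = &[1, 2];
    // Node 3 is not in the current set {1, 2}: its local copy can be removed.
    assert!(can_delete_local_timeline(Some((sk_set, None)), 3));
    // Node 2 is still a member: keep the timeline.
    assert!(!can_delete_local_timeline(Some((sk_set, None)), 2));
}
```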
Automating this is untrivial; we'd need to register all potential missing @@ -412,8 +452,8 @@ There should be following layers of tests: 3) Since simulation testing injects at relatively high level points (not syscalls), it omits some code, in particular `pull_timeline`. Thus it is better to have basic tests covering whole system as well. Extended version of - `test_restarts_under_load` would do: start background load and do migration - under it, then restart endpoint and check that no reported commits + `test_restarts_under_load` would do: start background load and do migration + under it, then restart endpoint and check that no reported commits had been lost. I'd also add one more creating classic network split scenario, with one compute talking to AC and another to BD while migration from nodes ABC to ABD happens. @@ -422,35 +462,51 @@ There should be following layers of tests: ## Order of implementation and rollout -Note that +Note that - Control plane parts and integration with it is fully independent from everything else (tests would use simulation and neon_local). +- It is reasonable to make compute <-> safekeepers protocol change + independent of enabling generations. - There is a lot of infra work making storage_controller aware of timelines and safekeepers and its impl/rollout should be separate from migration itself. -- Initially walproposer can just stop working while it observers joint configuration. +- Initially walproposer can just stop working while it observes joint configuration. Such window would be typically very short anyway. +- Obviously we want to test the whole thing thoroughly on staging and only then + gradually enable in prod. -To rollout smoothly, both walproposer and safekeeper should have flag -`configurations_enabled`; when set to false, they would work as currently, i.e. -walproposer is able to commit on whatever safekeeper set it is provided. Until -all timelines are managed by storcon we'd need to use current script to migrate -and update/drop entries in the storage_controller database if it has any. +Let's have the following implementation bits for gradual rollout: +- compute gets `neon.safekeepers_proto_version` flag. + Initially both compute and safekeepers will be able to talk both + versions so that we can delay force restart of them and for + simplicity of rollback in case it is needed. +- storcon gets `-set-safekeepers` config option disabled by + default. Timeline creation request chooses safekeepers + (and returns them in response to cplane) only when it is set to + true. +- control_plane [see above](storage_controller-<->-control-plane interface-and-changes) + prefixes `neon.safekeepers` GUC with generation number. When it is 0 + (or prefix not present at all), walproposer behaves as currently, committing on + the provided safekeeper list -- generations are disabled. + If it is non 0 it follows this RFC rules. +- We provide a script for manual migration to storage controller. + It selects timeline(s) from control plane (specified or all of them) db + and calls special import endpoint on storage controller which is very + similar to timeline creation: it inserts into the db, sets + configuration to initial on the safekeepers, calls cplane + `notify-safekeepers`. -Safekeepers would need to be able to talk both current and new protocol version -with compute to reduce number of computes restarted in prod once v2 protocol is -deployed (though before completely switching we'd need to force this). 
- -Let's have the following rollout order: -- storage_controller becomes aware of safekeepers; -- storage_controller gets timeline creation for new timelines and deletion requests, but - doesn't manage all timelines yet. Migration can be tested on these new timelines. - To keep control plane and storage_controller databases in sync while control - plane still chooses the safekeepers initially (until all timelines are imported - it can choose better), `TimelineCreateRequest` can get optional safekeepers - field with safekeepers chosen by cplane. -- Then we can import all existing timelines from control plane to - storage_controller and gradually enable configurations region by region. +Then the rollout for a region would be: +- Current situation: safekeepers are choosen by control_plane. +- We manually migrate some timelines, test moving them around. +- Then we enable `--set-safekeepers` so that all new timelines + are on storage controller. +- Finally migrate all existing timelines using the script (no + compute should be speaking old proto version at this point). +Until all timelines are managed by storcon we'd need to use current ad hoc +script to migrate if needed. To keep state clean, all storage controller managed +timelines must be migrated before that, or controller db and configurations +state of safekeepers dropped manually. Very rough implementation order: - Add concept of configurations to safekeepers (including control file), @@ -458,10 +514,10 @@ Very rough implementation order: - Implement walproposer changes, including protocol. - Implement storconn part. Use it in neon_local (and pytest). - Make cplane store safekeepers per timeline instead of per tenant. -- Implement cplane/storcon integration. Route branch creation/deletion +- Implement cplane/storcon integration. Route branch creation/deletion through storcon. Then we can test migration of new branches. -- Finally import existing branches. Then we can drop cplane - safekeeper selection code. Gradually enable configurations at +- Finally import existing branches. Then we can drop cplane + safekeeper selection code. Gradually enable configurations at computes and safekeepers. Before that, all computes must talk only v3 protocol version. From 2d0ea085244208a8b0238b69c7dc5a4f93890d90 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Wed, 15 Jan 2025 12:45:58 +0300 Subject: [PATCH 189/583] Add safekeeper membership conf to control file. (#10196) ## Problem https://github.com/neondatabase/neon/issues/9965 ## Summary of changes Add safekeeper membership configuration struct itself and storing it in the control file. In passing also add creation timestamp to the control file (there were cases where I wanted it in the past). Remove obsolete unused PersistedPeerInfo struct from control file (still keep it control_file_upgrade.rs to have it in old upgrade code). Remove the binary representation of cfile in the roundtrip test. Updating it is annoying, and we still test the actual roundtrip. Also add configuration to timeline creation http request, currently used only in one python test. In passing, slightly change LSNs meaning in the request: normally start_lsn is passed (the same as ancestor_start_lsn in similar pageserver call), but we allow specifying higher commit_lsn for manual intervention if needed. Also when given LSN initialize term_history with it. 
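For illustration, a creation request using the new fields might be assembled as in the sketch below. This assumes the workspace crates (`safekeeper_api`, `utils`) as changed by this patch are in scope; the host name and LSN are made-up values.

```
use std::str::FromStr;

use safekeeper_api::membership::{Configuration, MemberSet, SafekeeperId, INITIAL_GENERATION};
use safekeeper_api::models::TimelineCreateRequest;
use utils::id::{NodeId, TenantId, TimelineId};
use utils::lsn::Lsn;

fn example_create_request() -> TimelineCreateRequest {
    let members = MemberSet::new(vec![SafekeeperId {
        id: NodeId(1),
        host: "safekeeper-1.local".to_string(),
        pg_port: 5432,
    }])
    .expect("no duplicate members");
    TimelineCreateRequest {
        tenant_id: TenantId::from_str("cf0480929707ee75372337efaa5ecf96").unwrap(),
        timeline_id: TimelineId::from_str("112ded66422aa5e953e5440fa5427ac4").unwrap(),
        mconf: Configuration {
            generation: INITIAL_GENERATION,
            members,
            new_members: None,
        },
        pg_version: 170000,
        system_id: None,
        wal_seg_size: None,
        // start_lsn initializes all LSNs (timeline_start_lsn, local_start_lsn, etc.).
        start_lsn: Lsn(0x2000000),
        // Omitted for normal creation; may be set above start_lsn for manual
        // recovery (see test_s3_wal_replay).
        commit_lsn: None,
    }
}

fn main() {
    let req = example_create_request();
    println!("creating timeline {} with mconf {}", req.timeline_id, req.mconf);
}
```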
--- Cargo.lock | 2 + libs/safekeeper_api/Cargo.toml | 2 + libs/safekeeper_api/src/lib.rs | 5 +- libs/safekeeper_api/src/membership.rs | 164 ++++++++++++++++++ libs/safekeeper_api/src/models.rs | 13 +- safekeeper/src/control_file.rs | 20 ++- safekeeper/src/control_file_upgrade.rs | 145 ++++++++++++++-- safekeeper/src/copy_timeline.rs | 5 +- safekeeper/src/http/routes.rs | 13 +- safekeeper/src/json_ctrl.rs | 2 + safekeeper/src/receive_wal.rs | 9 +- safekeeper/src/safekeeper.rs | 117 +++---------- safekeeper/src/state.rs | 69 ++++---- safekeeper/src/timelines_global_map.rs | 7 +- .../tests/walproposer_sim/safekeeper.rs | 11 +- test_runner/fixtures/pageserver/http.py | 11 +- test_runner/fixtures/safekeeper/http.py | 46 +++-- test_runner/fixtures/utils.py | 18 ++ test_runner/regress/test_wal_acceptor.py | 10 +- 19 files changed, 477 insertions(+), 192 deletions(-) create mode 100644 libs/safekeeper_api/src/membership.rs diff --git a/Cargo.lock b/Cargo.lock index 1e29f4fc08..f543057933 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5707,10 +5707,12 @@ dependencies = [ name = "safekeeper_api" version = "0.1.0" dependencies = [ + "anyhow", "const_format", "postgres_ffi", "pq_proto", "serde", + "serde_json", "tokio", "utils", ] diff --git a/libs/safekeeper_api/Cargo.toml b/libs/safekeeper_api/Cargo.toml index 4234ec6779..7652c3d413 100644 --- a/libs/safekeeper_api/Cargo.toml +++ b/libs/safekeeper_api/Cargo.toml @@ -5,8 +5,10 @@ edition.workspace = true license.workspace = true [dependencies] +anyhow.workspace = true const_format.workspace = true serde.workspace = true +serde_json.workspace = true postgres_ffi.workspace = true pq_proto.workspace = true tokio.workspace = true diff --git a/libs/safekeeper_api/src/lib.rs b/libs/safekeeper_api/src/lib.rs index be6923aca9..fa86523ad7 100644 --- a/libs/safekeeper_api/src/lib.rs +++ b/libs/safekeeper_api/src/lib.rs @@ -4,12 +4,15 @@ use const_format::formatcp; use pq_proto::SystemId; use serde::{Deserialize, Serialize}; +pub mod membership; /// Public API types pub mod models; /// Consensus logical timestamp. Note: it is a part of sk control file. pub type Term = u64; -pub const INVALID_TERM: Term = 0; +/// With this term timeline is created initially. It +/// is a normal term except wp is never elected with it. +pub const INITIAL_TERM: Term = 0; /// Information about Postgres. Safekeeper gets it once and then verifies all /// further connections from computes match. Note: it is a part of sk control diff --git a/libs/safekeeper_api/src/membership.rs b/libs/safekeeper_api/src/membership.rs new file mode 100644 index 0000000000..fe30204545 --- /dev/null +++ b/libs/safekeeper_api/src/membership.rs @@ -0,0 +1,164 @@ +//! Types defining safekeeper membership, see +//! rfcs/035-safekeeper-dynamic-membership-change.md +//! for details. + +use std::{collections::HashSet, fmt::Display}; + +use anyhow; +use anyhow::bail; +use serde::{Deserialize, Serialize}; +use utils::id::NodeId; + +/// Number uniquely identifying safekeeper configuration. +/// Note: it is a part of sk control file. +pub type Generation = u32; +/// 1 is the first valid generation, 0 is used as +/// a placeholder before we fully migrate to generations. +pub const INVALID_GENERATION: Generation = 0; +pub const INITIAL_GENERATION: Generation = 1; + +/// Membership is defined by ids so e.g. walproposer uses them to figure out +/// quorums, but we also carry host and port to give wp idea where to connect. 
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct SafekeeperId { + pub id: NodeId, + pub host: String, + pub pg_port: u16, +} + +impl Display for SafekeeperId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "[id={}, ep={}:{}]", self.id, self.host, self.pg_port) + } +} + +/// Set of safekeepers. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[serde(transparent)] +pub struct MemberSet { + pub members: Vec, +} + +impl MemberSet { + pub fn empty() -> Self { + MemberSet { + members: Vec::new(), + } + } + + pub fn new(members: Vec) -> anyhow::Result { + let hs: HashSet = HashSet::from_iter(members.iter().map(|sk| sk.id)); + if hs.len() != members.len() { + bail!("duplicate safekeeper id in the set {:?}", members); + } + Ok(MemberSet { members }) + } + + pub fn contains(&self, sk: &SafekeeperId) -> bool { + self.members.iter().any(|m| m.id == sk.id) + } + + pub fn add(&mut self, sk: SafekeeperId) -> anyhow::Result<()> { + if self.contains(&sk) { + bail!(format!( + "sk {} is already member of the set {}", + sk.id, self + )); + } + self.members.push(sk); + Ok(()) + } +} + +impl Display for MemberSet { + /// Display as a comma separated list of members. + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let sks_str = self + .members + .iter() + .map(|m| m.to_string()) + .collect::>(); + write!(f, "({})", sks_str.join(", ")) + } +} + +/// Safekeeper membership configuration. +/// Note: it is a part of both control file and http API. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct Configuration { + /// Unique id. + pub generation: Generation, + /// Current members of the configuration. + pub members: MemberSet, + /// Some means it is a joint conf. + pub new_members: Option, +} + +impl Configuration { + /// Used for pre-generations timelines, will be removed eventually. 
+ pub fn empty() -> Self { + Configuration { + generation: INVALID_GENERATION, + members: MemberSet::empty(), + new_members: None, + } + } +} + +impl Display for Configuration { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "gen={}, members={}, new_members={}", + self.generation, + self.members, + self.new_members + .as_ref() + .map(ToString::to_string) + .unwrap_or(String::from("none")) + ) + } +} + +#[cfg(test)] +mod tests { + use super::{MemberSet, SafekeeperId}; + use utils::id::NodeId; + + #[test] + fn test_member_set() { + let mut members = MemberSet::empty(); + members + .add(SafekeeperId { + id: NodeId(42), + host: String::from("lala.org"), + pg_port: 5432, + }) + .unwrap(); + + members + .add(SafekeeperId { + id: NodeId(42), + host: String::from("lala.org"), + pg_port: 5432, + }) + .expect_err("duplicate must not be allowed"); + + members + .add(SafekeeperId { + id: NodeId(43), + host: String::from("bubu.org"), + pg_port: 5432, + }) + .unwrap(); + + println!("members: {}", members); + + let j = serde_json::to_string(&members).expect("failed to serialize"); + println!("members json: {}", j); + assert_eq!( + j, + r#"[{"id":42,"host":"lala.org","pg_port":5432},{"id":43,"host":"bubu.org","pg_port":5432}]"# + ); + } +} diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs index 3e424a792c..ad38986357 100644 --- a/libs/safekeeper_api/src/models.rs +++ b/libs/safekeeper_api/src/models.rs @@ -11,7 +11,7 @@ use utils::{ pageserver_feedback::PageserverFeedback, }; -use crate::{ServerInfo, Term}; +use crate::{membership::Configuration, ServerInfo, Term}; #[derive(Debug, Serialize)] pub struct SafekeeperStatus { @@ -22,13 +22,16 @@ pub struct SafekeeperStatus { pub struct TimelineCreateRequest { pub tenant_id: TenantId, pub timeline_id: TimelineId, - pub peer_ids: Option>, + pub mconf: Configuration, pub pg_version: u32, pub system_id: Option, + // By default WAL_SEGMENT_SIZE pub wal_seg_size: Option, - pub commit_lsn: Lsn, - // If not passed, it is assigned to the beginning of commit_lsn segment. - pub local_start_lsn: Option, + pub start_lsn: Lsn, + // Normal creation should omit this field (start_lsn initializes all LSNs). + // However, we allow specifying custom value higher than start_lsn for + // manual recovery case, see test_s3_wal_replay. 
+ pub commit_lsn: Option, } /// Same as TermLsn, but serializes LSN using display serializer diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index 06e5afbf74..e92ca881e1 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -3,6 +3,7 @@ use anyhow::{bail, ensure, Context, Result}; use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; use camino::{Utf8Path, Utf8PathBuf}; +use safekeeper_api::membership::INVALID_GENERATION; use tokio::fs::File; use tokio::io::AsyncWriteExt; use utils::crashsafe::durable_rename; @@ -13,14 +14,14 @@ use std::ops::Deref; use std::path::Path; use std::time::Instant; -use crate::control_file_upgrade::downgrade_v9_to_v8; +use crate::control_file_upgrade::downgrade_v10_to_v9; use crate::control_file_upgrade::upgrade_control_file; use crate::metrics::PERSIST_CONTROL_FILE_SECONDS; use crate::state::{EvictionState, TimelinePersistentState}; use utils::bin_ser::LeSer; pub const SK_MAGIC: u32 = 0xcafeceefu32; -pub const SK_FORMAT_VERSION: u32 = 9; +pub const SK_FORMAT_VERSION: u32 = 10; // contains persistent metadata for safekeeper pub const CONTROL_FILE_NAME: &str = "safekeeper.control"; @@ -169,10 +170,11 @@ impl TimelinePersistentState { let mut buf: Vec = Vec::new(); WriteBytesExt::write_u32::(&mut buf, SK_MAGIC)?; - if self.eviction_state == EvictionState::Present { - // temp hack for forward compatibility - const PREV_FORMAT_VERSION: u32 = 8; - let prev = downgrade_v9_to_v8(self); + if self.mconf.generation == INVALID_GENERATION { + // Temp hack for forward compatibility test: in case of none + // configuration save cfile in previous v9 format. + const PREV_FORMAT_VERSION: u32 = 9; + let prev = downgrade_v10_to_v9(self); WriteBytesExt::write_u32::(&mut buf, PREV_FORMAT_VERSION)?; prev.ser_into(&mut buf)?; } else { @@ -233,6 +235,7 @@ impl Storage for FileStorage { #[cfg(test)] mod test { use super::*; + use safekeeper_api::membership::{Configuration, MemberSet}; use tokio::fs; use utils::lsn::Lsn; @@ -242,6 +245,11 @@ mod test { async fn test_read_write_safekeeper_state() -> anyhow::Result<()> { let tempdir = camino_tempfile::tempdir()?; let mut state = TimelinePersistentState::empty(); + state.mconf = Configuration { + generation: 42, + members: MemberSet::empty(), + new_members: None, + }; let mut storage = FileStorage::create_new(tempdir.path(), state.clone(), NO_SYNC).await?; // Make a change. diff --git a/safekeeper/src/control_file_upgrade.rs b/safekeeper/src/control_file_upgrade.rs index dd152fd4cc..904e79f976 100644 --- a/safekeeper/src/control_file_upgrade.rs +++ b/safekeeper/src/control_file_upgrade.rs @@ -1,17 +1,22 @@ //! 
Code to deal with safekeeper control file upgrades +use std::vec; + use crate::{ safekeeper::{AcceptorState, PgUuid, TermHistory, TermLsn}, - state::{EvictionState, PersistedPeers, TimelinePersistentState}, + state::{EvictionState, TimelinePersistentState}, wal_backup_partial, }; use anyhow::{bail, Result}; use pq_proto::SystemId; -use safekeeper_api::{ServerInfo, Term}; +use safekeeper_api::{ + membership::{Configuration, INVALID_GENERATION}, + ServerInfo, Term, +}; use serde::{Deserialize, Serialize}; use tracing::*; use utils::{ bin_ser::LeSer, - id::{TenantId, TimelineId}, + id::{NodeId, TenantId, TimelineId}, lsn::Lsn, }; @@ -233,6 +238,90 @@ pub struct SafeKeeperStateV8 { pub partial_backup: wal_backup_partial::State, } +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct PersistedPeers(pub Vec<(NodeId, PersistedPeerInfo)>); + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct PersistedPeerInfo { + /// LSN up to which safekeeper offloaded WAL to s3. + pub backup_lsn: Lsn, + /// Term of the last entry. + pub term: Term, + /// LSN of the last record. + pub flush_lsn: Lsn, + /// Up to which LSN safekeeper regards its WAL as committed. + pub commit_lsn: Lsn, +} + +impl PersistedPeerInfo { + pub fn new() -> Self { + Self { + backup_lsn: Lsn::INVALID, + term: safekeeper_api::INITIAL_TERM, + flush_lsn: Lsn(0), + commit_lsn: Lsn(0), + } + } +} + +// make clippy happy +impl Default for PersistedPeerInfo { + fn default() -> Self { + Self::new() + } +} + +/// Note: SafekeeperStateVn is old name for TimelinePersistentStateVn. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct TimelinePersistentStateV9 { + #[serde(with = "hex")] + pub tenant_id: TenantId, + #[serde(with = "hex")] + pub timeline_id: TimelineId, + /// persistent acceptor state + pub acceptor_state: AcceptorState, + /// information about server + pub server: ServerInfo, + /// Unique id of the last *elected* proposer we dealt with. Not needed + /// for correctness, exists for monitoring purposes. + #[serde(with = "hex")] + pub proposer_uuid: PgUuid, + /// Since which LSN this timeline generally starts. Safekeeper might have + /// joined later. + pub timeline_start_lsn: Lsn, + /// Since which LSN safekeeper has (had) WAL for this timeline. + /// All WAL segments next to one containing local_start_lsn are + /// filled with data from the beginning. + pub local_start_lsn: Lsn, + /// Part of WAL acknowledged by quorum *and available locally*. Always points + /// to record boundary. + pub commit_lsn: Lsn, + /// LSN that points to the end of the last backed up segment. Useful to + /// persist to avoid finding out offloading progress on boot. + pub backup_lsn: Lsn, + /// Minimal LSN which may be needed for recovery of some safekeeper (end_lsn + /// of last record streamed to everyone). Persisting it helps skipping + /// recovery in walproposer, generally we compute it from peers. In + /// walproposer proto called 'truncate_lsn'. Updates are currently drived + /// only by walproposer. + pub peer_horizon_lsn: Lsn, + /// LSN of the oldest known checkpoint made by pageserver and successfully + /// pushed to s3. We don't remove WAL beyond it. Persisted only for + /// informational purposes, we receive it from pageserver (or broker). + pub remote_consistent_lsn: Lsn, + /// Peers and their state as we remember it. Knowing peers themselves is + /// fundamental; but state is saved here only for informational purposes and + /// obviously can be stale. 
(Currently not saved at all, but let's provision + /// place to have less file version upgrades). + pub peers: PersistedPeers, + /// Holds names of partial segments uploaded to remote storage. Used to + /// clean up old objects without leaving garbage in remote storage. + pub partial_backup: wal_backup_partial::State, + /// Eviction state of the timeline. If it's Offloaded, we should download + /// WAL files from remote storage to serve the timeline. + pub eviction_state: EvictionState, +} + pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result { // migrate to storing full term history if version == 1 { @@ -248,6 +337,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result Result Result Result Result Result Result Result Result Result Result Result Result SafeKeeperStateV8 { - assert!(state.eviction_state == EvictionState::Present); - SafeKeeperStateV8 { +// Used as a temp hack to make forward compatibility test work. Should be +// removed after PR adding v10 is merged. +pub fn downgrade_v10_to_v9(state: &TimelinePersistentState) -> TimelinePersistentStateV9 { + assert!(state.mconf.generation == INVALID_GENERATION); + TimelinePersistentStateV9 { tenant_id: state.tenant_id, timeline_id: state.timeline_id, acceptor_state: state.acceptor_state.clone(), @@ -426,8 +542,9 @@ pub fn downgrade_v9_to_v8(state: &TimelinePersistentState) -> SafeKeeperStateV8 backup_lsn: state.backup_lsn, peer_horizon_lsn: state.peer_horizon_lsn, remote_consistent_lsn: state.remote_consistent_lsn, - peers: state.peers.clone(), + peers: PersistedPeers(vec![]), partial_backup: state.partial_backup.clone(), + eviction_state: state.eviction_state, } } @@ -437,7 +554,7 @@ mod tests { use utils::{id::NodeId, Hex}; - use crate::safekeeper::PersistedPeerInfo; + use crate::control_file_upgrade::PersistedPeerInfo; use super::*; diff --git a/safekeeper/src/copy_timeline.rs b/safekeeper/src/copy_timeline.rs index 28ef2b1d23..10a761e1f5 100644 --- a/safekeeper/src/copy_timeline.rs +++ b/safekeeper/src/copy_timeline.rs @@ -1,6 +1,7 @@ use anyhow::{bail, Result}; use camino::Utf8PathBuf; use postgres_ffi::{MAX_SEND_SIZE, WAL_SEGMENT_SIZE}; +use safekeeper_api::membership::Configuration; use std::sync::Arc; use tokio::{ fs::OpenOptions, @@ -147,10 +148,10 @@ pub async fn handle_request( let mut new_state = TimelinePersistentState::new( &request.destination_ttid, + Configuration::empty(), state.server.clone(), - vec![], - request.until_lsn, start_lsn, + request.until_lsn, )?; new_state.timeline_start_lsn = start_lsn; new_state.peer_horizon_lsn = request.until_lsn; diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 6186f4c3ba..3835d39698 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -111,14 +111,15 @@ async fn timeline_create_handler(mut request: Request) -> Result NetworkReader<'_, IO> { }; let tli = self .global_timelines - .create(self.ttid, server_info, Lsn::INVALID, Lsn::INVALID) + .create( + self.ttid, + Configuration::empty(), + server_info, + Lsn::INVALID, + Lsn::INVALID, + ) .await .context("create timeline")?; tli.wal_residence_guard().await? 
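The membership RFC earlier in this series has the walproposer treat a configuration as acknowledged only once it hears from a majority of the current member set and, for a joint configuration, also from a majority of the new set. A minimal quorum check over the `Configuration` type introduced above could look like the sketch below; the helper names are assumptions, and the workspace crates are assumed to be in scope.

```
use std::collections::HashSet;

use safekeeper_api::membership::{Configuration, MemberSet};
use utils::id::NodeId;

// Strict majority of one member set.
fn set_has_majority(set: &MemberSet, acks: &HashSet<NodeId>) -> bool {
    let acked = set.members.iter().filter(|m| acks.contains(&m.id)).count();
    acked * 2 > set.members.len()
}

// True if `acks` forms a quorum: a majority of the current member set and,
// for a joint configuration, also a majority of the new member set.
fn is_quorum(conf: &Configuration, acks: &HashSet<NodeId>) -> bool {
    set_has_majority(&conf.members, acks)
        && conf
            .new_members
            .as_ref()
            .map_or(true, |new| set_has_majority(new, acks))
}
```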
diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 6ceaf325b0..06403228e9 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -7,7 +7,6 @@ use bytes::{Buf, BufMut, Bytes, BytesMut}; use postgres_ffi::{TimeLineID, MAX_SEND_SIZE}; use safekeeper_api::models::HotStandbyFeedback; use safekeeper_api::Term; -use safekeeper_api::INVALID_TERM; use serde::{Deserialize, Serialize}; use std::cmp::max; use std::cmp::min; @@ -193,36 +192,6 @@ impl AcceptorState { } } -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] -pub struct PersistedPeerInfo { - /// LSN up to which safekeeper offloaded WAL to s3. - pub backup_lsn: Lsn, - /// Term of the last entry. - pub term: Term, - /// LSN of the last record. - pub flush_lsn: Lsn, - /// Up to which LSN safekeeper regards its WAL as committed. - pub commit_lsn: Lsn, -} - -impl PersistedPeerInfo { - pub fn new() -> Self { - Self { - backup_lsn: Lsn::INVALID, - term: INVALID_TERM, - flush_lsn: Lsn(0), - commit_lsn: Lsn(0), - } - } -} - -// make clippy happy -impl Default for PersistedPeerInfo { - fn default() -> Self { - Self::new() - } -} - // protocol messages /// Initial Proposer -> Acceptor message @@ -1010,7 +979,7 @@ where /// Update commit_lsn from peer safekeeper data. pub async fn record_safekeeper_info(&mut self, sk_info: &SafekeeperTimelineInfo) -> Result<()> { - if (Lsn(sk_info.commit_lsn) != Lsn::INVALID) && (sk_info.last_log_term != INVALID_TERM) { + if Lsn(sk_info.commit_lsn) != Lsn::INVALID { // Note: the check is too restrictive, generally we can update local // commit_lsn if our history matches (is part of) history of advanced // commit_lsn provider. @@ -1025,12 +994,20 @@ where #[cfg(test)] mod tests { use futures::future::BoxFuture; + use postgres_ffi::{XLogSegNo, WAL_SEGMENT_SIZE}; - use safekeeper_api::ServerInfo; + use safekeeper_api::{ + membership::{Configuration, MemberSet, SafekeeperId}, + ServerInfo, + }; use super::*; - use crate::state::{EvictionState, PersistedPeers, TimelinePersistentState}; - use std::{ops::Deref, str::FromStr, time::Instant}; + use crate::state::{EvictionState, TimelinePersistentState}; + use std::{ + ops::Deref, + str::FromStr, + time::{Instant, UNIX_EPOCH}, + }; // fake storage for tests struct InMemoryState { @@ -1313,12 +1290,21 @@ mod tests { #[test] fn test_sk_state_bincode_serde_roundtrip() { - use utils::Hex; let tenant_id = TenantId::from_str("cf0480929707ee75372337efaa5ecf96").unwrap(); let timeline_id = TimelineId::from_str("112ded66422aa5e953e5440fa5427ac4").unwrap(); let state = TimelinePersistentState { tenant_id, timeline_id, + mconf: Configuration { + generation: 42, + members: MemberSet::new(vec![SafekeeperId { + id: NodeId(1), + host: "hehe.org".to_owned(), + pg_port: 5432, + }]) + .expect("duplicate member"), + new_members: None, + }, acceptor_state: AcceptorState { term: 42, term_history: TermHistory(vec![TermLsn { @@ -1342,70 +1328,13 @@ mod tests { backup_lsn: Lsn(1234567300), peer_horizon_lsn: Lsn(9999999), remote_consistent_lsn: Lsn(1234560000), - peers: PersistedPeers(vec![( - NodeId(1), - PersistedPeerInfo { - backup_lsn: Lsn(1234567000), - term: 42, - flush_lsn: Lsn(1234567800 - 8), - commit_lsn: Lsn(1234567600), - }, - )]), partial_backup: crate::wal_backup_partial::State::default(), eviction_state: EvictionState::Present, + creation_ts: UNIX_EPOCH, }; let ser = state.ser().unwrap(); - #[rustfmt::skip] - let expected = [ - // tenant_id as length prefixed hex - 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x63, 0x66, 
0x30, 0x34, 0x38, 0x30, 0x39, 0x32, 0x39, 0x37, 0x30, 0x37, 0x65, 0x65, 0x37, 0x35, 0x33, 0x37, 0x32, 0x33, 0x33, 0x37, 0x65, 0x66, 0x61, 0x61, 0x35, 0x65, 0x63, 0x66, 0x39, 0x36, - // timeline_id as length prefixed hex - 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x31, 0x31, 0x32, 0x64, 0x65, 0x64, 0x36, 0x36, 0x34, 0x32, 0x32, 0x61, 0x61, 0x35, 0x65, 0x39, 0x35, 0x33, 0x65, 0x35, 0x34, 0x34, 0x30, 0x66, 0x61, 0x35, 0x34, 0x32, 0x37, 0x61, 0x63, 0x34, - // term - 0x2a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - // length prefix - 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - // unsure why this order is swapped - 0x29, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - // pg_version - 0x0e, 0x00, 0x00, 0x00, - // systemid - 0x21, 0x43, 0x65, 0x87, 0x78, 0x56, 0x34, 0x12, - // wal_seg_size - 0x78, 0x56, 0x34, 0x12, - // pguuid as length prefixed hex - 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x63, 0x34, 0x37, 0x61, 0x34, 0x32, 0x61, 0x35, 0x30, 0x66, 0x34, 0x34, 0x65, 0x35, 0x35, 0x33, 0x65, 0x39, 0x61, 0x35, 0x32, 0x61, 0x34, 0x32, 0x36, 0x36, 0x65, 0x64, 0x32, 0x64, 0x31, 0x31, - - // timeline_start_lsn - 0x00, 0x56, 0x34, 0x12, 0x00, 0x00, 0x00, 0x00, - 0x12, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x78, 0x02, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00, - 0x84, 0x00, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00, - 0x7f, 0x96, 0x98, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0xe4, 0x95, 0x49, 0x00, 0x00, 0x00, 0x00, - // length prefix for persistentpeers - 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - // nodeid - 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - // backuplsn - 0x58, 0xff, 0x95, 0x49, 0x00, 0x00, 0x00, 0x00, - 0x2a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x70, 0x02, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00, - 0xb0, 0x01, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00, - // partial_backup - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - // eviction_state - 0x00, 0x00, 0x00, 0x00, - ]; - - assert_eq!(Hex(&ser), Hex(&expected)); - let deser = TimelinePersistentState::des(&ser).unwrap(); assert_eq!(deser, state); diff --git a/safekeeper/src/state.rs b/safekeeper/src/state.rs index c6ae6c1d2b..1c3bb1b4dc 100644 --- a/safekeeper/src/state.rs +++ b/safekeeper/src/state.rs @@ -1,20 +1,22 @@ //! Defines per timeline data stored persistently (SafeKeeperPersistentState) //! and its wrapper with in memory layer (SafekeeperState). -use std::{cmp::max, ops::Deref}; +use std::{cmp::max, ops::Deref, time::SystemTime}; use anyhow::{bail, Result}; use postgres_ffi::WAL_SEGMENT_SIZE; -use safekeeper_api::{models::TimelineTermBumpResponse, ServerInfo, Term}; +use safekeeper_api::{ + membership::Configuration, models::TimelineTermBumpResponse, ServerInfo, Term, INITIAL_TERM, +}; use serde::{Deserialize, Serialize}; use utils::{ - id::{NodeId, TenantId, TenantTimelineId, TimelineId}, + id::{TenantId, TenantTimelineId, TimelineId}, lsn::Lsn, }; use crate::{ control_file, - safekeeper::{AcceptorState, PersistedPeerInfo, PgUuid, TermHistory, UNKNOWN_SERVER_VERSION}, + safekeeper::{AcceptorState, PgUuid, TermHistory, TermLsn, UNKNOWN_SERVER_VERSION}, timeline::TimelineError, wal_backup_partial::{self}, }; @@ -27,6 +29,8 @@ pub struct TimelinePersistentState { pub tenant_id: TenantId, #[serde(with = "hex")] pub timeline_id: TimelineId, + /// Membership configuration. + pub mconf: Configuration, /// persistent acceptor state pub acceptor_state: AcceptorState, /// information about server @@ -58,22 +62,15 @@ pub struct TimelinePersistentState { /// pushed to s3. 
We don't remove WAL beyond it. Persisted only for /// informational purposes, we receive it from pageserver (or broker). pub remote_consistent_lsn: Lsn, - /// Peers and their state as we remember it. Knowing peers themselves is - /// fundamental; but state is saved here only for informational purposes and - /// obviously can be stale. (Currently not saved at all, but let's provision - /// place to have less file version upgrades). - pub peers: PersistedPeers, /// Holds names of partial segments uploaded to remote storage. Used to /// clean up old objects without leaving garbage in remote storage. pub partial_backup: wal_backup_partial::State, /// Eviction state of the timeline. If it's Offloaded, we should download /// WAL files from remote storage to serve the timeline. pub eviction_state: EvictionState, + pub creation_ts: SystemTime, } -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] -pub struct PersistedPeers(pub Vec<(NodeId, PersistedPeerInfo)>); - /// State of the local WAL files. Used to track current timeline state, /// that can be either WAL files are present on disk or last partial segment /// is offloaded to remote storage. @@ -87,12 +84,14 @@ pub enum EvictionState { } impl TimelinePersistentState { + /// commit_lsn is the same as start_lsn in the normal creaiton; see + /// `TimelineCreateRequest` comments.` pub fn new( ttid: &TenantTimelineId, + mconf: Configuration, server_info: ServerInfo, - peers: Vec, + start_lsn: Lsn, commit_lsn: Lsn, - local_start_lsn: Lsn, ) -> anyhow::Result { if server_info.wal_seg_size == 0 { bail!(TimelineError::UninitializedWalSegSize(*ttid)); @@ -102,49 +101,59 @@ impl TimelinePersistentState { bail!(TimelineError::UninitialinzedPgVersion(*ttid)); } - if commit_lsn < local_start_lsn { + if commit_lsn < start_lsn { bail!( - "commit_lsn {} is smaller than local_start_lsn {}", + "commit_lsn {} is smaller than start_lsn {}", commit_lsn, - local_start_lsn + start_lsn ); } + // If we are given with init LSN, initialize term history with it. It + // ensures that walproposer always must be able to find a common point + // in histories; if it can't something is corrupted. Not having LSN here + // is so far left for legacy case where timeline is created by compute + // and LSN during creation is not known yet. 
+ let term_history = if commit_lsn != Lsn::INVALID { + TermHistory(vec![TermLsn { + term: INITIAL_TERM, + lsn: start_lsn, + }]) + } else { + TermHistory::empty() + }; + Ok(TimelinePersistentState { tenant_id: ttid.tenant_id, timeline_id: ttid.timeline_id, + mconf, acceptor_state: AcceptorState { - term: 0, - term_history: TermHistory::empty(), + term: INITIAL_TERM, + term_history, }, server: server_info, proposer_uuid: [0; 16], - timeline_start_lsn: Lsn(0), - local_start_lsn, + timeline_start_lsn: start_lsn, + local_start_lsn: start_lsn, commit_lsn, - backup_lsn: local_start_lsn, - peer_horizon_lsn: local_start_lsn, + backup_lsn: start_lsn, + peer_horizon_lsn: start_lsn, remote_consistent_lsn: Lsn(0), - peers: PersistedPeers( - peers - .iter() - .map(|p| (*p, PersistedPeerInfo::new())) - .collect(), - ), partial_backup: wal_backup_partial::State::default(), eviction_state: EvictionState::Present, + creation_ts: SystemTime::now(), }) } pub fn empty() -> Self { TimelinePersistentState::new( &TenantTimelineId::empty(), + Configuration::empty(), ServerInfo { pg_version: 170000, /* Postgres server version (major * 10000) */ system_id: 0, /* Postgres system identifier */ wal_seg_size: WAL_SEGMENT_SIZE as u32, }, - vec![], Lsn::INVALID, Lsn::INVALID, ) diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs index ad29c9f66c..a701534f65 100644 --- a/safekeeper/src/timelines_global_map.rs +++ b/safekeeper/src/timelines_global_map.rs @@ -12,6 +12,7 @@ use crate::{control_file, wal_storage, SafeKeeperConf}; use anyhow::{bail, Context, Result}; use camino::Utf8PathBuf; use camino_tempfile::Utf8TempDir; +use safekeeper_api::membership::Configuration; use safekeeper_api::ServerInfo; use serde::Serialize; use std::collections::HashMap; @@ -214,9 +215,10 @@ impl GlobalTimelines { pub(crate) async fn create( &self, ttid: TenantTimelineId, + mconf: Configuration, server_info: ServerInfo, + start_lsn: Lsn, commit_lsn: Lsn, - local_start_lsn: Lsn, ) -> Result> { let (conf, _, _) = { let state = self.state.lock().unwrap(); @@ -239,8 +241,7 @@ impl GlobalTimelines { // TODO: currently we create only cfile. It would be reasonable to // immediately initialize first WAL segment as well. 
- let state = - TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn)?; + let state = TimelinePersistentState::new(&ttid, mconf, server_info, start_lsn, commit_lsn)?; control_file::FileStorage::create_new(&tmp_dir_path, state, conf.no_sync).await?; let timeline = self.load_temp_timeline(ttid, &tmp_dir_path, true).await?; Ok(timeline) diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs index efcdd89e7d..a99de71a04 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper.rs @@ -21,7 +21,7 @@ use safekeeper::{ wal_storage::Storage, SafeKeeperConf, }; -use safekeeper_api::ServerInfo; +use safekeeper_api::{membership::Configuration, ServerInfo}; use tracing::{debug, info_span, warn}; use utils::{ id::{NodeId, TenantId, TenantTimelineId, TimelineId}, @@ -96,8 +96,13 @@ impl GlobalMap { let commit_lsn = Lsn::INVALID; let local_start_lsn = Lsn::INVALID; - let state = - TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn)?; + let state = TimelinePersistentState::new( + &ttid, + Configuration::empty(), + server_info, + commit_lsn, + local_start_lsn, + )?; let disk_timeline = self.disk.put_state(&ttid, state); let control_store = DiskStateStorage::new(disk_timeline.clone()); diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 378e568622..364aff325d 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -15,7 +15,6 @@ from requests.adapters import HTTPAdapter from urllib3.util.retry import Retry from fixtures.common_types import ( - Id, Lsn, TenantId, TenantShardId, @@ -25,7 +24,7 @@ from fixtures.common_types import ( from fixtures.log_helper import log from fixtures.metrics import Metrics, MetricsGetter, parse_metrics from fixtures.pg_version import PgVersion -from fixtures.utils import Fn +from fixtures.utils import EnhancedJSONEncoder, Fn class PageserverApiException(Exception): @@ -83,14 +82,6 @@ class TimelineCreateRequest: mode: TimelineCreateRequestMode def to_json(self) -> str: - class EnhancedJSONEncoder(json.JSONEncoder): - def default(self, o): - if dataclasses.is_dataclass(o) and not isinstance(o, type): - return dataclasses.asdict(o) - elif isinstance(o, Id): - return o.id.hex() - return super().default(o) - # mode is flattened this = dataclasses.asdict(self) mode = this.pop("mode") diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py index 286f80ba69..4826cae3ee 100644 --- a/test_runner/fixtures/safekeeper/http.py +++ b/test_runner/fixtures/safekeeper/http.py @@ -10,7 +10,7 @@ import requests from fixtures.common_types import Lsn, TenantId, TenantTimelineId, TimelineId from fixtures.log_helper import log from fixtures.metrics import Metrics, MetricsGetter, parse_metrics -from fixtures.utils import wait_until +from fixtures.utils import EnhancedJSONEncoder, wait_until if TYPE_CHECKING: from typing import Any @@ -69,6 +69,34 @@ class TermBumpResponse: ) +@dataclass +class SafekeeperId: + id: int + host: str + pg_port: str + + +@dataclass +class Configuration: + generation: int + members: list[SafekeeperId] + new_members: list[SafekeeperId] | None + + +@dataclass +class TimelineCreateRequest: + tenant_id: TenantId + timeline_id: TimelineId + mconf: Configuration + # not exactly PgVersion, for example 150002 for 15.2 + pg_version: int + start_lsn: Lsn + commit_lsn: Lsn | None + + def 
to_json(self) -> str: + return json.dumps(self, cls=EnhancedJSONEncoder) + + class SafekeeperHttpClient(requests.Session, MetricsGetter): HTTPError = requests.HTTPError @@ -131,20 +159,8 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): resj = res.json() return [TenantTimelineId.from_json(ttidj) for ttidj in resj] - def timeline_create( - self, - tenant_id: TenantId, - timeline_id: TimelineId, - pg_version: int, # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2 - commit_lsn: Lsn, - ): - body = { - "tenant_id": str(tenant_id), - "timeline_id": str(timeline_id), - "pg_version": pg_version, - "commit_lsn": str(commit_lsn), - } - res = self.post(f"http://localhost:{self.port}/v1/tenant/timeline", json=body) + def timeline_create(self, r: TimelineCreateRequest): + res = self.post(f"http://localhost:{self.port}/v1/tenant/timeline", data=r.to_json()) res.raise_for_status() def timeline_status( diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index c34ac298d1..e160c617cd 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -1,6 +1,7 @@ from __future__ import annotations import contextlib +import dataclasses import json import os import re @@ -21,6 +22,7 @@ import zstandard from psycopg2.extensions import cursor from typing_extensions import override +from fixtures.common_types import Id, Lsn from fixtures.log_helper import log from fixtures.pageserver.common_types import ( parse_delta_layer, @@ -605,6 +607,22 @@ class PropagatingThread(threading.Thread): return self.ret +class EnhancedJSONEncoder(json.JSONEncoder): + """ + Default json.JSONEncoder works only on primitive builtins. Extend it to any + dataclass plus our custom types. + """ + + def default(self, o): + if dataclasses.is_dataclass(o) and not isinstance(o, type): + return dataclasses.asdict(o) + elif isinstance(o, Id): + return o.id.hex() + elif isinstance(o, Lsn): + return str(o) # standard hex notation + return super().default(o) + + def human_bytes(amt: float) -> str: """ Render a bytes amount into nice IEC bytes string. diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 0a8900b351..d39c6a6b5b 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -48,7 +48,7 @@ from fixtures.remote_storage import ( default_remote_storage, s3_storage, ) -from fixtures.safekeeper.http import SafekeeperHttpClient +from fixtures.safekeeper.http import Configuration, SafekeeperHttpClient, TimelineCreateRequest from fixtures.safekeeper.utils import wait_walreceivers_absent from fixtures.utils import ( PropagatingThread, @@ -658,7 +658,13 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder): for sk in env.safekeepers: sk.start() cli = sk.http_client() - cli.timeline_create(tenant_id, timeline_id, pg_version, last_lsn) + mconf = Configuration(generation=0, members=[], new_members=None) + # set start_lsn to the beginning of the first segment to allow reading + # WAL from there (could you intidb LSN as well). 
+ r = TimelineCreateRequest( + tenant_id, timeline_id, mconf, pg_version, Lsn("0/1000000"), commit_lsn=last_lsn + ) + cli.timeline_create(r) f_partial_path = ( Path(sk.data_dir) / str(tenant_id) / str(timeline_id) / f_partial_saved.name ) From 05d17a10aee949b56317bc1ecaf24b76b097b9d2 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Wed, 15 Jan 2025 11:35:38 +0100 Subject: [PATCH 190/583] rfc: add CPU and heap profiling RFC (#10085) This document proposes a standard cross-team pattern for CPU and memory profiling across applications and languages, using the [pprof](https://github.com/google/pprof) profile format. It enables both ad hoc profiles via HTTP endpoints, and continuous profiling across the fleet via [Grafana Cloud Profiles](https://grafana.com/docs/grafana-cloud/monitor-applications/profiles/). Continuous profiling incurs an overhead of about 0.1% CPU usage and 3% slower heap allocations. [Rendered](https://github.com/neondatabase/neon/blob/erik/profiling-rfc/docs/rfcs/040-profiling.md) Touches #9534. Touches https://github.com/neondatabase/cloud/issues/14888. --- docs/rfcs/040-profiling.md | 247 +++++++++++++++++++++++++++++++++++++ 1 file changed, 247 insertions(+) create mode 100644 docs/rfcs/040-profiling.md diff --git a/docs/rfcs/040-profiling.md b/docs/rfcs/040-profiling.md new file mode 100644 index 0000000000..8da9e50774 --- /dev/null +++ b/docs/rfcs/040-profiling.md @@ -0,0 +1,247 @@ +# CPU and Memory Profiling + +Created 2025-01-12 by Erik Grinaker. + +See also [internal user guide](https://www.notion.so/neondatabase/Storage-CPU-Memory-Profiling-14bf189e004780228ec7d04442742324?pvs=4). + +## Summary + +This document proposes a standard cross-team pattern for CPU and memory profiling across +applications and languages, using the [pprof](https://github.com/google/pprof) profile format. + +It enables both ad hoc profiles via HTTP endpoints, and continuous profiling across the fleet via +[Grafana Cloud Profiles](https://grafana.com/docs/grafana-cloud/monitor-applications/profiles/). +Continuous profiling incurs an overhead of about 0.1% CPU usage and 3% slower heap allocations. + +## Motivation + +CPU and memory profiles are crucial observability tools for understanding performance issues, +resource exhaustion, and resource costs. They allow answering questions like: + +* Why is this process using 100% CPU? +* How do I make this go faster? +* Why did this process run out of memory? +* Why are we paying for all these CPU cores and memory chips? + +Go has [first-class support](https://pkg.go.dev/net/http/pprof) for profiling included in its +standard library, using the [pprof](https://github.com/google/pprof) profile format and associated +tooling. + +This is not the case for Rust and C, where obtaining profiles can be rather cumbersome. It requires +installing and running additional tools like `perf` as root on production nodes, with analysis tools +that can be hard to use and often don't give good results. This is not only annoying, but can also +significantly affect the resolution time of production incidents. + +This proposal will: + +* Provide CPU and heap profiles in pprof format via HTTP API. +* Record continuous profiles in Grafana for aggregate historical analysis. +* Make it easy for anyone to see a flamegraph in less than one minute. +* Be reasonably consistent across teams and services (Rust, Go, C). 
+ +## Non Goals (For Now) + +* [Additional profile types](https://grafana.com/docs/pyroscope/next/configure-client/profile-types/) + like mutexes, locks, goroutines, etc. +* [Runtime trace integration](https://grafana.com/docs/pyroscope/next/configure-client/trace-span-profiles/). +* [Profile-guided optimization](https://en.wikipedia.org/wiki/Profile-guided_optimization). + +## Using Profiles + +Ready-to-use profiles can be obtained using e.g. `curl`. For Rust services: + +``` +$ curl localhost:9898/profile/cpu >profile.pb.gz +``` + +pprof profiles can be explored using the [`pprof`](https://github.com/google/pprof) web UI, which +provides flamegraphs, call graphs, plain text listings, and more: + +``` +$ pprof -http :6060 +``` + +Some endpoints (e.g. Rust-based ones) can also generate flamegraph SVGs directly: + +``` +$ curl localhost:9898/profile/cpu?format=svg >profile.svg +$ open profile.svg +``` + +Continuous profiles are available in Grafana under Explore → Profiles → Explore Profiles +(currently only in [staging](https://neonstaging.grafana.net/a/grafana-pyroscope-app/profiles-explorer)). + +## API Requirements + +* HTTP endpoints that return a profile in pprof format (with symbols). + * CPU: records a profile over the request time interval (`seconds` query parameter). + * Memory: returns the current in-use heap allocations. +* Unauthenticated, as it should not expose user data or pose a denial-of-service risk. +* Default sample frequency should not impact service (maximum 5% CPU overhead). +* Linux-compatibility. + +Nice to have: + +* Return flamegraph SVG directly from the HTTP endpoint if requested. +* Configurable sample frequency for CPU profiles. +* Historical heap allocations, by count and bytes. +* macOS-compatiblity. + +## Rust Profiling + +[`libs/utils/src/http/endpoint.rs`](https://github.com/neondatabase/neon/blob/8327f68043e692c77f70d6a6dafa463636c01578/libs/utils/src/http/endpoint.rs) +contains ready-to-use HTTP endpoints for CPU and memory profiling: +[`profile_cpu_handler`](https://github.com/neondatabase/neon/blob/8327f68043e692c77f70d6a6dafa463636c01578/libs/utils/src/http/endpoint.rs#L338) and [`profile_heap_handler`](https://github.com/neondatabase/neon/blob/8327f68043e692c77f70d6a6dafa463636c01578/libs/utils/src/http/endpoint.rs#L416). + +### CPU + +CPU profiles are provided by [pprof-rs](https://github.com/tikv/pprof-rs) via +[`profile_cpu_handler`](https://github.com/neondatabase/neon/blob/8327f68043e692c77f70d6a6dafa463636c01578/libs/utils/src/http/endpoint.rs#L338). +Expose it unauthenticated at `/profile/cpu`. + +Parameters: + +* `format`: profile output format (`pprof` or `svg`; default `pprof`). +* `seconds`: duration to collect profile over, in seconds (default `5`). +* `frequency`: how often to sample thread stacks, in Hz (default `99`). +* `force`: if `true`, cancel a running profile and start a new one (default `false`). + +Works on Linux and macOS. 
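+
+For example, combining the parameters above to take a 10-second profile at 99 Hz and render it
+directly as a flamegraph (port and endpoint as in the examples earlier in this document):
+
+```
+$ curl "localhost:9898/profile/cpu?seconds=10&frequency=99&format=svg" >profile.svg
+```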
+ +### Memory + +Use the jemalloc allocator via [`tikv-jemallocator`](https://github.com/tikv/jemallocator), +and enable profiling with samples every 2 MB allocated: + +```rust +#[global_allocator] +static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; + +#[allow(non_upper_case_globals)] +#[export_name = "malloc_conf"] +pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0"; +``` + +pprof profiles are generated by +[`jemalloc-pprof`](https://github.com/polarsignals/rust-jemalloc-pprof) via +[`profile_heap_handler`](https://github.com/neondatabase/neon/blob/8327f68043e692c77f70d6a6dafa463636c01578/libs/utils/src/http/endpoint.rs#L416). +Expose it unauthenticated at `/profile/heap`. + +Parameters: + +* `format`: profile output format (`pprof`, `svg`, or `jemalloc`; default `pprof`). + +Works on Linux only, due to [jemalloc limitations](https://github.com/jemalloc/jemalloc/issues/26). + +## Go Profiling + +The Go standard library includes pprof profiling via HTTP API in +[`net/http/pprof`](https://pkg.go.dev/net/http/pprof). Expose it unauthenticated at +`/debug/pprof`. + +Works on Linux and macOS. + +### CPU + +Via `/debug/pprof/profile`. Parameters: + +* `debug`: profile output format (`0` is pprof, `1` or above is plaintext; default `0`). +* `seconds`: duration to collect profile over, in seconds (default `30`). + +Does not support a frequency parameter (see [#57488](https://github.com/golang/go/issues/57488)), +and defaults to 100 Hz. A lower frequency can be hardcoded via `SetCPUProfileRate`, but the default +is likely ok (estimated 1% overhead). + +### Memory + +Via `/debug/pprof/heap`. Parameters: + +* `seconds`: take a delta profile over the given duration, in seconds (default `0`). +* `gc`: if `1`, garbage collect before taking profile. + +## C Profiling + +[gperftools](https://github.com/gperftools/gperftools) provides in-process CPU and heap profiling +with pprof output. + +However, continuous profiling of PostgreSQL is expensive (many computes), and has limited value +since we don't own the internals anyway. + +Ad hoc profiling might still be useful, but the compute team considers existing tooling sufficient, +so this is not a priority at the moment. + +## Grafana Continuous Profiling + +[Grafana Alloy](https://grafana.com/docs/alloy/latest/) continually scrapes CPU and memory profiles +across the fleet, and archives them as time series. This can be used to analyze resource usage over +time, either in aggregate or zoomed in to specific events and nodes. + +Profiles are retained for 30 days. Profile ingestion volume for CPU+heap at 60-second intervals +is about 0.5 GB/node/day, or about $0.25/node/day = $7.5/node/month ($0.50/GB). + +It is currently enabled in [staging](https://neonstaging.grafana.net/a/grafana-pyroscope-app/profiles-explorer) +for Pageserver and Safekeeper. + +### Scraping + +* CPU profiling: 59 seconds at 19 Hz every 60 seconds. +* Heap profiling: heap snapshot with 2 MB frequency every 60 seconds. + +There are two main approaches that can be taken for CPU profiles: + +* Continuous low-frequency profiles (e.g. 19 Hz for 60 seconds every 60 seconds). +* Occasional high-frequency profiles (e.g. 99 Hz for 5 seconds every 60 seconds). + +We choose continuous low-frequency profiles where possible. This has a fixed low overhead, instead +of a spiky high overhead. It likely also gives a more representative view of resource usage. 
+However, a 19 Hz rate gives a minimum resolution of 52.6 ms per sample, which may be larger than the
+actual runtime of small functions. Note that Go does not support a frequency parameter, so we must
+use a fixed frequency for all profiles via `SetCPUProfileRate()` (default 100 Hz).
+
+Only one CPU profile can be taken at a time. With continuous profiling, one will always be running.
+To allow also taking an ad hoc CPU profile, the Rust endpoint supports a `force` query parameter to
+cancel a running profile and start a new one.
+
+### Overhead
+
+With Rust:
+
+* CPU profiles at 19 Hz frequency: 0.1% overhead.
+* Heap profiles at 2 MB frequency: 3% allocation overhead.
+* Profile call/encoding/symbolization: 20 ms every 60 seconds, or 0.03% of 1 CPU (for Pageserver).
+* Profile symbolization caches: 125 MB memory, or 0.4% of 32 GB (for Pageserver).
+
+Benchmarks with pprof-rs showed that the CPU time for taking a stack trace of a 40-frame stack was
+11 µs using the `frame-pointer` feature, and 1.4 µs using `libunwind` with DWARF. `libunwind` saw
+frequent seg faults, so we use `frame-pointer` and build binaries with frame pointers (negligible
+overhead).
+
+CPU profiles work by installing an `ITIMER_PROF` for the process, which triggers a `SIGPROF` signal
+after a given amount of cumulative CPU time across all CPUs. The signal handler will run for one
+of the currently executing threads and take a stack trace. Thus, a 19 Hz profile will take 1 stack
+trace every 52.6 ms CPU time -- assuming 11 µs for a stack trace, this is 0.02% overhead, but
+likely 0.1% in practice (given e.g. context switches).
+
+Heap profiles work by probabilistically taking a stack trace on allocations, adjusted for the
+allocation size. A 1 MB allocation takes about 15 µs in benchmarks, and a stack trace about 1 µs,
+so we can estimate that a 2 MB sampling frequency has about 3% allocation overhead -- this is
+consistent with benchmarks. This is significantly larger than CPU profiles, but mitigated by the
+fact that performance-sensitive code will avoid allocations as far as possible.
+
+Profile symbolization uses in-memory caches for symbol lookups. These take about 125 MB for
+Pageserver.
+
+## Alternatives Considered
+
+* eBPF profiles.
+  * Don't require instrumenting the binary.
+  * Use fewer resources.
+  * Can profile in kernel space too.
+  * Supported by Grafana.
+  * Less information about stack frames and spans.
+  * Limited tooling for local analysis.
+  * Does not support heap profiles.
+  * Does not work on macOS.
+
+* [Polar Signals](https://www.polarsignals.com) instead of Grafana.
+  * We already use Grafana for everything else. Appears good enough.

From 157743040896e59214260a78542899c07638f94d Mon Sep 17 00:00:00 2001
From: Vlad Lazar
Date: Wed, 15 Jan 2025 11:10:24 +0000
Subject: [PATCH 191/583] safekeeper: decode and interpret for multiple shards in one go (#10201)

## Problem

Currently, we call `InterpretedWalRecord::from_bytes_filtered` from each shard.
To serve multiple shards at the same time, the API needs to allow for enquiring
about multiple shards.

## Summary of changes

This commit tweaks it in a pretty brute-force way. Naively, we could just
generate the shard for a key, but pre and post split shards may be
subscribed at the same time, so doing it efficiently is more complex.
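For illustration, the new call shape looks roughly like this (signature and types as introduced in
the diff below; `recdata`, the LSN, and the shard identities are assumed to be in scope, and error
handling is elided):

```rust
// One decode/interpret pass now serves every subscribed shard:
// `from_bytes_filtered` takes a slice of shard identities and returns a map
// keyed by shard, with one `InterpretedWalRecord` per shard.
let shards = vec![shard_a, shard_b];
let mut per_shard: HashMap<ShardIdentity, InterpretedWalRecord> =
    InterpretedWalRecord::from_bytes_filtered(recdata, &shards, next_record_lsn, pg_version)?;
let record_for_a = per_shard.remove(&shard_a).unwrap();
```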
--- Cargo.lock | 9 + libs/pageserver_api/src/shard.rs | 21 +- libs/postgres_ffi/src/walrecord.rs | 14 +- libs/wal_decoder/Cargo.toml | 15 ++ libs/wal_decoder/benches/README.md | 34 +++ .../benches/bench_interpret_wal.rs | 250 ++++++++++++++++++ libs/wal_decoder/src/decoder.rs | 140 ++++++---- libs/wal_decoder/src/models.rs | 54 ++-- libs/wal_decoder/src/serialized_batch.rs | 191 ++++++------- pageserver/src/import_datadir.rs | 15 +- .../walreceiver/walreceiver_connection.rs | 8 +- pageserver/src/walingest.rs | 4 +- safekeeper/src/send_interpreted_wal.rs | 8 +- 13 files changed, 574 insertions(+), 189 deletions(-) create mode 100644 libs/wal_decoder/benches/README.md create mode 100644 libs/wal_decoder/benches/bench_interpret_wal.rs diff --git a/Cargo.lock b/Cargo.lock index f543057933..0669899617 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7560,12 +7560,21 @@ dependencies = [ "anyhow", "async-compression", "bytes", + "camino", + "camino-tempfile", + "criterion", + "futures", "pageserver_api", "postgres_ffi", + "pprof", "prost", + "remote_storage", "serde", + "serde_json", "thiserror", + "tikv-jemallocator", "tokio", + "tokio-util", "tonic", "tonic-build", "tracing", diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index 4cc0a739e8..e03df02afb 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -31,6 +31,8 @@ //! - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive), //! and their slugs are 0004, 0104, 0204, and 0304. +use std::hash::{Hash, Hasher}; + use crate::{key::Key, models::ShardParameters}; use postgres_ffi::relfile_utils::INIT_FORKNUM; use serde::{Deserialize, Serialize}; @@ -48,6 +50,23 @@ pub struct ShardIdentity { layout: ShardLayout, } +/// Hash implementation +/// +/// The stripe size cannot change dynamically, so it can be ignored for efficiency reasons. 
+impl Hash for ShardIdentity { + fn hash(&self, state: &mut H) { + let ShardIdentity { + number, + count, + stripe_size: _, + layout: _, + } = self; + + number.0.hash(state); + count.0.hash(state); + } +} + /// Stripe size in number of pages #[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)] pub struct ShardStripeSize(pub u32); @@ -59,7 +78,7 @@ impl Default for ShardStripeSize { } /// Layout version: for future upgrades where we might change how the key->shard mapping works -#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)] +#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Hash, Debug)] pub struct ShardLayout(u8); const LAYOUT_V1: ShardLayout = ShardLayout(1); diff --git a/libs/postgres_ffi/src/walrecord.rs b/libs/postgres_ffi/src/walrecord.rs index b32106632a..fce37e2fdd 100644 --- a/libs/postgres_ffi/src/walrecord.rs +++ b/libs/postgres_ffi/src/walrecord.rs @@ -16,7 +16,7 @@ use utils::bin_ser::DeserializeError; use utils::lsn::Lsn; #[repr(C)] -#[derive(Debug, Serialize, Deserialize)] +#[derive(Clone, Debug, Serialize, Deserialize)] pub struct XlMultiXactCreate { pub mid: MultiXactId, /* new MultiXact's ID */ @@ -46,7 +46,7 @@ impl XlMultiXactCreate { } #[repr(C)] -#[derive(Debug, Serialize, Deserialize)] +#[derive(Clone, Debug, Serialize, Deserialize)] pub struct XlMultiXactTruncate { pub oldest_multi_db: Oid, /* to-be-truncated range of multixact offsets */ @@ -72,7 +72,7 @@ impl XlMultiXactTruncate { } #[repr(C)] -#[derive(Debug, Serialize, Deserialize)] +#[derive(Clone, Debug, Serialize, Deserialize)] pub struct XlRelmapUpdate { pub dbid: Oid, /* database ID, or 0 for shared map */ pub tsid: Oid, /* database's tablespace, or pg_global */ @@ -90,7 +90,7 @@ impl XlRelmapUpdate { } #[repr(C)] -#[derive(Debug, Serialize, Deserialize)] +#[derive(Clone, Debug, Serialize, Deserialize)] pub struct XlReploriginDrop { pub node_id: RepOriginId, } @@ -104,7 +104,7 @@ impl XlReploriginDrop { } #[repr(C)] -#[derive(Debug, Serialize, Deserialize)] +#[derive(Clone, Debug, Serialize, Deserialize)] pub struct XlReploriginSet { pub remote_lsn: Lsn, pub node_id: RepOriginId, @@ -911,7 +911,7 @@ impl XlSmgrCreate { } #[repr(C)] -#[derive(Debug, Serialize, Deserialize)] +#[derive(Clone, Debug, Serialize, Deserialize)] pub struct XlSmgrTruncate { pub blkno: BlockNumber, pub rnode: RelFileNode, @@ -984,7 +984,7 @@ impl XlDropDatabase { /// xl_xact_parsed_abort structs in PostgreSQL, but we use the same /// struct for commits and aborts. 
/// -#[derive(Debug, Serialize, Deserialize)] +#[derive(Clone, Debug, Serialize, Deserialize)] pub struct XlXactParsedRecord { pub xid: TransactionId, pub info: u8, diff --git a/libs/wal_decoder/Cargo.toml b/libs/wal_decoder/Cargo.toml index 8fac4e38ca..09c4afb18a 100644 --- a/libs/wal_decoder/Cargo.toml +++ b/libs/wal_decoder/Cargo.toml @@ -24,3 +24,18 @@ workspace_hack = { version = "0.1", path = "../../workspace_hack" } [build-dependencies] tonic-build.workspace = true + +[dev-dependencies] +criterion.workspace = true +camino.workspace = true +camino-tempfile.workspace = true +remote_storage.workspace = true +tokio-util.workspace = true +serde_json.workspace = true +futures.workspace = true +tikv-jemallocator.workspace = true +pprof.workspace = true + +[[bench]] +name = "bench_interpret_wal" +harness = false diff --git a/libs/wal_decoder/benches/README.md b/libs/wal_decoder/benches/README.md new file mode 100644 index 0000000000..14885afecf --- /dev/null +++ b/libs/wal_decoder/benches/README.md @@ -0,0 +1,34 @@ +## WAL Decoding and Interpretation Benchmarks + +Note that these benchmarks pull WAL from a public bucket in S3 +as a preparation step. Hence, you need a way to auth with AWS. +You can achieve this by copying the `~/.aws/config` file from +the AWS SSO notion page and exporting `AWS_PROFILE=dev` when invoking +the benchmarks. + +To run benchmarks: + +```sh +aws sso login --profile dev + +# All benchmarks. +AWS_PROFILE=dev cargo bench --package wal_decoder + +# Specific file. +AWS_PROFILE=dev cargo bench --package wal_decoder --bench bench_interpret_wal + +# Specific benchmark. +AWS_PROFILE=dev cargo bench --package wal_decoder --bench bench_interpret_wal unsharded + +# List available benchmarks. +cargo bench --package wal_decoder --benches -- --list + +# Generate flamegraph profiles using pprof-rs, profiling for 10 seconds. +# Output in target/criterion/*/profile/flamegraph.svg. +AWS_PROFILE=dev cargo bench --package wal_decoder --bench bench_interpret_wal unsharded -- --profile-time 10 +``` + +Additional charts and statistics are available in `target/criterion/report/index.html`. + +Benchmarks are automatically compared against the previous run. To compare against other runs, see +`--baseline` and `--save-baseline`. 
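+
+For example, to record a baseline on `main` and compare a branch against it (standard criterion
+flags, used the same way as the commands above):
+
+```sh
+# On main: save a baseline.
+AWS_PROFILE=dev cargo bench --package wal_decoder --bench bench_interpret_wal -- --save-baseline main
+
+# On a branch: compare against the saved baseline.
+AWS_PROFILE=dev cargo bench --package wal_decoder --bench bench_interpret_wal -- --baseline main
+```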
diff --git a/libs/wal_decoder/benches/bench_interpret_wal.rs b/libs/wal_decoder/benches/bench_interpret_wal.rs new file mode 100644 index 0000000000..846904cf87 --- /dev/null +++ b/libs/wal_decoder/benches/bench_interpret_wal.rs @@ -0,0 +1,250 @@ +use anyhow::Context; +use criterion::{criterion_group, criterion_main, Criterion}; +use futures::{stream::FuturesUnordered, StreamExt}; +use pageserver_api::shard::{ShardIdentity, ShardStripeSize}; +use postgres_ffi::{waldecoder::WalStreamDecoder, MAX_SEND_SIZE, WAL_SEGMENT_SIZE}; +use pprof::criterion::{Output, PProfProfiler}; +use serde::Deserialize; +use std::{env, num::NonZeroUsize, sync::Arc}; + +use camino::{Utf8Path, Utf8PathBuf}; +use camino_tempfile::Utf8TempDir; +use remote_storage::{ + DownloadOpts, GenericRemoteStorage, ListingMode, RemoteStorageConfig, RemoteStorageKind, + S3Config, +}; +use tokio_util::sync::CancellationToken; +use utils::{ + lsn::Lsn, + shard::{ShardCount, ShardNumber}, +}; +use wal_decoder::models::InterpretedWalRecord; + +const S3_BUCKET: &str = "neon-github-public-dev"; +const S3_REGION: &str = "eu-central-1"; +const BUCKET_PREFIX: &str = "wal-snapshots/bulk-insert/"; +const METADATA_FILENAME: &str = "metadata.json"; + +/// Use jemalloc, and configure it to sample allocations for profiles every 1 MB. +/// This mirrors the configuration in bin/safekeeper.rs. +#[global_allocator] +static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; + +#[allow(non_upper_case_globals)] +#[export_name = "malloc_conf"] +pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0"; + +async fn create_s3_client() -> anyhow::Result> { + let remote_storage_config = RemoteStorageConfig { + storage: RemoteStorageKind::AwsS3(S3Config { + bucket_name: S3_BUCKET.to_string(), + bucket_region: S3_REGION.to_string(), + prefix_in_bucket: Some(BUCKET_PREFIX.to_string()), + endpoint: None, + concurrency_limit: NonZeroUsize::new(100).unwrap(), + max_keys_per_list_response: None, + upload_storage_class: None, + }), + timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, + small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT, + }; + Ok(Arc::new( + GenericRemoteStorage::from_config(&remote_storage_config) + .await + .context("remote storage init")?, + )) +} + +async fn download_bench_data( + client: Arc, + cancel: &CancellationToken, +) -> anyhow::Result { + let temp_dir_parent: Utf8PathBuf = env::current_dir().unwrap().try_into()?; + let temp_dir = camino_tempfile::tempdir_in(temp_dir_parent)?; + + eprintln!("Downloading benchmark data to {:?}", temp_dir); + + let listing = client + .list(None, ListingMode::NoDelimiter, None, cancel) + .await?; + + let mut downloads = listing + .keys + .into_iter() + .map(|obj| { + let client = client.clone(); + let temp_dir_path = temp_dir.path().to_owned(); + + async move { + let remote_path = obj.key; + let download = client + .download(&remote_path, &DownloadOpts::default(), cancel) + .await?; + let mut body = tokio_util::io::StreamReader::new(download.download_stream); + + let file_name = remote_path.object_name().unwrap(); + let file_path = temp_dir_path.join(file_name); + let file = tokio::fs::OpenOptions::new() + .create(true) + .truncate(true) + .write(true) + .open(&file_path) + .await?; + + let mut writer = tokio::io::BufWriter::new(file); + tokio::io::copy_buf(&mut body, &mut writer).await?; + + Ok::<(), anyhow::Error>(()) + } + }) + .collect::>(); + + while let Some(download) = downloads.next().await { + download?; + } + + Ok(temp_dir) +} + +struct BenchmarkData { + 
wal: Vec, + meta: BenchmarkMetadata, +} + +#[derive(Deserialize)] +struct BenchmarkMetadata { + pg_version: u32, + start_lsn: Lsn, +} + +async fn load_bench_data(path: &Utf8Path, input_size: usize) -> anyhow::Result { + eprintln!("Loading benchmark data from {:?}", path); + + let mut entries = tokio::fs::read_dir(path).await?; + let mut ordered_segment_paths = Vec::new(); + let mut metadata = None; + + while let Some(entry) = entries.next_entry().await? { + if entry.file_name() == METADATA_FILENAME { + let bytes = tokio::fs::read(entry.path()).await?; + metadata = Some( + serde_json::from_slice::(&bytes) + .context("failed to deserialize metadata.json")?, + ); + } else { + ordered_segment_paths.push(entry.path()); + } + } + + ordered_segment_paths.sort(); + + let mut buffer = Vec::new(); + for path in ordered_segment_paths { + if buffer.len() >= input_size { + break; + } + + use async_compression::tokio::bufread::ZstdDecoder; + let file = tokio::fs::File::open(path).await?; + let reader = tokio::io::BufReader::new(file); + let decoder = ZstdDecoder::new(reader); + let mut reader = tokio::io::BufReader::new(decoder); + tokio::io::copy_buf(&mut reader, &mut buffer).await?; + } + + buffer.truncate(input_size); + + Ok(BenchmarkData { + wal: buffer, + meta: metadata.unwrap(), + }) +} + +fn criterion_benchmark(c: &mut Criterion) { + const INPUT_SIZE: usize = 128 * 1024 * 1024; + + let setup_runtime = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap(); + + let (_temp_dir, bench_data) = setup_runtime.block_on(async move { + let cancel = CancellationToken::new(); + let client = create_s3_client().await.unwrap(); + let temp_dir = download_bench_data(client, &cancel).await.unwrap(); + let bench_data = load_bench_data(temp_dir.path(), INPUT_SIZE).await.unwrap(); + + (temp_dir, bench_data) + }); + + eprintln!( + "Benchmarking against {} MiB of WAL", + INPUT_SIZE / 1024 / 1024 + ); + + let mut group = c.benchmark_group("decode-interpret-wal"); + group.throughput(criterion::Throughput::Bytes(bench_data.wal.len() as u64)); + group.sample_size(10); + + group.bench_function("unsharded", |b| { + b.iter(|| decode_interpret_main(&bench_data, &[ShardIdentity::unsharded()])) + }); + + let eight_shards = (0..8) + .map(|i| ShardIdentity::new(ShardNumber(i), ShardCount(8), ShardStripeSize(8)).unwrap()) + .collect::>(); + + group.bench_function("8/8-shards", |b| { + b.iter(|| decode_interpret_main(&bench_data, &eight_shards)) + }); + + let four_shards = eight_shards + .into_iter() + .filter(|s| s.number.0 % 2 == 0) + .collect::>(); + group.bench_function("4/8-shards", |b| { + b.iter(|| decode_interpret_main(&bench_data, &four_shards)) + }); + + let two_shards = four_shards + .into_iter() + .filter(|s| s.number.0 % 4 == 0) + .collect::>(); + group.bench_function("2/8-shards", |b| { + b.iter(|| decode_interpret_main(&bench_data, &two_shards)) + }); +} + +fn decode_interpret_main(bench: &BenchmarkData, shards: &[ShardIdentity]) { + let r = decode_interpret(bench, shards); + if let Err(e) = r { + panic!("{e:?}"); + } +} + +fn decode_interpret(bench: &BenchmarkData, shard: &[ShardIdentity]) -> anyhow::Result<()> { + let mut decoder = WalStreamDecoder::new(bench.meta.start_lsn, bench.meta.pg_version); + let xlogoff: usize = bench.meta.start_lsn.segment_offset(WAL_SEGMENT_SIZE); + + for chunk in bench.wal[xlogoff..].chunks(MAX_SEND_SIZE) { + decoder.feed_bytes(chunk); + while let Some((lsn, recdata)) = decoder.poll_decode().unwrap() { + assert!(lsn.is_aligned()); + let _ = 
InterpretedWalRecord::from_bytes_filtered( + recdata, + shard, + lsn, + bench.meta.pg_version, + ) + .unwrap(); + } + } + + Ok(()) +} +criterion_group!( + name=benches; + config=Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None))); + targets=criterion_benchmark +); +criterion_main!(benches); diff --git a/libs/wal_decoder/src/decoder.rs b/libs/wal_decoder/src/decoder.rs index aa50c62911..ebb38ceb52 100644 --- a/libs/wal_decoder/src/decoder.rs +++ b/libs/wal_decoder/src/decoder.rs @@ -1,6 +1,8 @@ //! This module contains logic for decoding and interpreting //! raw bytes which represent a raw Postgres WAL record. +use std::collections::HashMap; + use crate::models::*; use crate::serialized_batch::SerializedValueBatch; use bytes::{Buf, Bytes}; @@ -14,15 +16,15 @@ use utils::lsn::Lsn; impl InterpretedWalRecord { /// Decode and interpreted raw bytes which represent one Postgres WAL record. - /// Data blocks which do not match the provided shard identity are filtered out. + /// Data blocks which do not match any of the provided shard identities are filtered out. /// Shard 0 is a special case since it tracks all relation sizes. We only give it /// the keys that are being written as that is enough for updating relation sizes. pub fn from_bytes_filtered( buf: Bytes, - shard: &ShardIdentity, + shards: &[ShardIdentity], next_record_lsn: Lsn, pg_version: u32, - ) -> anyhow::Result { + ) -> anyhow::Result> { let mut decoded = DecodedWALRecord::default(); decode_wal_record(buf, &mut decoded, pg_version)?; let xid = decoded.xl_xid; @@ -33,43 +35,57 @@ impl InterpretedWalRecord { FlushUncommittedRecords::No }; - let metadata_record = - MetadataRecord::from_decoded_filtered(&decoded, shard, next_record_lsn, pg_version)?; - let batch = SerializedValueBatch::from_decoded_filtered( + let mut shard_records: HashMap = + HashMap::with_capacity(shards.len()); + for shard in shards { + shard_records.insert( + *shard, + InterpretedWalRecord { + metadata_record: None, + batch: SerializedValueBatch::default(), + next_record_lsn, + flush_uncommitted, + xid, + }, + ); + } + + MetadataRecord::from_decoded_filtered( + &decoded, + &mut shard_records, + next_record_lsn, + pg_version, + )?; + SerializedValueBatch::from_decoded_filtered( decoded, - shard, + &mut shard_records, next_record_lsn, pg_version, )?; - Ok(InterpretedWalRecord { - metadata_record, - batch, - next_record_lsn, - flush_uncommitted, - xid, - }) + Ok(shard_records) } } impl MetadataRecord { - /// Builds a metadata record for this WAL record, if any. + /// Populates the given `shard_records` with metadata records from this WAL record, if any, + /// discarding those belonging to other shards. /// - /// Only metadata records relevant for the given shard are emitted. Currently, most metadata + /// Only metadata records relevant for the given shards is emitted. Currently, most metadata /// records are broadcast to all shards for simplicity, but this should be improved. fn from_decoded_filtered( decoded: &DecodedWALRecord, - shard: &ShardIdentity, + shard_records: &mut HashMap, next_record_lsn: Lsn, pg_version: u32, - ) -> anyhow::Result> { + ) -> anyhow::Result<()> { // Note: this doesn't actually copy the bytes since // the [`Bytes`] type implements it via a level of indirection. let mut buf = decoded.record.clone(); buf.advance(decoded.main_data_offset); // First, generate metadata records from the decoded WAL record. 
- let mut metadata_record = match decoded.xl_rmid { + let metadata_record = match decoded.xl_rmid { pg_constants::RM_HEAP_ID | pg_constants::RM_HEAP2_ID => { Self::decode_heapam_record(&mut buf, decoded, pg_version)? } @@ -112,41 +128,65 @@ impl MetadataRecord { }; // Next, filter the metadata record by shard. - match metadata_record { - Some( - MetadataRecord::Heapam(HeapamRecord::ClearVmBits(ref mut clear_vm_bits)) - | MetadataRecord::Neonrmgr(NeonrmgrRecord::ClearVmBits(ref mut clear_vm_bits)), - ) => { - // Route VM page updates to the shards that own them. VM pages are stored in the VM fork - // of the main relation. These are sharded and managed just like regular relation pages. - // See: https://github.com/neondatabase/neon/issues/9855 - let is_local_vm_page = |heap_blk| { - let vm_blk = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blk); - shard.is_key_local(&rel_block_to_key(clear_vm_bits.vm_rel, vm_blk)) - }; - // Send the old and new VM page updates to their respective shards. - clear_vm_bits.old_heap_blkno = clear_vm_bits - .old_heap_blkno - .filter(|&blkno| is_local_vm_page(blkno)); - clear_vm_bits.new_heap_blkno = clear_vm_bits - .new_heap_blkno - .filter(|&blkno| is_local_vm_page(blkno)); - // If neither VM page belongs to this shard, discard the record. - if clear_vm_bits.old_heap_blkno.is_none() && clear_vm_bits.new_heap_blkno.is_none() - { - metadata_record = None + for (shard, record) in shard_records.iter_mut() { + match metadata_record { + Some( + MetadataRecord::Heapam(HeapamRecord::ClearVmBits(ref clear_vm_bits)) + | MetadataRecord::Neonrmgr(NeonrmgrRecord::ClearVmBits(ref clear_vm_bits)), + ) => { + // Route VM page updates to the shards that own them. VM pages are stored in the VM fork + // of the main relation. These are sharded and managed just like regular relation pages. + // See: https://github.com/neondatabase/neon/issues/9855 + let is_local_vm_page = |heap_blk| { + let vm_blk = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blk); + shard.is_key_local(&rel_block_to_key(clear_vm_bits.vm_rel, vm_blk)) + }; + // Send the old and new VM page updates to their respective shards. + let updated_old_heap_blkno = clear_vm_bits + .old_heap_blkno + .filter(|&blkno| is_local_vm_page(blkno)); + let updated_new_heap_blkno = clear_vm_bits + .new_heap_blkno + .filter(|&blkno| is_local_vm_page(blkno)); + // If neither VM page belongs to this shard, discard the record. + if updated_old_heap_blkno.is_some() || updated_new_heap_blkno.is_some() { + // Clone the record and update it for the current shard. + let mut for_shard = metadata_record.clone(); + match for_shard { + Some( + MetadataRecord::Heapam(HeapamRecord::ClearVmBits( + ref mut clear_vm_bits, + )) + | MetadataRecord::Neonrmgr(NeonrmgrRecord::ClearVmBits( + ref mut clear_vm_bits, + )), + ) => { + clear_vm_bits.old_heap_blkno = updated_old_heap_blkno; + clear_vm_bits.new_heap_blkno = updated_new_heap_blkno; + record.metadata_record = for_shard; + } + _ => { + unreachable!("for_shard is a clone of what we checked above") + } + } + } + } + Some(MetadataRecord::LogicalMessage(LogicalMessageRecord::Put(_))) => { + // Filter LogicalMessage records (AUX files) to only be stored on shard zero + if shard.is_shard_zero() { + record.metadata_record = metadata_record; + // No other shards should receive this record, so we stop traversing shards early. + break; + } + } + _ => { + // All other metadata records are sent to all shards. 
+ record.metadata_record = metadata_record.clone(); } } - Some(MetadataRecord::LogicalMessage(LogicalMessageRecord::Put(_))) => { - // Filter LogicalMessage records (AUX files) to only be stored on shard zero - if !shard.is_shard_zero() { - metadata_record = None; - } - } - _ => {} } - Ok(metadata_record) + Ok(()) } fn decode_heapam_record( diff --git a/libs/wal_decoder/src/models.rs b/libs/wal_decoder/src/models.rs index 6576dd0eba..8bfa48faac 100644 --- a/libs/wal_decoder/src/models.rs +++ b/libs/wal_decoder/src/models.rs @@ -48,7 +48,7 @@ pub mod proto { tonic::include_proto!("interpreted_wal"); } -#[derive(Serialize, Deserialize)] +#[derive(Copy, Clone, Serialize, Deserialize)] pub enum FlushUncommittedRecords { Yes, No, @@ -107,7 +107,7 @@ impl InterpretedWalRecord { /// The interpreted part of the Postgres WAL record which requires metadata /// writes to the underlying storage engine. -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub enum MetadataRecord { Heapam(HeapamRecord), Neonrmgr(NeonrmgrRecord), @@ -123,12 +123,12 @@ pub enum MetadataRecord { Replorigin(ReploriginRecord), } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub enum HeapamRecord { ClearVmBits(ClearVmBits), } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub struct ClearVmBits { pub new_heap_blkno: Option, pub old_heap_blkno: Option, @@ -136,29 +136,29 @@ pub struct ClearVmBits { pub flags: u8, } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub enum NeonrmgrRecord { ClearVmBits(ClearVmBits), } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub enum SmgrRecord { Create(SmgrCreate), Truncate(XlSmgrTruncate), } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub struct SmgrCreate { pub rel: RelTag, } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub enum DbaseRecord { Create(DbaseCreate), Drop(DbaseDrop), } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub struct DbaseCreate { pub db_id: Oid, pub tablespace_id: Oid, @@ -166,32 +166,32 @@ pub struct DbaseCreate { pub src_tablespace_id: Oid, } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub struct DbaseDrop { pub db_id: Oid, pub tablespace_ids: Vec, } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub enum ClogRecord { ZeroPage(ClogZeroPage), Truncate(ClogTruncate), } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub struct ClogZeroPage { pub segno: u32, pub rpageno: u32, } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub struct ClogTruncate { pub pageno: u32, pub oldest_xid: TransactionId, pub oldest_xid_db: Oid, } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub enum XactRecord { Commit(XactCommon), Abort(XactCommon), @@ -200,7 +200,7 @@ pub enum XactRecord { Prepare(XactPrepare), } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub struct XactCommon { pub parsed: XlXactParsedRecord, pub origin_id: u16, @@ -209,73 +209,73 @@ pub struct XactCommon { pub lsn: Lsn, } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub struct XactPrepare { pub xl_xid: TransactionId, pub data: Bytes, } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub enum MultiXactRecord { ZeroPage(MultiXactZeroPage), 
Create(XlMultiXactCreate), Truncate(XlMultiXactTruncate), } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub struct MultiXactZeroPage { pub slru_kind: SlruKind, pub segno: u32, pub rpageno: u32, } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub enum RelmapRecord { Update(RelmapUpdate), } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub struct RelmapUpdate { pub update: XlRelmapUpdate, pub buf: Bytes, } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub enum XlogRecord { Raw(RawXlogRecord), } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub struct RawXlogRecord { pub info: u8, pub lsn: Lsn, pub buf: Bytes, } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub enum LogicalMessageRecord { Put(PutLogicalMessage), #[cfg(feature = "testing")] Failpoint, } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub struct PutLogicalMessage { pub path: String, pub buf: Bytes, } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub enum StandbyRecord { RunningXacts(StandbyRunningXacts), } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub struct StandbyRunningXacts { pub oldest_running_xid: TransactionId, } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub enum ReploriginRecord { Set(XlReploriginSet), Drop(XlReploriginDrop), diff --git a/libs/wal_decoder/src/serialized_batch.rs b/libs/wal_decoder/src/serialized_batch.rs index af2b179e05..c70ff05b8e 100644 --- a/libs/wal_decoder/src/serialized_batch.rs +++ b/libs/wal_decoder/src/serialized_batch.rs @@ -5,7 +5,7 @@ //! Such batches are created from decoded PG wal records and ingested //! by the pageserver by writing directly to the ephemeral file. -use std::collections::BTreeSet; +use std::collections::{BTreeSet, HashMap}; use bytes::{Bytes, BytesMut}; use pageserver_api::key::rel_block_to_key; @@ -22,6 +22,8 @@ use utils::lsn::Lsn; use pageserver_api::key::Key; +use crate::models::InterpretedWalRecord; + static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]); /// Accompanying metadata for the batch @@ -128,7 +130,8 @@ impl Default for SerializedValueBatch { } impl SerializedValueBatch { - /// Build a batch of serialized values from a decoded PG WAL record + /// Populates the given `shard_records` with value batches from this WAL record, if any, + /// discarding those belonging to other shards. /// /// The batch will only contain values for keys targeting the specifiec /// shard. Shard 0 is a special case, where any keys that don't belong to @@ -136,21 +139,20 @@ impl SerializedValueBatch { /// but absent from the raw buffer [`SerializedValueBatch::raw`]). pub(crate) fn from_decoded_filtered( decoded: DecodedWALRecord, - shard: &ShardIdentity, + shard_records: &mut HashMap, next_record_lsn: Lsn, pg_version: u32, - ) -> anyhow::Result { - // First determine how big the buffer needs to be and allocate it up-front. + ) -> anyhow::Result<()> { + // First determine how big the buffers need to be and allocate it up-front. // This duplicates some of the work below, but it's empirically much faster. 
- let estimated_buffer_size = Self::estimate_buffer_size(&decoded, shard, pg_version); - let mut buf = Vec::::with_capacity(estimated_buffer_size); + for (shard, record) in shard_records.iter_mut() { + assert!(record.batch.is_empty()); + + let estimate = Self::estimate_buffer_size(&decoded, shard, pg_version); + record.batch.raw = Vec::with_capacity(estimate); + } - let mut metadata: Vec = Vec::with_capacity(decoded.blocks.len()); - let mut max_lsn: Lsn = Lsn(0); - let mut len: usize = 0; for blk in decoded.blocks.iter() { - let relative_off = buf.len() as u64; - let rel = RelTag { spcnode: blk.rnode_spcnode, dbnode: blk.rnode_dbnode, @@ -168,99 +170,98 @@ impl SerializedValueBatch { ); } - let key_is_local = shard.is_key_local(&key); + for (shard, record) in shard_records.iter_mut() { + let key_is_local = shard.is_key_local(&key); - tracing::debug!( - lsn=%next_record_lsn, - key=%key, - "ingest: shard decision {}", - if !key_is_local { "drop" } else { "keep" }, - ); + tracing::debug!( + lsn=%next_record_lsn, + key=%key, + "ingest: shard decision {}", + if !key_is_local { "drop" } else { "keep" }, + ); - if !key_is_local { - if shard.is_shard_zero() { - // Shard 0 tracks relation sizes. Although we will not store this block, we will observe - // its blkno in case it implicitly extends a relation. - metadata.push(ValueMeta::Observed(ObservedValueMeta { + if !key_is_local { + if shard.is_shard_zero() { + // Shard 0 tracks relation sizes. Although we will not store this block, we will observe + // its blkno in case it implicitly extends a relation. + record + .batch + .metadata + .push(ValueMeta::Observed(ObservedValueMeta { + key: key.to_compact(), + lsn: next_record_lsn, + })) + } + + continue; + } + + // Instead of storing full-page-image WAL record, + // it is better to store extracted image: we can skip wal-redo + // in this case. Also some FPI records may contain multiple (up to 32) pages, + // so them have to be copied multiple times. + // + let val = if Self::block_is_image(&decoded, blk, pg_version) { + // Extract page image from FPI record + let img_len = blk.bimg_len as usize; + let img_offs = blk.bimg_offset as usize; + let mut image = BytesMut::with_capacity(BLCKSZ as usize); + // TODO(vlad): skip the copy + image.extend_from_slice(&decoded.record[img_offs..img_offs + img_len]); + + if blk.hole_length != 0 { + let tail = image.split_off(blk.hole_offset as usize); + image.resize(image.len() + blk.hole_length as usize, 0u8); + image.unsplit(tail); + } + // + // Match the logic of XLogReadBufferForRedoExtended: + // The page may be uninitialized. If so, we can't set the LSN because + // that would corrupt the page. 
+ // + if !page_is_new(&image) { + page_set_lsn(&mut image, next_record_lsn) + } + assert_eq!(image.len(), BLCKSZ as usize); + + Value::Image(image.freeze()) + } else { + Value::WalRecord(NeonWalRecord::Postgres { + will_init: blk.will_init || blk.apply_image, + rec: decoded.record.clone(), + }) + }; + + let relative_off = record.batch.raw.len() as u64; + + val.ser_into(&mut record.batch.raw) + .expect("Writing into in-memory buffer is infallible"); + + let val_ser_size = record.batch.raw.len() - relative_off as usize; + + record + .batch + .metadata + .push(ValueMeta::Serialized(SerializedValueMeta { key: key.to_compact(), lsn: next_record_lsn, - })) - } - - continue; + batch_offset: relative_off, + len: val_ser_size, + will_init: val.will_init(), + })); + record.batch.max_lsn = std::cmp::max(record.batch.max_lsn, next_record_lsn); + record.batch.len += 1; } - - // Instead of storing full-page-image WAL record, - // it is better to store extracted image: we can skip wal-redo - // in this case. Also some FPI records may contain multiple (up to 32) pages, - // so them have to be copied multiple times. - // - let val = if Self::block_is_image(&decoded, blk, pg_version) { - // Extract page image from FPI record - let img_len = blk.bimg_len as usize; - let img_offs = blk.bimg_offset as usize; - let mut image = BytesMut::with_capacity(BLCKSZ as usize); - // TODO(vlad): skip the copy - image.extend_from_slice(&decoded.record[img_offs..img_offs + img_len]); - - if blk.hole_length != 0 { - let tail = image.split_off(blk.hole_offset as usize); - image.resize(image.len() + blk.hole_length as usize, 0u8); - image.unsplit(tail); - } - // - // Match the logic of XLogReadBufferForRedoExtended: - // The page may be uninitialized. If so, we can't set the LSN because - // that would corrupt the page. - // - if !page_is_new(&image) { - page_set_lsn(&mut image, next_record_lsn) - } - assert_eq!(image.len(), BLCKSZ as usize); - - Value::Image(image.freeze()) - } else { - Value::WalRecord(NeonWalRecord::Postgres { - will_init: blk.will_init || blk.apply_image, - rec: decoded.record.clone(), - }) - }; - - val.ser_into(&mut buf) - .expect("Writing into in-memory buffer is infallible"); - - let val_ser_size = buf.len() - relative_off as usize; - - metadata.push(ValueMeta::Serialized(SerializedValueMeta { - key: key.to_compact(), - lsn: next_record_lsn, - batch_offset: relative_off, - len: val_ser_size, - will_init: val.will_init(), - })); - max_lsn = std::cmp::max(max_lsn, next_record_lsn); - len += 1; } if cfg!(any(debug_assertions, test)) { - let batch = Self { - raw: buf, - metadata, - max_lsn, - len, - }; - - batch.validate_lsn_order(); - - return Ok(batch); + // Validate that the batches are correct + for record in shard_records.values() { + record.batch.validate_lsn_order(); + } } - Ok(Self { - raw: buf, - metadata, - max_lsn, - len, - }) + Ok(()) } /// Look into the decoded PG WAL record and determine diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index c061714010..a73fa5cec8 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -278,6 +278,8 @@ async fn import_wal( let mut walingest = WalIngest::new(tline, startpoint, ctx).await?; + let shard = vec![*tline.get_shard_identity()]; + while last_lsn <= endpoint { // FIXME: assume postgresql tli 1 for now let filename = XLogFileName(1, segno, WAL_SEGMENT_SIZE); @@ -314,10 +316,12 @@ async fn import_wal( if let Some((lsn, recdata)) = waldecoder.poll_decode()? 
{ let interpreted = InterpretedWalRecord::from_bytes_filtered( recdata, - tline.get_shard_identity(), + &shard, lsn, tline.pg_version, - )?; + )? + .remove(tline.get_shard_identity()) + .unwrap(); walingest .ingest_record(interpreted, &mut modification, ctx) @@ -411,6 +415,7 @@ pub async fn import_wal_from_tar( let mut offset = start_lsn.segment_offset(WAL_SEGMENT_SIZE); let mut last_lsn = start_lsn; let mut walingest = WalIngest::new(tline, start_lsn, ctx).await?; + let shard = vec![*tline.get_shard_identity()]; // Ingest wal until end_lsn info!("importing wal until {}", end_lsn); @@ -459,10 +464,12 @@ pub async fn import_wal_from_tar( if let Some((lsn, recdata)) = waldecoder.poll_decode()? { let interpreted = InterpretedWalRecord::from_bytes_filtered( recdata, - tline.get_shard_identity(), + &shard, lsn, tline.pg_version, - )?; + )? + .remove(tline.get_shard_identity()) + .unwrap(); walingest .ingest_record(interpreted, &mut modification, ctx) diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 3a8796add8..129b987e57 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -264,6 +264,8 @@ pub(super) async fn handle_walreceiver_connection( let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx).await?; + let shard = vec![*timeline.get_shard_identity()]; + let interpreted_proto_config = match protocol { PostgresClientProtocol::Vanilla => None, PostgresClientProtocol::Interpreted { @@ -476,10 +478,12 @@ pub(super) async fn handle_walreceiver_connection( // Deserialize and interpret WAL record let interpreted = InterpretedWalRecord::from_bytes_filtered( recdata, - modification.tline.get_shard_identity(), + &shard, next_record_lsn, modification.tline.pg_version, - )?; + )? + .remove(timeline.get_shard_identity()) + .unwrap(); if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes) && uncommitted_records > 0 diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 7253af8507..ad7bcc0714 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -2163,10 +2163,12 @@ mod tests { while let Some((lsn, recdata)) = decoder.poll_decode().unwrap() { let interpreted = InterpretedWalRecord::from_bytes_filtered( recdata, - modification.tline.get_shard_identity(), + &[*modification.tline.get_shard_identity()], lsn, modification.tline.pg_version, ) + .unwrap() + .remove(modification.tline.get_shard_identity()) .unwrap(); walingest diff --git a/safekeeper/src/send_interpreted_wal.rs b/safekeeper/src/send_interpreted_wal.rs index 7d215176dd..a718c16a6a 100644 --- a/safekeeper/src/send_interpreted_wal.rs +++ b/safekeeper/src/send_interpreted_wal.rs @@ -57,6 +57,7 @@ impl InterpretedWalSender<'_, IO> { keepalive_ticker.reset(); let (tx, mut rx) = tokio::sync::mpsc::channel::(2); + let shard = vec![self.shard]; loop { tokio::select! { @@ -80,14 +81,17 @@ impl InterpretedWalSender<'_, IO> { assert!(next_record_lsn.is_aligned()); max_next_record_lsn = Some(next_record_lsn); + // Deserialize and interpret WAL record let interpreted = InterpretedWalRecord::from_bytes_filtered( recdata, - &self.shard, + &shard, next_record_lsn, self.pg_version, ) - .with_context(|| "Failed to interpret WAL")?; + .with_context(|| "Failed to interpret WAL")? 
+ .remove(&self.shard) + .unwrap(); if !interpreted.is_empty() { records.push(interpreted); From b9464865b619d35257500bfade7651e3adfc07e4 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 15 Jan 2025 13:05:05 +0000 Subject: [PATCH 192/583] benchmarks: report successful runs to slack as well (#10393) ## Problem Successful `benchmarks` runs doesn't have enough visibility Ref https://neondb.slack.com/archives/C069Z2199DL/p1736868055094539 ## Summary of changes - Report both successful and failed `benchmarks` to Slack - Update `slackapi/slack-github-action` action --- .github/actionlint.yml | 1 + .github/workflows/build_and_test.yml | 25 +++++++++++-------------- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/.github/actionlint.yml b/.github/actionlint.yml index 7a97e2ae55..aec5b4ee75 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -25,3 +25,4 @@ config-variables: - PGREGRESS_PG17_PROJECT_ID - SLACK_ON_CALL_QA_STAGING_STREAM - DEV_AWS_OIDC_ROLE_MANAGE_BENCHMARK_EC2_VMS_ARN + - SLACK_ON_CALL_STORAGE_STAGING_STREAM diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index cd95a5b16d..489a93f46d 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -346,25 +346,22 @@ jobs: # XXX: no coverage data handling here, since benchmarks are run on release builds, # while coverage is currently collected for the debug ones - report-benchmarks-failures: + report-benchmarks-results-to-slack: needs: [ benchmarks, create-test-report ] - if: github.ref_name == 'main' && failure() && needs.benchmarks.result == 'failure' - permissions: - id-token: write # aws-actions/configure-aws-credentials - statuses: write - contents: write - pull-requests: write + if: github.ref_name == 'main' && !cancelled() && contains(fromJSON('["success", "failure"]'), needs.benchmarks.result) runs-on: ubuntu-22.04 steps: - - uses: slackapi/slack-github-action@v1 + - uses: slackapi/slack-github-action@v2 with: - channel-id: C060CNA47S9 # on-call-staging-storage-stream - slack-message: | - Benchmarks failed on main <${{ github.event.head_commit.url }}|${{ github.sha }}> - <${{ needs.create-test-report.outputs.report-url }}|Allure report> - env: - SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} + method: chat.postMessage + token: ${{ secrets.SLACK_BOT_TOKEN }} + payload: | + channel: "${{ vars.SLACK_ON_CALL_STORAGE_STAGING_STREAM }}" + text: | + Benchmarks on main: *${{ needs.benchmarks.result }}* + - <${{ needs.create-test-report.outputs.report-url }}|Allure report> + - <${{ github.event.head_commit.url }}|${{ github.sha }}> create-test-report: needs: [ check-permissions, build-and-test-locally, coverage-report, build-build-tools-image, benchmarks ] From 05a71c7d6a14d471dcbce9d9d27d5eed124c9947 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Wed, 15 Jan 2025 17:16:04 +0300 Subject: [PATCH 193/583] safekeeper: add membership configuration switch endpoint (#10241) ## Problem https://github.com/neondatabase/neon/issues/9965 ## Summary of changes Add to safekeeper http endpoint to switch membership configuration. Also add it to python client for tests, and add simple test itself. 
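For illustration only (not part of this patch), a minimal client-side sketch of driving the new endpoint. It assumes `reqwest`, `serde_json` and `anyhow` on the caller's side, with placeholder base URL, tenant/timeline IDs and safekeeper entries; since `TimelineMembershipSwitchRequest` is `#[serde(transparent)]`, the POST body is the `Configuration` JSON itself:

```rust
use serde_json::json;

async fn switch_membership(
    base_url: &str,    // e.g. the safekeeper's HTTP address (placeholder)
    tenant_id: &str,
    timeline_id: &str,
) -> anyhow::Result<()> {
    // The request body is the membership Configuration itself (transparent wrapper).
    let mconf = json!({
        "generation": 4,
        "members": [{ "id": 1, "host": "localhost", "pg_port": 5454 }],
        "new_members": null,
    });

    let resp = reqwest::Client::new()
        .post(format!(
            "{base_url}/v1/tenant/{tenant_id}/timeline/{timeline_id}/membership"
        ))
        .json(&mconf)
        .send()
        .await?
        .error_for_status()?;

    // The response reports both the previous and the current configuration.
    let body: serde_json::Value = resp.json().await?;
    println!(
        "switched from generation {} to {}",
        body["previous_conf"]["generation"], body["current_conf"]["generation"]
    );
    Ok(())
}
```

A switch to a generation lower than or equal to the current one is ignored and the unchanged configuration is returned, which is what the new regression test asserts.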
--- libs/safekeeper_api/src/membership.rs | 2 + libs/safekeeper_api/src/models.rs | 15 ++++++ safekeeper/src/http/routes.rs | 28 +++++++++++ safekeeper/src/state.rs | 30 ++++++++++- safekeeper/src/timeline.rs | 20 +++++++- test_runner/fixtures/safekeeper/http.py | 43 +++++++++++++++- test_runner/regress/test_wal_acceptor.py | 64 +++++++++++++++++++++++- 7 files changed, 198 insertions(+), 4 deletions(-) diff --git a/libs/safekeeper_api/src/membership.rs b/libs/safekeeper_api/src/membership.rs index fe30204545..a39fda526f 100644 --- a/libs/safekeeper_api/src/membership.rs +++ b/libs/safekeeper_api/src/membership.rs @@ -23,6 +23,8 @@ pub const INITIAL_GENERATION: Generation = 1; pub struct SafekeeperId { pub id: NodeId, pub host: String, + /// We include here only port for computes -- that is, pg protocol tenant + /// only port, or wide pg protocol port if the former is not configured. pub pg_port: u16, } diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs index ad38986357..a6f90154f4 100644 --- a/libs/safekeeper_api/src/models.rs +++ b/libs/safekeeper_api/src/models.rs @@ -175,6 +175,7 @@ pub enum WalReceiverStatus { pub struct TimelineStatus { pub tenant_id: TenantId, pub timeline_id: TimelineId, + pub mconf: Configuration, pub acceptor_state: AcceptorStateStatus, pub pg_info: ServerInfo, pub flush_lsn: Lsn, @@ -189,6 +190,20 @@ pub struct TimelineStatus { pub walreceivers: Vec, } +/// Request to switch membership configuration. +#[derive(Serialize, Deserialize)] +#[serde(transparent)] +pub struct TimelineMembershipSwitchRequest { + pub mconf: Configuration, +} + +/// In response both previous and current configuration are sent. +#[derive(Serialize, Deserialize)] +pub struct TimelineMembershipSwitchResponse { + pub previous_conf: Configuration, + pub current_conf: Configuration, +} + fn lsn_invalid() -> Lsn { Lsn::INVALID } diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 3835d39698..5ecde4b125 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -1,4 +1,5 @@ use hyper::{Body, Request, Response, StatusCode}; +use safekeeper_api::models; use safekeeper_api::models::AcceptorStateStatus; use safekeeper_api::models::SafekeeperStatus; use safekeeper_api::models::TermSwitchApiEntry; @@ -183,6 +184,7 @@ async fn timeline_status_handler(request: Request) -> Result) -> Result, +) -> Result, ApiError> { + let ttid = TenantTimelineId::new( + parse_request_param(&request, "tenant_id")?, + parse_request_param(&request, "timeline_id")?, + ); + check_permission(&request, Some(ttid.tenant_id))?; + + let global_timelines = get_global_timelines(&request); + let tli = global_timelines.get(ttid).map_err(ApiError::from)?; + + let data: models::TimelineMembershipSwitchRequest = json_request(&mut request).await?; + let response = tli + .membership_switch(data.mconf) + .await + .map_err(ApiError::InternalServerError)?; + + json_response(StatusCode::OK, response) +} + async fn timeline_copy_handler(mut request: Request) -> Result, ApiError> { check_permission(&request, None)?; @@ -619,6 +643,10 @@ pub fn make_router( "/v1/tenant/:tenant_id/timeline/:timeline_id/snapshot/:destination_id", |r| request_span(r, timeline_snapshot_handler), ) + .post( + "/v1/tenant/:tenant_id/timeline/:timeline_id/membership", + |r| request_span(r, timeline_membership_handler), + ) .post( "/v1/tenant/:tenant_id/timeline/:source_timeline_id/copy", |r| request_span(r, timeline_copy_handler), diff --git a/safekeeper/src/state.rs 
b/safekeeper/src/state.rs index 1c3bb1b4dc..4d566b12a0 100644 --- a/safekeeper/src/state.rs +++ b/safekeeper/src/state.rs @@ -6,9 +6,12 @@ use std::{cmp::max, ops::Deref, time::SystemTime}; use anyhow::{bail, Result}; use postgres_ffi::WAL_SEGMENT_SIZE; use safekeeper_api::{ - membership::Configuration, models::TimelineTermBumpResponse, ServerInfo, Term, INITIAL_TERM, + membership::Configuration, + models::{TimelineMembershipSwitchResponse, TimelineTermBumpResponse}, + ServerInfo, Term, INITIAL_TERM, }; use serde::{Deserialize, Serialize}; +use tracing::info; use utils::{ id::{TenantId, TenantTimelineId, TimelineId}, lsn::Lsn, @@ -258,6 +261,31 @@ where current_term: after, }) } + + /// Switch into membership configuration `to` if it is higher than the + /// current one. + pub async fn membership_switch( + &mut self, + to: Configuration, + ) -> Result { + let before = self.mconf.clone(); + // Is switch allowed? + if to.generation <= self.mconf.generation { + info!( + "ignoring request to switch membership conf to lower {}, current conf {}", + to, self.mconf + ); + } else { + let mut state = self.start_change(); + state.mconf = to.clone(); + self.finish_change(&state).await?; + info!("switched membership conf to {} from {}", to, before); + } + Ok(TimelineMembershipSwitchResponse { + previous_conf: before, + current_conf: self.mconf.clone(), + }) + } } impl Deref for TimelineState diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 36860a0da2..2882391074 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -4,7 +4,10 @@ use anyhow::{anyhow, bail, Result}; use camino::{Utf8Path, Utf8PathBuf}; use remote_storage::RemotePath; -use safekeeper_api::models::{PeerInfo, TimelineTermBumpResponse}; +use safekeeper_api::membership::Configuration; +use safekeeper_api::models::{ + PeerInfo, TimelineMembershipSwitchResponse, TimelineTermBumpResponse, +}; use safekeeper_api::Term; use tokio::fs::{self}; use tokio_util::sync::CancellationToken; @@ -188,6 +191,13 @@ impl StateSK { self.state_mut().term_bump(to).await } + pub async fn membership_switch( + &mut self, + to: Configuration, + ) -> Result { + self.state_mut().membership_switch(to).await + } + /// Close open WAL files to release FDs. 
fn close_wal_store(&mut self) { if let StateSK::Loaded(sk) = self { @@ -768,6 +778,14 @@ impl Timeline { state.sk.term_bump(to).await } + pub async fn membership_switch( + self: &Arc, + to: Configuration, + ) -> Result { + let mut state = self.write_shared_state().await; + state.sk.membership_switch(to).await + } + /// Guts of [`Self::wal_residence_guard`] and [`Self::try_wal_residence_guard`] async fn do_wal_residence_guard( self: &Arc, diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py index 4826cae3ee..493ce7334e 100644 --- a/test_runner/fixtures/safekeeper/http.py +++ b/test_runner/fixtures/safekeeper/http.py @@ -25,6 +25,7 @@ class Walreceiver: @dataclass class SafekeeperTimelineStatus: + mconf: Configuration | None term: int last_log_term: int pg_version: int # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2 @@ -73,7 +74,7 @@ class TermBumpResponse: class SafekeeperId: id: int host: str - pg_port: str + pg_port: int @dataclass @@ -82,6 +83,16 @@ class Configuration: members: list[SafekeeperId] new_members: list[SafekeeperId] | None + @classmethod + def from_json(cls, d: dict[str, Any]) -> Configuration: + generation = d["generation"] + members = d["members"] + new_members = d.get("new_members") + return Configuration(generation, members, new_members) + + def to_json(self) -> str: + return json.dumps(self, cls=EnhancedJSONEncoder) + @dataclass class TimelineCreateRequest: @@ -97,6 +108,18 @@ class TimelineCreateRequest: return json.dumps(self, cls=EnhancedJSONEncoder) +@dataclass +class TimelineMembershipSwitchResponse: + previous_conf: Configuration + current_conf: Configuration + + @classmethod + def from_json(cls, d: dict[str, Any]) -> TimelineMembershipSwitchResponse: + previous_conf = Configuration.from_json(d["previous_conf"]) + current_conf = Configuration.from_json(d["current_conf"]) + return TimelineMembershipSwitchResponse(previous_conf, current_conf) + + class SafekeeperHttpClient(requests.Session, MetricsGetter): HTTPError = requests.HTTPError @@ -170,7 +193,10 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): res.raise_for_status() resj = res.json() walreceivers = [Walreceiver(wr["conn_id"], wr["status"]) for wr in resj["walreceivers"]] + # It is always normally not None, it is allowed only to make forward compat tests happy. + mconf = Configuration.from_json(resj["mconf"]) if "mconf" in resj else None return SafekeeperTimelineStatus( + mconf=mconf, term=resj["acceptor_state"]["term"], last_log_term=resj["acceptor_state"]["epoch"], pg_version=resj["pg_info"]["pg_version"], @@ -196,6 +222,11 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): def get_commit_lsn(self, tenant_id: TenantId, timeline_id: TimelineId) -> Lsn: return self.timeline_status(tenant_id, timeline_id).commit_lsn + # Get timeline membership configuration. + def get_membership(self, tenant_id: TenantId, timeline_id: TimelineId) -> Configuration: + # make mypy happy + return self.timeline_status(tenant_id, timeline_id).mconf # type: ignore + # only_local doesn't remove segments in the remote storage. 
def timeline_delete( self, tenant_id: TenantId, timeline_id: TimelineId, only_local: bool = False @@ -242,6 +273,16 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): assert isinstance(res_json, dict) return res_json + def membership_switch( + self, tenant_id: TenantId, timeline_id: TimelineId, to: Configuration + ) -> TimelineMembershipSwitchResponse: + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/membership", + data=to.to_json(), + ) + res.raise_for_status() + return TimelineMembershipSwitchResponse.from_json(res.json()) + def copy_timeline(self, tenant_id: TenantId, timeline_id: TimelineId, body: dict[str, Any]): res = self.post( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/copy", diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index d39c6a6b5b..2b6a267bdf 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -48,7 +48,12 @@ from fixtures.remote_storage import ( default_remote_storage, s3_storage, ) -from fixtures.safekeeper.http import Configuration, SafekeeperHttpClient, TimelineCreateRequest +from fixtures.safekeeper.http import ( + Configuration, + SafekeeperHttpClient, + SafekeeperId, + TimelineCreateRequest, +) from fixtures.safekeeper.utils import wait_walreceivers_absent from fixtures.utils import ( PropagatingThread, @@ -2243,6 +2248,63 @@ def test_pull_timeline_while_evicted(neon_env_builder: NeonEnvBuilder): wait_until(unevicted_on_dest, interval=0.1, timeout=1.0) +# Basic test for http API membership related calls: create timeline and switch +# configuration. Normally these are called by storage controller, but this +# allows to test them separately. +@run_only_on_default_postgres("tests only safekeeper API") +def test_membership_api(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 1 + env = neon_env_builder.init_start() + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + sk = env.safekeepers[0] + http_cli = sk.http_client() + + sk_id_1 = SafekeeperId(env.safekeepers[0].id, "localhost", sk.port.pg_tenant_only) + sk_id_2 = SafekeeperId(11, "localhost", 5434) # just a mock + + # Request to switch before timeline creation should fail. + init_conf = Configuration(generation=1, members=[sk_id_1], new_members=None) + with pytest.raises(requests.exceptions.HTTPError): + http_cli.membership_switch(tenant_id, timeline_id, init_conf) + + # Create timeline. + create_r = TimelineCreateRequest( + tenant_id, timeline_id, init_conf, 150002, Lsn("0/1000000"), commit_lsn=None + ) + log.info(f"sending {create_r.to_json()}") + http_cli.timeline_create(create_r) + + # Switch into some conf. + joint_conf = Configuration(generation=4, members=[sk_id_1], new_members=[sk_id_2]) + resp = http_cli.membership_switch(tenant_id, timeline_id, joint_conf) + log.info(f"joint switch resp: {resp}") + assert resp.previous_conf.generation == 1 + assert resp.current_conf.generation == 4 + + # Restart sk, conf should be preserved. + sk.stop().start() + after_restart = http_cli.get_membership(tenant_id, timeline_id) + log.info(f"conf after restart: {after_restart}") + assert after_restart.generation == 4 + + # Switch into disjoint conf. 
+ non_joint = Configuration(generation=5, members=[sk_id_2], new_members=None) + resp = http_cli.membership_switch(tenant_id, timeline_id, non_joint) + log.info(f"non joint switch resp: {resp}") + assert resp.previous_conf.generation == 4 + assert resp.current_conf.generation == 5 + + # Switch request to lower conf should be ignored. + lower_conf = Configuration(generation=3, members=[], new_members=None) + resp = http_cli.membership_switch(tenant_id, timeline_id, lower_conf) + log.info(f"lower switch resp: {resp}") + assert resp.previous_conf.generation == 5 + assert resp.current_conf.generation == 5 + + # In this test we check for excessive START_REPLICATION and START_WAL_PUSH queries # when compute is active, but there are no writes to the timeline. In that case # pageserver should maintain a single connection to safekeeper and don't attempt From 3e529f124f45f20f0e10948b4b4a8ba881b59b06 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Wed, 15 Jan 2025 09:29:52 -0600 Subject: [PATCH 194/583] Remove leading slashes when downloading remote files (#10396) Signed-off-by: Tristan Partin --- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/postgres-v17 | 2 +- vendor/revisions.json | 8 ++++---- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 210a0ba3af..46082f2088 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 210a0ba3afd8134ea910b203f274b165bd4f05d7 +Subproject commit 46082f20884f087a2d974b33ac65d63af26142bd diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index d3141e17a7..dd0b28d6fb 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit d3141e17a7155e3d07c8deba4a10c748a29ba1e6 +Subproject commit dd0b28d6fbad39e227f3b77296fcca879af8b3a9 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index f63b141cfb..d674efd776 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit f63b141cfb0c813725a6b2574049565bff643018 +Subproject commit d674efd776f59d78e8fa1535bd2f95c3e6984fca diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index 0f8da73ed0..a8dd6e779d 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit 0f8da73ed08d4fc4ee58cccea008c75bfb20baa8 +Subproject commit a8dd6e779dde907778006adb436b557ad652fb97 diff --git a/vendor/revisions.json b/vendor/revisions.json index b4d57ab709..c899dbaa5a 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,18 +1,18 @@ { "v17": [ "17.2", - "0f8da73ed08d4fc4ee58cccea008c75bfb20baa8" + "a8dd6e779dde907778006adb436b557ad652fb97" ], "v16": [ "16.6", - "f63b141cfb0c813725a6b2574049565bff643018" + "d674efd776f59d78e8fa1535bd2f95c3e6984fca" ], "v15": [ "15.10", - "d3141e17a7155e3d07c8deba4a10c748a29ba1e6" + "dd0b28d6fbad39e227f3b77296fcca879af8b3a9" ], "v14": [ "14.15", - "210a0ba3afd8134ea910b203f274b165bd4f05d7" + "46082f20884f087a2d974b33ac65d63af26142bd" ] } From dbebede7bf5ff0cefd303f266781457b7472d070 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 15 Jan 2025 15:33:54 +0000 Subject: [PATCH 195/583] safekeeper: fan out from single wal reader to multiple shards (#10190) ## Problem Safekeepers currently decode and interpret WAL for each shard separately. This is wasteful in terms of CPU memory usage - we've seen this in profiles. ## Summary of changes Fan-out interpreted WAL to multiple shards. 
The basic is that wal decoding and interpretation happens in a separate tokio task and senders attach to it. Senders only receive batches concerning their shard and only past the Lsn they've last seen. Fan-out is gated behind the `wal_reader_fanout` safekeeper flag (disabled by default for now). When fan-out is enabled, it might be desirable to control the absolute delta between the current position and a new shard's desired position (i.e. how far behind or ahead a shard may be). `max_delta_for_fanout` is a new optional safekeeper flag which dictates whether to create a new WAL reader or attach to the existing one. By default, this behaviour is disabled. Let's consider enabling it if we spot the need for it in the field. ## Testing Tests passed [here](https://github.com/neondatabase/neon/pull/10301) with wal reader fanout enabled as of https://github.com/neondatabase/neon/pull/10190/commits/34f6a717182c431847bbd5b7828fd0f89027b2be. Related: https://github.com/neondatabase/neon/issues/9337 Epic: https://github.com/neondatabase/neon/issues/9329 --- Cargo.lock | 3 + libs/safekeeper_api/Cargo.toml | 1 + libs/safekeeper_api/src/models.rs | 20 +- libs/wal_decoder/src/models.rs | 2 +- libs/wal_decoder/src/serialized_batch.rs | 8 +- safekeeper/Cargo.toml | 3 + safekeeper/src/bin/safekeeper.rs | 9 + safekeeper/src/http/routes.rs | 2 +- safekeeper/src/lib.rs | 4 + safekeeper/src/metrics.rs | 32 +- safekeeper/src/send_interpreted_wal.rs | 765 ++++++++++++++++-- safekeeper/src/send_wal.rs | 354 ++++++-- safekeeper/src/test_utils.rs | 65 +- safekeeper/src/timeline.rs | 14 +- safekeeper/src/wal_reader_stream.rs | 396 ++++++--- .../tests/walproposer_sim/safekeeper.rs | 2 + 16 files changed, 1410 insertions(+), 270 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0669899617..afe16ff848 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5655,6 +5655,7 @@ dependencies = [ "crc32c", "criterion", "desim", + "env_logger 0.10.2", "fail", "futures", "hex", @@ -5683,6 +5684,7 @@ dependencies = [ "serde", "serde_json", "sha2", + "smallvec", "storage_broker", "strum", "strum_macros", @@ -5709,6 +5711,7 @@ version = "0.1.0" dependencies = [ "anyhow", "const_format", + "pageserver_api", "postgres_ffi", "pq_proto", "serde", diff --git a/libs/safekeeper_api/Cargo.toml b/libs/safekeeper_api/Cargo.toml index 7652c3d413..6b72ace019 100644 --- a/libs/safekeeper_api/Cargo.toml +++ b/libs/safekeeper_api/Cargo.toml @@ -13,3 +13,4 @@ postgres_ffi.workspace = true pq_proto.workspace = true tokio.workspace = true utils.workspace = true +pageserver_api.workspace = true diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs index a6f90154f4..b5fa903820 100644 --- a/libs/safekeeper_api/src/models.rs +++ b/libs/safekeeper_api/src/models.rs @@ -1,5 +1,6 @@ //! Types used in safekeeper http API. Many of them are also reused internally. +use pageserver_api::shard::ShardIdentity; use postgres_ffi::TimestampTz; use serde::{Deserialize, Serialize}; use std::net::SocketAddr; @@ -146,7 +147,13 @@ pub type ConnectionId = u32; /// Serialize is used only for json'ing in API response. Also used internally. 
#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct WalSenderState { +pub enum WalSenderState { + Vanilla(VanillaWalSenderState), + Interpreted(InterpretedWalSenderState), +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct VanillaWalSenderState { pub ttid: TenantTimelineId, pub addr: SocketAddr, pub conn_id: ConnectionId, @@ -155,6 +162,17 @@ pub struct WalSenderState { pub feedback: ReplicationFeedback, } +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct InterpretedWalSenderState { + pub ttid: TenantTimelineId, + pub shard: ShardIdentity, + pub addr: SocketAddr, + pub conn_id: ConnectionId, + // postgres application_name + pub appname: Option, + pub feedback: ReplicationFeedback, +} + #[derive(Debug, Clone, Serialize, Deserialize)] pub struct WalReceiverState { /// None means it is recovery initiated by us (this safekeeper). diff --git a/libs/wal_decoder/src/models.rs b/libs/wal_decoder/src/models.rs index 8bfa48faac..c2f9125b21 100644 --- a/libs/wal_decoder/src/models.rs +++ b/libs/wal_decoder/src/models.rs @@ -64,7 +64,7 @@ pub struct InterpretedWalRecords { } /// An interpreted Postgres WAL record, ready to be handled by the pageserver -#[derive(Serialize, Deserialize)] +#[derive(Serialize, Deserialize, Clone)] pub struct InterpretedWalRecord { /// Optional metadata record - may cause writes to metadata keys /// in the storage engine diff --git a/libs/wal_decoder/src/serialized_batch.rs b/libs/wal_decoder/src/serialized_batch.rs index c70ff05b8e..d76f75f51f 100644 --- a/libs/wal_decoder/src/serialized_batch.rs +++ b/libs/wal_decoder/src/serialized_batch.rs @@ -32,7 +32,7 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]); /// relation sizes. In the case of "observed" values, we only need to know /// the key and LSN, so two types of metadata are supported to save on network /// bandwidth. -#[derive(Serialize, Deserialize)] +#[derive(Serialize, Deserialize, Clone)] pub enum ValueMeta { Serialized(SerializedValueMeta), Observed(ObservedValueMeta), @@ -79,7 +79,7 @@ impl PartialEq for OrderedValueMeta { impl Eq for OrderedValueMeta {} /// Metadata for a [`Value`] serialized into the batch. -#[derive(Serialize, Deserialize)] +#[derive(Serialize, Deserialize, Clone)] pub struct SerializedValueMeta { pub key: CompactKey, pub lsn: Lsn, @@ -91,14 +91,14 @@ pub struct SerializedValueMeta { } /// Metadata for a [`Value`] observed by the batch -#[derive(Serialize, Deserialize)] +#[derive(Serialize, Deserialize, Clone)] pub struct ObservedValueMeta { pub key: CompactKey, pub lsn: Lsn, } /// Batch of serialized [`Value`]s. 
-#[derive(Serialize, Deserialize)] +#[derive(Serialize, Deserialize, Clone)] pub struct SerializedValueBatch { /// [`Value`]s serialized in EphemeralFile's native format, /// ready for disk write by the pageserver diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 3ebb7097f2..0eb511f1cc 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -26,6 +26,7 @@ hex.workspace = true humantime.workspace = true http.workspace = true hyper0.workspace = true +itertools.workspace = true futures.workspace = true once_cell.workspace = true parking_lot.workspace = true @@ -39,6 +40,7 @@ scopeguard.workspace = true reqwest = { workspace = true, features = ["json"] } serde.workspace = true serde_json.workspace = true +smallvec.workspace = true strum.workspace = true strum_macros.workspace = true thiserror.workspace = true @@ -63,6 +65,7 @@ storage_broker.workspace = true tokio-stream.workspace = true utils.workspace = true wal_decoder.workspace = true +env_logger.workspace = true workspace_hack.workspace = true diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index bc7af02185..6cc53e0d23 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -207,6 +207,13 @@ struct Args { /// Also defines interval for eviction retries. #[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_EVICTION_MIN_RESIDENT)] eviction_min_resident: Duration, + /// Enable fanning out WAL to different shards from the same reader + #[arg(long)] + wal_reader_fanout: bool, + /// Only fan out the WAL reader if the absoulte delta between the new requested position + /// and the current position of the reader is smaller than this value. + #[arg(long)] + max_delta_for_fanout: Option, } // Like PathBufValueParser, but allows empty string. 
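As a rough sketch (not part of the patch) of how the two new flags interact when a shard subscribes: with fan-out enabled, a sender attaches to an existing interpreted WAL reader only if that reader is still running and its current position is within `max_delta_for_fanout` bytes of the requested start position (or unconditionally when no maximum is configured); otherwise a fresh reader is spawned. Names below are simplified relative to the real `WalSenders::create_or_update_interpreted_reader`:

```rust
// Illustrative decision rule only; in the patch positions are `Lsn`s and the
// chosen reader handle is shared behind an `Arc`.
fn attach_to_existing_reader(
    wal_reader_fanout: bool,
    max_delta_for_fanout: Option<u64>,
    reader_position: Option<u64>, // None if the candidate reader already finished
    requested_start: u64,
) -> bool {
    if !wal_reader_fanout {
        return false; // fan-out disabled: every sender gets its own reader
    }
    match (reader_position, max_delta_for_fanout) {
        // Attach only if the reader is close enough to where we want to start.
        (Some(pos), Some(max_delta)) => pos.abs_diff(requested_start) <= max_delta,
        // Reader is no longer running: spawn a new one.
        (None, _) => false,
        // No limit configured: attach to any running reader.
        (Some(_), None) => true,
    }
}
```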
@@ -370,6 +377,8 @@ async fn main() -> anyhow::Result<()> { control_file_save_interval: args.control_file_save_interval, partial_backup_concurrency: args.partial_backup_concurrency, eviction_min_resident: args.eviction_min_resident, + wal_reader_fanout: args.wal_reader_fanout, + max_delta_for_fanout: args.max_delta_for_fanout, }); // initialize sentry if SENTRY_DSN is provided diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 5ecde4b125..4b9fb9eb67 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -195,7 +195,7 @@ async fn timeline_status_handler(request: Request) -> Result, } impl SafeKeeperConf { @@ -150,6 +152,8 @@ impl SafeKeeperConf { control_file_save_interval: Duration::from_secs(1), partial_backup_concurrency: 1, eviction_min_resident: Duration::ZERO, + wal_reader_fanout: false, + max_delta_for_fanout: None, } } } diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index 5883f402c7..3ea9e3d674 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -12,9 +12,9 @@ use metrics::{ pow2_buckets, proto::MetricFamily, register_histogram, register_histogram_vec, register_int_counter, register_int_counter_pair, - register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge, Gauge, GaugeVec, - Histogram, HistogramVec, IntCounter, IntCounterPair, IntCounterPairVec, IntCounterVec, - IntGauge, IntGaugeVec, DISK_FSYNC_SECONDS_BUCKETS, + register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge, + register_int_gauge_vec, Gauge, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair, + IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, DISK_FSYNC_SECONDS_BUCKETS, }; use once_cell::sync::Lazy; use postgres_ffi::XLogSegNo; @@ -211,6 +211,14 @@ pub static WAL_RECEIVERS: Lazy = Lazy::new(|| { ) .expect("Failed to register safekeeper_wal_receivers") }); +pub static WAL_READERS: Lazy = Lazy::new(|| { + register_int_gauge_vec!( + "safekeeper_wal_readers", + "Number of active WAL readers (may serve pageservers or other safekeepers)", + &["kind", "target"] + ) + .expect("Failed to register safekeeper_wal_receivers") +}); pub static WAL_RECEIVER_QUEUE_DEPTH: Lazy = Lazy::new(|| { // Use powers of two buckets, but add a bucket at 0 and the max queue size to track empty and // full queues respectively. 
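For reference, the new `safekeeper_wal_readers` gauge is labelled by `kind` (`"task"` for readers spawned as standalone tokio tasks on the fan-out path, `"future"` for readers driven inline by a single sender) and `target` (the connecting application name, defaulting to `"safekeeper"`). A hypothetical test-side read of the gauge, assuming access to the crate's `metrics` module:

```rust
// Hypothetical assertion in a unit test; WAL_READERS is the gauge registered above.
use crate::metrics::WAL_READERS;

fn assert_one_task_reader_for(appname: &str) {
    let active = WAL_READERS.with_label_values(&["task", appname]).get();
    assert_eq!(active, 1, "expected exactly one task-based WAL reader");
}
```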
@@ -443,6 +451,7 @@ pub struct FullTimelineInfo { pub timeline_is_active: bool, pub num_computes: u32, pub last_removed_segno: XLogSegNo, + pub interpreted_wal_reader_tasks: usize, pub epoch_start_lsn: Lsn, pub mem_state: TimelineMemState, @@ -472,6 +481,7 @@ pub struct TimelineCollector { disk_usage: GenericGaugeVec, acceptor_term: GenericGaugeVec, written_wal_bytes: GenericGaugeVec, + interpreted_wal_reader_tasks: GenericGaugeVec, written_wal_seconds: GaugeVec, flushed_wal_seconds: GaugeVec, collect_timeline_metrics: Gauge, @@ -670,6 +680,16 @@ impl TimelineCollector { .unwrap(); descs.extend(active_timelines_count.desc().into_iter().cloned()); + let interpreted_wal_reader_tasks = GenericGaugeVec::new( + Opts::new( + "safekeeper_interpreted_wal_reader_tasks", + "Number of active interpreted wal reader tasks, grouped by timeline", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(interpreted_wal_reader_tasks.desc().into_iter().cloned()); + TimelineCollector { global_timelines, descs, @@ -693,6 +713,7 @@ impl TimelineCollector { collect_timeline_metrics, timelines_count, active_timelines_count, + interpreted_wal_reader_tasks, } } } @@ -721,6 +742,7 @@ impl Collector for TimelineCollector { self.disk_usage.reset(); self.acceptor_term.reset(); self.written_wal_bytes.reset(); + self.interpreted_wal_reader_tasks.reset(); self.written_wal_seconds.reset(); self.flushed_wal_seconds.reset(); @@ -782,6 +804,9 @@ impl Collector for TimelineCollector { self.written_wal_bytes .with_label_values(labels) .set(tli.wal_storage.write_wal_bytes); + self.interpreted_wal_reader_tasks + .with_label_values(labels) + .set(tli.interpreted_wal_reader_tasks as u64); self.written_wal_seconds .with_label_values(labels) .set(tli.wal_storage.write_wal_seconds); @@ -834,6 +859,7 @@ impl Collector for TimelineCollector { mfs.extend(self.disk_usage.collect()); mfs.extend(self.acceptor_term.collect()); mfs.extend(self.written_wal_bytes.collect()); + mfs.extend(self.interpreted_wal_reader_tasks.collect()); mfs.extend(self.written_wal_seconds.collect()); mfs.extend(self.flushed_wal_seconds.collect()); diff --git a/safekeeper/src/send_interpreted_wal.rs b/safekeeper/src/send_interpreted_wal.rs index a718c16a6a..ea09ce364d 100644 --- a/safekeeper/src/send_interpreted_wal.rs +++ b/safekeeper/src/send_interpreted_wal.rs @@ -1,100 +1,330 @@ +use std::collections::HashMap; +use std::fmt::Display; +use std::sync::Arc; use std::time::Duration; -use anyhow::Context; +use anyhow::{anyhow, Context}; +use futures::future::Either; use futures::StreamExt; use pageserver_api::shard::ShardIdentity; use postgres_backend::{CopyStreamHandlerEnd, PostgresBackend}; -use postgres_ffi::MAX_SEND_SIZE; +use postgres_ffi::waldecoder::WalDecodeError; use postgres_ffi::{get_current_timestamp, waldecoder::WalStreamDecoder}; use pq_proto::{BeMessage, InterpretedWalRecordsBody, WalSndKeepAlive}; use tokio::io::{AsyncRead, AsyncWrite}; +use tokio::sync::mpsc::error::SendError; +use tokio::task::JoinHandle; use tokio::time::MissedTickBehavior; +use tracing::{info_span, Instrument}; use utils::lsn::Lsn; use utils::postgres_client::Compression; use utils::postgres_client::InterpretedFormat; use wal_decoder::models::{InterpretedWalRecord, InterpretedWalRecords}; use wal_decoder::wire_format::ToWireFormat; -use crate::send_wal::EndWatchView; -use crate::wal_reader_stream::{WalBytes, WalReaderStreamBuilder}; +use crate::metrics::WAL_READERS; +use crate::send_wal::{EndWatchView, WalSenderGuard}; +use crate::timeline::WalResidentTimeline; +use 
crate::wal_reader_stream::{StreamingWalReader, WalBytes}; -/// Shard-aware interpreted record sender. -/// This is used for sending WAL to the pageserver. Said WAL -/// is pre-interpreted and filtered for the shard. -pub(crate) struct InterpretedWalSender<'a, IO> { - pub(crate) format: InterpretedFormat, - pub(crate) compression: Option, - pub(crate) pgb: &'a mut PostgresBackend, - pub(crate) wal_stream_builder: WalReaderStreamBuilder, - pub(crate) end_watch_view: EndWatchView, - pub(crate) shard: ShardIdentity, - pub(crate) pg_version: u32, - pub(crate) appname: Option, +/// Identifier used to differentiate between senders of the same +/// shard. +/// +/// In the steady state there's only one, but two pageservers may +/// temporarily have the same shard attached and attempt to ingest +/// WAL for it. See also [`ShardSenderId`]. +#[derive(Hash, Eq, PartialEq, Copy, Clone)] +struct SenderId(u8); + +impl SenderId { + fn first() -> Self { + SenderId(0) + } + + fn next(&self) -> Self { + SenderId(self.0.checked_add(1).expect("few senders")) + } } -struct Batch { +#[derive(Hash, Eq, PartialEq)] +struct ShardSenderId { + shard: ShardIdentity, + sender_id: SenderId, +} + +impl Display for ShardSenderId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}{}", self.sender_id.0, self.shard.shard_slug()) + } +} + +impl ShardSenderId { + fn new(shard: ShardIdentity, sender_id: SenderId) -> Self { + ShardSenderId { shard, sender_id } + } + + fn shard(&self) -> ShardIdentity { + self.shard + } +} + +/// Shard-aware fan-out interpreted record reader. +/// Reads WAL from disk, decodes it, intepretets it, and sends +/// it to any [`InterpretedWalSender`] connected to it. +/// Each [`InterpretedWalSender`] corresponds to one shard +/// and gets interpreted records concerning that shard only. +pub(crate) struct InterpretedWalReader { + wal_stream: StreamingWalReader, + shard_senders: HashMap>, + shard_notification_rx: Option>, + state: Arc>, + pg_version: u32, +} + +/// A handle for [`InterpretedWalReader`] which allows for interacting with it +/// when it runs as a separate tokio task. +#[derive(Debug)] +pub(crate) struct InterpretedWalReaderHandle { + join_handle: JoinHandle>, + state: Arc>, + shard_notification_tx: tokio::sync::mpsc::UnboundedSender, +} + +struct ShardSenderState { + sender_id: SenderId, + tx: tokio::sync::mpsc::Sender, + next_record_lsn: Lsn, +} + +/// State of [`InterpretedWalReader`] visible outside of the task running it. +#[derive(Debug)] +pub(crate) enum InterpretedWalReaderState { + Running { current_position: Lsn }, + Done, +} + +pub(crate) struct Batch { wal_end_lsn: Lsn, available_wal_end_lsn: Lsn, records: InterpretedWalRecords, } -impl InterpretedWalSender<'_, IO> { - /// Send interpreted WAL to a receiver. - /// Stops when an error occurs or the receiver is caught up and there's no active compute. - /// - /// Err(CopyStreamHandlerEnd) is always returned; Result is used only for ? - /// convenience. - pub(crate) async fn run(self) -> Result<(), CopyStreamHandlerEnd> { - let mut wal_position = self.wal_stream_builder.start_pos(); - let mut wal_decoder = - WalStreamDecoder::new(self.wal_stream_builder.start_pos(), self.pg_version); +#[derive(thiserror::Error, Debug)] +pub enum InterpretedWalReaderError { + /// Handler initiates the end of streaming. 
+ #[error("decode error: {0}")] + Decode(#[from] WalDecodeError), + #[error("read or interpret error: {0}")] + ReadOrInterpret(#[from] anyhow::Error), + #[error("wal stream closed")] + WalStreamClosed, +} - let stream = self.wal_stream_builder.build(MAX_SEND_SIZE).await?; - let mut stream = std::pin::pin!(stream); +impl InterpretedWalReaderState { + fn current_position(&self) -> Option { + match self { + InterpretedWalReaderState::Running { + current_position, .. + } => Some(*current_position), + InterpretedWalReaderState::Done => None, + } + } +} - let mut keepalive_ticker = tokio::time::interval(Duration::from_secs(1)); - keepalive_ticker.set_missed_tick_behavior(MissedTickBehavior::Skip); - keepalive_ticker.reset(); +pub(crate) struct AttachShardNotification { + shard_id: ShardIdentity, + sender: tokio::sync::mpsc::Sender, + start_pos: Lsn, +} - let (tx, mut rx) = tokio::sync::mpsc::channel::(2); - let shard = vec![self.shard]; +impl InterpretedWalReader { + /// Spawn the reader in a separate tokio task and return a handle + pub(crate) fn spawn( + wal_stream: StreamingWalReader, + start_pos: Lsn, + tx: tokio::sync::mpsc::Sender, + shard: ShardIdentity, + pg_version: u32, + appname: &Option, + ) -> InterpretedWalReaderHandle { + let state = Arc::new(std::sync::RwLock::new(InterpretedWalReaderState::Running { + current_position: start_pos, + })); + + let (shard_notification_tx, shard_notification_rx) = tokio::sync::mpsc::unbounded_channel(); + + let reader = InterpretedWalReader { + wal_stream, + shard_senders: HashMap::from([( + shard, + smallvec::smallvec![ShardSenderState { + sender_id: SenderId::first(), + tx, + next_record_lsn: start_pos, + }], + )]), + shard_notification_rx: Some(shard_notification_rx), + state: state.clone(), + pg_version, + }; + + let metric = WAL_READERS + .get_metric_with_label_values(&["task", appname.as_deref().unwrap_or("safekeeper")]) + .unwrap(); + + let join_handle = tokio::task::spawn( + async move { + metric.inc(); + scopeguard::defer! { + metric.dec(); + } + + let res = reader.run_impl(start_pos).await; + if let Err(ref err) = res { + tracing::error!("Task finished with error: {err}"); + } + res + } + .instrument(info_span!("interpreted wal reader")), + ); + + InterpretedWalReaderHandle { + join_handle, + state, + shard_notification_tx, + } + } + + /// Construct the reader without spawning anything + /// Callers should drive the future returned by [`Self::run`]. + pub(crate) fn new( + wal_stream: StreamingWalReader, + start_pos: Lsn, + tx: tokio::sync::mpsc::Sender, + shard: ShardIdentity, + pg_version: u32, + ) -> InterpretedWalReader { + let state = Arc::new(std::sync::RwLock::new(InterpretedWalReaderState::Running { + current_position: start_pos, + })); + + InterpretedWalReader { + wal_stream, + shard_senders: HashMap::from([( + shard, + smallvec::smallvec![ShardSenderState { + sender_id: SenderId::first(), + tx, + next_record_lsn: start_pos, + }], + )]), + shard_notification_rx: None, + state: state.clone(), + pg_version, + } + } + + /// Entry point for future (polling) based wal reader. + pub(crate) async fn run( + self, + start_pos: Lsn, + appname: &Option, + ) -> Result<(), CopyStreamHandlerEnd> { + let metric = WAL_READERS + .get_metric_with_label_values(&["future", appname.as_deref().unwrap_or("safekeeper")]) + .unwrap(); + + metric.inc(); + scopeguard::defer! 
{ + metric.dec(); + } + + let res = self.run_impl(start_pos).await; + if let Err(err) = res { + tracing::error!("Interpreted wal reader encountered error: {err}"); + } else { + tracing::info!("Interpreted wal reader exiting"); + } + + Err(CopyStreamHandlerEnd::Other(anyhow!( + "interpreted wal reader finished" + ))) + } + + /// Send interpreted WAL to one or more [`InterpretedWalSender`]s + /// Stops when an error is encountered or when the [`InterpretedWalReaderHandle`] + /// goes out of scope. + async fn run_impl(mut self, start_pos: Lsn) -> Result<(), InterpretedWalReaderError> { + let defer_state = self.state.clone(); + scopeguard::defer! { + *defer_state.write().unwrap() = InterpretedWalReaderState::Done; + } + + let mut wal_decoder = WalStreamDecoder::new(start_pos, self.pg_version); loop { tokio::select! { - // Get some WAL from the stream and then: decode, interpret and push it down the - // pipeline. - wal = stream.next(), if tx.capacity() > 0 => { - let WalBytes { wal, wal_start_lsn: _, wal_end_lsn, available_wal_end_lsn } = match wal { - Some(some) => some?, - None => { break; } + // Main branch for reading WAL and forwarding it + wal_or_reset = self.wal_stream.next() => { + let wal = wal_or_reset.map(|wor| wor.get_wal().expect("reset handled in select branch below")); + let WalBytes { + wal, + wal_start_lsn: _, + wal_end_lsn, + available_wal_end_lsn, + } = match wal { + Some(some) => some.map_err(InterpretedWalReaderError::ReadOrInterpret)?, + None => { + // [`StreamingWalReader::next`] is an endless stream of WAL. + // It shouldn't ever finish unless it panicked or became internally + // inconsistent. + return Result::Err(InterpretedWalReaderError::WalStreamClosed); + } }; - wal_position = wal_end_lsn; wal_decoder.feed_bytes(&wal); - let mut records = Vec::new(); + // Deserialize and interpret WAL records from this batch of WAL. + // Interpreted records for each shard are collected separately. + let shard_ids = self.shard_senders.keys().copied().collect::>(); + let mut records_by_sender: HashMap> = HashMap::new(); let mut max_next_record_lsn = None; - while let Some((next_record_lsn, recdata)) = wal_decoder - .poll_decode() - .with_context(|| "Failed to decode WAL")? + while let Some((next_record_lsn, recdata)) = wal_decoder.poll_decode()? { assert!(next_record_lsn.is_aligned()); max_next_record_lsn = Some(next_record_lsn); - - // Deserialize and interpret WAL record let interpreted = InterpretedWalRecord::from_bytes_filtered( recdata, - &shard, + &shard_ids, next_record_lsn, self.pg_version, ) - .with_context(|| "Failed to interpret WAL")? - .remove(&self.shard) - .unwrap(); + .with_context(|| "Failed to interpret WAL")?; - if !interpreted.is_empty() { - records.push(interpreted); + for (shard, record) in interpreted { + if record.is_empty() { + continue; + } + + let mut states_iter = self.shard_senders + .get(&shard) + .expect("keys collected above") + .iter() + .filter(|state| record.next_record_lsn > state.next_record_lsn) + .peekable(); + while let Some(state) = states_iter.next() { + let shard_sender_id = ShardSenderId::new(shard, state.sender_id); + + // The most commont case is one sender per shard. Peek and break to avoid the + // clone in that situation. 
+ if states_iter.peek().is_none() { + records_by_sender.entry(shard_sender_id).or_default().push(record); + break; + } else { + records_by_sender.entry(shard_sender_id).or_default().push(record.clone()); + } + } } } @@ -103,20 +333,170 @@ impl InterpretedWalSender<'_, IO> { None => { continue; } }; - let batch = InterpretedWalRecords { - records, - next_record_lsn: Some(max_next_record_lsn), - }; + // Update the current position such that new receivers can decide + // whether to attach to us or spawn a new WAL reader. + match &mut *self.state.write().unwrap() { + InterpretedWalReaderState::Running { current_position, .. } => { + *current_position = max_next_record_lsn; + }, + InterpretedWalReaderState::Done => { + unreachable!() + } + } - tx.send(Batch {wal_end_lsn, available_wal_end_lsn, records: batch}).await.unwrap(); + // Send interpreted records downstream. Anything that has already been seen + // by a shard is filtered out. + let mut shard_senders_to_remove = Vec::new(); + for (shard, states) in &mut self.shard_senders { + for state in states { + if max_next_record_lsn <= state.next_record_lsn { + continue; + } + + let shard_sender_id = ShardSenderId::new(*shard, state.sender_id); + let records = records_by_sender.remove(&shard_sender_id).unwrap_or_default(); + + let batch = InterpretedWalRecords { + records, + next_record_lsn: Some(max_next_record_lsn), + }; + + let res = state.tx.send(Batch { + wal_end_lsn, + available_wal_end_lsn, + records: batch, + }).await; + + if res.is_err() { + shard_senders_to_remove.push(shard_sender_id); + } else { + state.next_record_lsn = max_next_record_lsn; + } + } + } + + // Clean up any shard senders that have dropped out. + // This is inefficient, but such events are rare (connection to PS termination) + // and the number of subscriptions on the same shards very small (only one + // for the steady state). + for to_remove in shard_senders_to_remove { + let shard_senders = self.shard_senders.get_mut(&to_remove.shard()).expect("saw it above"); + if let Some(idx) = shard_senders.iter().position(|s| s.sender_id == to_remove.sender_id) { + shard_senders.remove(idx); + tracing::info!("Removed shard sender {}", to_remove); + } + + if shard_senders.is_empty() { + self.shard_senders.remove(&to_remove.shard()); + } + } }, - // For a previously interpreted batch, serialize it and push it down the wire. - batch = rx.recv() => { + // Listen for new shards that want to attach to this reader. + // If the reader is not running as a task, then this is not supported + // (see the pending branch below). + notification = match self.shard_notification_rx.as_mut() { + Some(rx) => Either::Left(rx.recv()), + None => Either::Right(std::future::pending()) + } => { + if let Some(n) = notification { + let AttachShardNotification { shard_id, sender, start_pos } = n; + + // Update internal and external state, then reset the WAL stream + // if required. 
+ let senders = self.shard_senders.entry(shard_id).or_default(); + let new_sender_id = match senders.last() { + Some(sender) => sender.sender_id.next(), + None => SenderId::first() + }; + + senders.push(ShardSenderState { sender_id: new_sender_id, tx: sender, next_record_lsn: start_pos}); + let current_pos = self.state.read().unwrap().current_position().unwrap(); + if start_pos < current_pos { + self.wal_stream.reset(start_pos).await; + wal_decoder = WalStreamDecoder::new(start_pos, self.pg_version); + } + + tracing::info!( + "Added shard sender {} with start_pos={} current_pos={}", + ShardSenderId::new(shard_id, new_sender_id), start_pos, current_pos + ); + } + } + } + } + } +} + +impl InterpretedWalReaderHandle { + /// Fan-out the reader by attaching a new shard to it + pub(crate) fn fanout( + &self, + shard_id: ShardIdentity, + sender: tokio::sync::mpsc::Sender, + start_pos: Lsn, + ) -> Result<(), SendError> { + self.shard_notification_tx.send(AttachShardNotification { + shard_id, + sender, + start_pos, + }) + } + + /// Get the current WAL position of the reader + pub(crate) fn current_position(&self) -> Option { + self.state.read().unwrap().current_position() + } + + pub(crate) fn abort(&self) { + self.join_handle.abort() + } +} + +impl Drop for InterpretedWalReaderHandle { + fn drop(&mut self) { + tracing::info!("Aborting interpreted wal reader"); + self.abort() + } +} + +pub(crate) struct InterpretedWalSender<'a, IO> { + pub(crate) format: InterpretedFormat, + pub(crate) compression: Option, + pub(crate) appname: Option, + + pub(crate) tli: WalResidentTimeline, + pub(crate) start_lsn: Lsn, + + pub(crate) pgb: &'a mut PostgresBackend, + pub(crate) end_watch_view: EndWatchView, + pub(crate) wal_sender_guard: Arc, + pub(crate) rx: tokio::sync::mpsc::Receiver, +} + +impl InterpretedWalSender<'_, IO> { + /// Send interpreted WAL records over the network. + /// Also manages keep-alives if nothing was sent for a while. + pub(crate) async fn run(mut self) -> Result<(), CopyStreamHandlerEnd> { + let mut keepalive_ticker = tokio::time::interval(Duration::from_secs(1)); + keepalive_ticker.set_missed_tick_behavior(MissedTickBehavior::Skip); + keepalive_ticker.reset(); + + let mut wal_position = self.start_lsn; + + loop { + tokio::select! { + batch = self.rx.recv() => { let batch = match batch { Some(b) => b, - None => { break; } + None => { + return Result::Err( + CopyStreamHandlerEnd::Other(anyhow!("Interpreted WAL reader exited early")) + ); + } }; + wal_position = batch.wal_end_lsn; + let buf = batch .records .to_wire(self.format, self.compression) @@ -136,7 +516,21 @@ impl InterpretedWalSender<'_, IO> { })).await?; } // Send a periodic keep alive when the connection has been idle for a while. + // Since we've been idle, also check if we can stop streaming. _ = keepalive_ticker.tick() => { + if let Some(remote_consistent_lsn) = self.wal_sender_guard + .walsenders() + .get_ws_remote_consistent_lsn(self.wal_sender_guard.id()) + { + if self.tli.should_walsender_stop(remote_consistent_lsn).await { + // Stop streaming if the receivers are caught up and + // there's no active compute. This causes the loop in + // [`crate::send_interpreted_wal::InterpretedWalSender::run`] + // to exit and terminate the WAL stream. 
+ break; + } + } + self.pgb .write_message(&BeMessage::KeepAlive(WalSndKeepAlive { wal_end: self.end_watch_view.get().0, @@ -144,14 +538,259 @@ impl InterpretedWalSender<'_, IO> { request_reply: true, })) .await?; - } + }, } } - // The loop above ends when the receiver is caught up and there's no more WAL to send. Err(CopyStreamHandlerEnd::ServerInitiated(format!( "ending streaming to {:?} at {}, receiver is caughtup and there is no computes", self.appname, wal_position, ))) } } +#[cfg(test)] +mod tests { + use std::{collections::HashMap, str::FromStr, time::Duration}; + + use pageserver_api::shard::{ShardIdentity, ShardStripeSize}; + use postgres_ffi::MAX_SEND_SIZE; + use tokio::sync::mpsc::error::TryRecvError; + use utils::{ + id::{NodeId, TenantTimelineId}, + lsn::Lsn, + shard::{ShardCount, ShardNumber}, + }; + + use crate::{ + send_interpreted_wal::{Batch, InterpretedWalReader}, + test_utils::Env, + wal_reader_stream::StreamingWalReader, + }; + + #[tokio::test] + async fn test_interpreted_wal_reader_fanout() { + let _ = env_logger::builder().is_test(true).try_init(); + + const SIZE: usize = 8 * 1024; + const MSG_COUNT: usize = 200; + const PG_VERSION: u32 = 17; + const SHARD_COUNT: u8 = 2; + + let start_lsn = Lsn::from_str("0/149FD18").unwrap(); + let env = Env::new(true).unwrap(); + let tli = env + .make_timeline(NodeId(1), TenantTimelineId::generate(), start_lsn) + .await + .unwrap(); + + let resident_tli = tli.wal_residence_guard().await.unwrap(); + let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT) + .await + .unwrap(); + let end_pos = end_watch.get(); + + tracing::info!("Doing first round of reads ..."); + + let streaming_wal_reader = StreamingWalReader::new( + resident_tli, + None, + start_lsn, + end_pos, + end_watch, + MAX_SEND_SIZE, + ); + + let shard_0 = ShardIdentity::new( + ShardNumber(0), + ShardCount(SHARD_COUNT), + ShardStripeSize::default(), + ) + .unwrap(); + + let shard_1 = ShardIdentity::new( + ShardNumber(1), + ShardCount(SHARD_COUNT), + ShardStripeSize::default(), + ) + .unwrap(); + + let mut shards = HashMap::new(); + + for shard_number in 0..SHARD_COUNT { + let shard_id = ShardIdentity::new( + ShardNumber(shard_number), + ShardCount(SHARD_COUNT), + ShardStripeSize::default(), + ) + .unwrap(); + let (tx, rx) = tokio::sync::mpsc::channel::(MSG_COUNT * 2); + shards.insert(shard_id, (Some(tx), Some(rx))); + } + + let shard_0_tx = shards.get_mut(&shard_0).unwrap().0.take().unwrap(); + let mut shard_0_rx = shards.get_mut(&shard_0).unwrap().1.take().unwrap(); + + let handle = InterpretedWalReader::spawn( + streaming_wal_reader, + start_lsn, + shard_0_tx, + shard_0, + PG_VERSION, + &Some("pageserver".to_string()), + ); + + tracing::info!("Reading all WAL with only shard 0 attached ..."); + + let mut shard_0_interpreted_records = Vec::new(); + while let Some(batch) = shard_0_rx.recv().await { + shard_0_interpreted_records.push(batch.records); + if batch.wal_end_lsn == batch.available_wal_end_lsn { + break; + } + } + + let shard_1_tx = shards.get_mut(&shard_1).unwrap().0.take().unwrap(); + let mut shard_1_rx = shards.get_mut(&shard_1).unwrap().1.take().unwrap(); + + tracing::info!("Attaching shard 1 to the reader at start of WAL"); + handle.fanout(shard_1, shard_1_tx, start_lsn).unwrap(); + + tracing::info!("Reading all WAL with shard 0 and shard 1 attached ..."); + + let mut shard_1_interpreted_records = Vec::new(); + while let Some(batch) = shard_1_rx.recv().await { + shard_1_interpreted_records.push(batch.records); + if batch.wal_end_lsn == 
batch.available_wal_end_lsn { + break; + } + } + + // This test uses logical messages. Those only go to shard 0. Check that the + // filtering worked and shard 1 did not get any. + assert!(shard_1_interpreted_records + .iter() + .all(|recs| recs.records.is_empty())); + + // Shard 0 should not receive anything more since the reader is + // going through wal that it has already processed. + let res = shard_0_rx.try_recv(); + if let Ok(ref ok) = res { + tracing::error!( + "Shard 0 received batch: wal_end_lsn={} available_wal_end_lsn={}", + ok.wal_end_lsn, + ok.available_wal_end_lsn + ); + } + assert!(matches!(res, Err(TryRecvError::Empty))); + + // Check that the next records lsns received by the two shards match up. + let shard_0_next_lsns = shard_0_interpreted_records + .iter() + .map(|recs| recs.next_record_lsn) + .collect::>(); + let shard_1_next_lsns = shard_1_interpreted_records + .iter() + .map(|recs| recs.next_record_lsn) + .collect::>(); + assert_eq!(shard_0_next_lsns, shard_1_next_lsns); + + handle.abort(); + let mut done = false; + for _ in 0..5 { + if handle.current_position().is_none() { + done = true; + break; + } + tokio::time::sleep(Duration::from_millis(1)).await; + } + + assert!(done); + } + + #[tokio::test] + async fn test_interpreted_wal_reader_same_shard_fanout() { + let _ = env_logger::builder().is_test(true).try_init(); + + const SIZE: usize = 8 * 1024; + const MSG_COUNT: usize = 200; + const PG_VERSION: u32 = 17; + const SHARD_COUNT: u8 = 2; + const ATTACHED_SHARDS: u8 = 4; + + let start_lsn = Lsn::from_str("0/149FD18").unwrap(); + let env = Env::new(true).unwrap(); + let tli = env + .make_timeline(NodeId(1), TenantTimelineId::generate(), start_lsn) + .await + .unwrap(); + + let resident_tli = tli.wal_residence_guard().await.unwrap(); + let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT) + .await + .unwrap(); + let end_pos = end_watch.get(); + + let streaming_wal_reader = StreamingWalReader::new( + resident_tli, + None, + start_lsn, + end_pos, + end_watch, + MAX_SEND_SIZE, + ); + + let shard_0 = ShardIdentity::new( + ShardNumber(0), + ShardCount(SHARD_COUNT), + ShardStripeSize::default(), + ) + .unwrap(); + + let (tx, rx) = tokio::sync::mpsc::channel::(MSG_COUNT * 2); + let mut batch_receivers = vec![rx]; + + let handle = InterpretedWalReader::spawn( + streaming_wal_reader, + start_lsn, + tx, + shard_0, + PG_VERSION, + &Some("pageserver".to_string()), + ); + + for _ in 0..(ATTACHED_SHARDS - 1) { + let (tx, rx) = tokio::sync::mpsc::channel::(MSG_COUNT * 2); + handle.fanout(shard_0, tx, start_lsn).unwrap(); + batch_receivers.push(rx); + } + + loop { + let batch = batch_receivers.first_mut().unwrap().recv().await.unwrap(); + for rx in batch_receivers.iter_mut().skip(1) { + let other_batch = rx.recv().await.unwrap(); + + assert_eq!(batch.wal_end_lsn, other_batch.wal_end_lsn); + assert_eq!( + batch.available_wal_end_lsn, + other_batch.available_wal_end_lsn + ); + } + + if batch.wal_end_lsn == batch.available_wal_end_lsn { + break; + } + } + + handle.abort(); + let mut done = false; + for _ in 0..5 { + if handle.current_position().is_none() { + done = true; + break; + } + tokio::time::sleep(Duration::from_millis(1)).await; + } + + assert!(done); + } +} diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 8463221998..4a4a74a0fd 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -2,16 +2,18 @@ //! with the "START_REPLICATION" message, and registry of walsenders. 
use crate::handler::SafekeeperPostgresHandler; -use crate::metrics::RECEIVED_PS_FEEDBACKS; +use crate::metrics::{RECEIVED_PS_FEEDBACKS, WAL_READERS}; use crate::receive_wal::WalReceivers; use crate::safekeeper::TermLsn; -use crate::send_interpreted_wal::InterpretedWalSender; +use crate::send_interpreted_wal::{ + Batch, InterpretedWalReader, InterpretedWalReaderHandle, InterpretedWalSender, +}; use crate::timeline::WalResidentTimeline; -use crate::wal_reader_stream::WalReaderStreamBuilder; +use crate::wal_reader_stream::StreamingWalReader; use crate::wal_storage::WalReader; use anyhow::{bail, Context as AnyhowContext}; use bytes::Bytes; -use futures::future::Either; +use futures::FutureExt; use parking_lot::Mutex; use postgres_backend::PostgresBackend; use postgres_backend::{CopyStreamHandlerEnd, PostgresBackendReader, QueryError}; @@ -19,16 +21,16 @@ use postgres_ffi::get_current_timestamp; use postgres_ffi::{TimestampTz, MAX_SEND_SIZE}; use pq_proto::{BeMessage, WalSndKeepAlive, XLogDataBody}; use safekeeper_api::models::{ - ConnectionId, HotStandbyFeedback, ReplicationFeedback, StandbyFeedback, StandbyReply, - WalSenderState, INVALID_FULL_TRANSACTION_ID, + HotStandbyFeedback, ReplicationFeedback, StandbyFeedback, StandbyReply, + INVALID_FULL_TRANSACTION_ID, }; use safekeeper_api::Term; use tokio::io::{AsyncRead, AsyncWrite}; use utils::failpoint_support; -use utils::id::TenantTimelineId; use utils::pageserver_feedback::PageserverFeedback; use utils::postgres_client::PostgresClientProtocol; +use itertools::Itertools; use std::cmp::{max, min}; use std::net::SocketAddr; use std::sync::Arc; @@ -50,6 +52,12 @@ pub struct WalSenders { walreceivers: Arc, } +pub struct WalSendersTimelineMetricValues { + pub ps_feedback_counter: u64, + pub last_ps_feedback: PageserverFeedback, + pub interpreted_wal_reader_tasks: usize, +} + impl WalSenders { pub fn new(walreceivers: Arc) -> Arc { Arc::new(WalSenders { @@ -60,21 +68,8 @@ impl WalSenders { /// Register new walsender. Returned guard provides access to the slot and /// automatically deregisters in Drop. - fn register( - self: &Arc, - ttid: TenantTimelineId, - addr: SocketAddr, - conn_id: ConnectionId, - appname: Option, - ) -> WalSenderGuard { + fn register(self: &Arc, walsender_state: WalSenderState) -> WalSenderGuard { let slots = &mut self.mutex.lock().slots; - let walsender_state = WalSenderState { - ttid, - addr, - conn_id, - appname, - feedback: ReplicationFeedback::Pageserver(PageserverFeedback::empty()), - }; // find empty slot or create new one let pos = if let Some(pos) = slots.iter().position(|s| s.is_none()) { slots[pos] = Some(walsender_state); @@ -90,9 +85,79 @@ impl WalSenders { } } + fn create_or_update_interpreted_reader< + FUp: FnOnce(&Arc) -> anyhow::Result<()>, + FNew: FnOnce() -> InterpretedWalReaderHandle, + >( + self: &Arc, + id: WalSenderId, + start_pos: Lsn, + max_delta_for_fanout: Option, + update: FUp, + create: FNew, + ) -> anyhow::Result<()> { + let state = &mut self.mutex.lock(); + + let mut selected_interpreted_reader = None; + for slot in state.slots.iter().flatten() { + if let WalSenderState::Interpreted(slot_state) = slot { + if let Some(ref interpreted_reader) = slot_state.interpreted_wal_reader { + let select = match (interpreted_reader.current_position(), max_delta_for_fanout) + { + (Some(pos), Some(max_delta)) => { + let delta = pos.0.abs_diff(start_pos.0); + delta <= max_delta + } + // Reader is not active + (None, _) => false, + // Gating fanout by max delta is disabled. + // Attach to any active reader. 
+ (_, None) => true, + }; + + if select { + selected_interpreted_reader = Some(interpreted_reader.clone()); + break; + } + } + } + } + + let slot = state.get_slot_mut(id); + let slot_state = match slot { + WalSenderState::Interpreted(s) => s, + WalSenderState::Vanilla(_) => unreachable!(), + }; + + let selected_or_new = match selected_interpreted_reader { + Some(selected) => { + update(&selected)?; + selected + } + None => Arc::new(create()), + }; + + slot_state.interpreted_wal_reader = Some(selected_or_new); + + Ok(()) + } + /// Get state of all walsenders. - pub fn get_all(self: &Arc) -> Vec { - self.mutex.lock().slots.iter().flatten().cloned().collect() + pub fn get_all_public(self: &Arc) -> Vec { + self.mutex + .lock() + .slots + .iter() + .flatten() + .map(|state| match state { + WalSenderState::Vanilla(s) => { + safekeeper_api::models::WalSenderState::Vanilla(s.clone()) + } + WalSenderState::Interpreted(s) => { + safekeeper_api::models::WalSenderState::Interpreted(s.public_state.clone()) + } + }) + .collect() } /// Get LSN of the most lagging pageserver receiver. Return None if there are no @@ -103,7 +168,7 @@ impl WalSenders { .slots .iter() .flatten() - .filter_map(|s| match s.feedback { + .filter_map(|s| match s.get_feedback() { ReplicationFeedback::Pageserver(feedback) => Some(feedback.last_received_lsn), ReplicationFeedback::Standby(_) => None, }) @@ -111,9 +176,25 @@ impl WalSenders { } /// Returns total counter of pageserver feedbacks received and last feedback. - pub fn get_ps_feedback_stats(self: &Arc) -> (u64, PageserverFeedback) { + pub fn info_for_metrics(self: &Arc) -> WalSendersTimelineMetricValues { let shared = self.mutex.lock(); - (shared.ps_feedback_counter, shared.last_ps_feedback) + + let interpreted_wal_reader_tasks = shared + .slots + .iter() + .filter_map(|ss| match ss { + Some(WalSenderState::Interpreted(int)) => int.interpreted_wal_reader.as_ref(), + Some(WalSenderState::Vanilla(_)) => None, + None => None, + }) + .unique_by(|reader| Arc::as_ptr(reader)) + .count(); + + WalSendersTimelineMetricValues { + ps_feedback_counter: shared.ps_feedback_counter, + last_ps_feedback: shared.last_ps_feedback, + interpreted_wal_reader_tasks, + } } /// Get aggregated hot standby feedback (we send it to compute). @@ -124,7 +205,7 @@ impl WalSenders { /// Record new pageserver feedback, update aggregated values. 
fn record_ps_feedback(self: &Arc, id: WalSenderId, feedback: &PageserverFeedback) { let mut shared = self.mutex.lock(); - shared.get_slot_mut(id).feedback = ReplicationFeedback::Pageserver(*feedback); + *shared.get_slot_mut(id).get_mut_feedback() = ReplicationFeedback::Pageserver(*feedback); shared.last_ps_feedback = *feedback; shared.ps_feedback_counter += 1; drop(shared); @@ -143,10 +224,10 @@ impl WalSenders { "Record standby reply: ts={} apply_lsn={}", reply.reply_ts, reply.apply_lsn ); - match &mut slot.feedback { + match &mut slot.get_mut_feedback() { ReplicationFeedback::Standby(sf) => sf.reply = *reply, ReplicationFeedback::Pageserver(_) => { - slot.feedback = ReplicationFeedback::Standby(StandbyFeedback { + *slot.get_mut_feedback() = ReplicationFeedback::Standby(StandbyFeedback { reply: *reply, hs_feedback: HotStandbyFeedback::empty(), }) @@ -158,10 +239,10 @@ impl WalSenders { fn record_hs_feedback(self: &Arc, id: WalSenderId, feedback: &HotStandbyFeedback) { let mut shared = self.mutex.lock(); let slot = shared.get_slot_mut(id); - match &mut slot.feedback { + match &mut slot.get_mut_feedback() { ReplicationFeedback::Standby(sf) => sf.hs_feedback = *feedback, ReplicationFeedback::Pageserver(_) => { - slot.feedback = ReplicationFeedback::Standby(StandbyFeedback { + *slot.get_mut_feedback() = ReplicationFeedback::Standby(StandbyFeedback { reply: StandbyReply::empty(), hs_feedback: *feedback, }) @@ -175,7 +256,7 @@ impl WalSenders { pub fn get_ws_remote_consistent_lsn(self: &Arc, id: WalSenderId) -> Option { let shared = self.mutex.lock(); let slot = shared.get_slot(id); - match slot.feedback { + match slot.get_feedback() { ReplicationFeedback::Pageserver(feedback) => Some(feedback.remote_consistent_lsn), _ => None, } @@ -199,6 +280,47 @@ struct WalSendersShared { slots: Vec>, } +/// Safekeeper internal definitions of wal sender state +/// +/// As opposed to [`safekeeper_api::models::WalSenderState`] these struct may +/// include state that we don not wish to expose to the public api. 
+#[derive(Debug, Clone)] +pub(crate) enum WalSenderState { + Vanilla(VanillaWalSenderInternalState), + Interpreted(InterpretedWalSenderInternalState), +} + +type VanillaWalSenderInternalState = safekeeper_api::models::VanillaWalSenderState; + +#[derive(Debug, Clone)] +pub(crate) struct InterpretedWalSenderInternalState { + public_state: safekeeper_api::models::InterpretedWalSenderState, + interpreted_wal_reader: Option>, +} + +impl WalSenderState { + fn get_addr(&self) -> &SocketAddr { + match self { + WalSenderState::Vanilla(state) => &state.addr, + WalSenderState::Interpreted(state) => &state.public_state.addr, + } + } + + fn get_feedback(&self) -> &ReplicationFeedback { + match self { + WalSenderState::Vanilla(state) => &state.feedback, + WalSenderState::Interpreted(state) => &state.public_state.feedback, + } + } + + fn get_mut_feedback(&mut self) -> &mut ReplicationFeedback { + match self { + WalSenderState::Vanilla(state) => &mut state.feedback, + WalSenderState::Interpreted(state) => &mut state.public_state.feedback, + } + } +} + impl WalSendersShared { fn new() -> Self { WalSendersShared { @@ -225,7 +347,7 @@ impl WalSendersShared { let mut agg = HotStandbyFeedback::empty(); let mut reply_agg = StandbyReply::empty(); for ws_state in self.slots.iter().flatten() { - if let ReplicationFeedback::Standby(standby_feedback) = ws_state.feedback { + if let ReplicationFeedback::Standby(standby_feedback) = ws_state.get_feedback() { let hs_feedback = standby_feedback.hs_feedback; // doing Option math like op1.iter().chain(op2.iter()).min() // would be nicer, but we serialize/deserialize this struct @@ -317,7 +439,7 @@ impl SafekeeperPostgresHandler { /// Wrapper around handle_start_replication_guts handling result. Error is /// handled here while we're still in walsender ttid span; with API /// extension, this can probably be moved into postgres_backend. - pub async fn handle_start_replication( + pub async fn handle_start_replication( &mut self, pgb: &mut PostgresBackend, start_pos: Lsn, @@ -342,7 +464,7 @@ impl SafekeeperPostgresHandler { Ok(()) } - pub async fn handle_start_replication_guts( + pub async fn handle_start_replication_guts( &mut self, pgb: &mut PostgresBackend, start_pos: Lsn, @@ -352,12 +474,30 @@ impl SafekeeperPostgresHandler { let appname = self.appname.clone(); // Use a guard object to remove our entry from the timeline when we are done. - let ws_guard = Arc::new(tli.get_walsenders().register( - self.ttid, - *pgb.get_peer_addr(), - self.conn_id, - self.appname.clone(), - )); + let ws_guard = match self.protocol() { + PostgresClientProtocol::Vanilla => Arc::new(tli.get_walsenders().register( + WalSenderState::Vanilla(VanillaWalSenderInternalState { + ttid: self.ttid, + addr: *pgb.get_peer_addr(), + conn_id: self.conn_id, + appname: self.appname.clone(), + feedback: ReplicationFeedback::Pageserver(PageserverFeedback::empty()), + }), + )), + PostgresClientProtocol::Interpreted { .. 
} => Arc::new(tli.get_walsenders().register( + WalSenderState::Interpreted(InterpretedWalSenderInternalState { + public_state: safekeeper_api::models::InterpretedWalSenderState { + ttid: self.ttid, + shard: self.shard.unwrap(), + addr: *pgb.get_peer_addr(), + conn_id: self.conn_id, + appname: self.appname.clone(), + feedback: ReplicationFeedback::Pageserver(PageserverFeedback::empty()), + }, + interpreted_wal_reader: None, + }), + )), + }; // Walsender can operate in one of two modes which we select by // application_name: give only committed WAL (used by pageserver) or all @@ -403,7 +543,7 @@ impl SafekeeperPostgresHandler { pgb, // should succeed since we're already holding another guard tli: tli.wal_residence_guard().await?, - appname, + appname: appname.clone(), start_pos, end_pos, term, @@ -413,7 +553,7 @@ impl SafekeeperPostgresHandler { send_buf: vec![0u8; MAX_SEND_SIZE], }; - Either::Left(sender.run()) + FutureExt::boxed(sender.run()) } PostgresClientProtocol::Interpreted { format, @@ -421,27 +561,96 @@ impl SafekeeperPostgresHandler { } => { let pg_version = tli.tli.get_state().await.1.server.pg_version / 10000; let end_watch_view = end_watch.view(); - let wal_stream_builder = WalReaderStreamBuilder { - tli: tli.wal_residence_guard().await?, - start_pos, - end_pos, - term, - end_watch, - wal_sender_guard: ws_guard.clone(), - }; + let wal_residence_guard = tli.wal_residence_guard().await?; + let (tx, rx) = tokio::sync::mpsc::channel::(2); + let shard = self.shard.unwrap(); - let sender = InterpretedWalSender { - format, - compression, - pgb, - wal_stream_builder, - end_watch_view, - shard: self.shard.unwrap(), - pg_version, - appname, - }; + if self.conf.wal_reader_fanout && !shard.is_unsharded() { + let ws_id = ws_guard.id(); + ws_guard.walsenders().create_or_update_interpreted_reader( + ws_id, + start_pos, + self.conf.max_delta_for_fanout, + { + let tx = tx.clone(); + |reader| { + tracing::info!( + "Fanning out interpreted wal reader at {}", + start_pos + ); + reader + .fanout(shard, tx, start_pos) + .with_context(|| "Failed to fan out reader") + } + }, + || { + tracing::info!("Spawning interpreted wal reader at {}", start_pos); - Either::Right(sender.run()) + let wal_stream = StreamingWalReader::new( + wal_residence_guard, + term, + start_pos, + end_pos, + end_watch, + MAX_SEND_SIZE, + ); + + InterpretedWalReader::spawn( + wal_stream, start_pos, tx, shard, pg_version, &appname, + ) + }, + )?; + + let sender = InterpretedWalSender { + format, + compression, + appname, + tli: tli.wal_residence_guard().await?, + start_lsn: start_pos, + pgb, + end_watch_view, + wal_sender_guard: ws_guard.clone(), + rx, + }; + + FutureExt::boxed(sender.run()) + } else { + let wal_reader = StreamingWalReader::new( + wal_residence_guard, + term, + start_pos, + end_pos, + end_watch, + MAX_SEND_SIZE, + ); + + let reader = + InterpretedWalReader::new(wal_reader, start_pos, tx, shard, pg_version); + + let sender = InterpretedWalSender { + format, + compression, + appname: appname.clone(), + tli: tli.wal_residence_guard().await?, + start_lsn: start_pos, + pgb, + end_watch_view, + wal_sender_guard: ws_guard.clone(), + rx, + }; + + FutureExt::boxed(async move { + // Sender returns an Err on all code paths. + // If the sender finishes first, we will drop the reader future. + // If the reader finishes first, the sender will finish too since + // the wal sender has dropped. 
+ let res = tokio::try_join!(sender.run(), reader.run(start_pos, &appname)); + match res.map(|_| ()) { + Ok(_) => unreachable!("sender finishes with Err by convention"), + err_res => err_res, + } + }) + } } }; @@ -470,7 +679,8 @@ impl SafekeeperPostgresHandler { .clone(); info!( "finished streaming to {}, feedback={:?}", - ws_state.addr, ws_state.feedback, + ws_state.get_addr(), + ws_state.get_feedback(), ); // Join pg backend back. @@ -578,6 +788,18 @@ impl WalSender<'_, IO> { /// Err(CopyStreamHandlerEnd) is always returned; Result is used only for ? /// convenience. async fn run(mut self) -> Result<(), CopyStreamHandlerEnd> { + let metric = WAL_READERS + .get_metric_with_label_values(&[ + "future", + self.appname.as_deref().unwrap_or("safekeeper"), + ]) + .unwrap(); + + metric.inc(); + scopeguard::defer! { + metric.dec(); + } + loop { // Wait for the next portion if it is not there yet, or just // update our end of WAL available for sending value, we @@ -813,7 +1035,7 @@ impl ReplyReader { #[cfg(test)] mod tests { use safekeeper_api::models::FullTransactionId; - use utils::id::{TenantId, TimelineId}; + use utils::id::{TenantId, TenantTimelineId, TimelineId}; use super::*; @@ -830,13 +1052,13 @@ mod tests { // add to wss specified feedback setting other fields to dummy values fn push_feedback(wss: &mut WalSendersShared, feedback: ReplicationFeedback) { - let walsender_state = WalSenderState { + let walsender_state = WalSenderState::Vanilla(VanillaWalSenderInternalState { ttid: mock_ttid(), addr: mock_addr(), conn_id: 1, appname: None, feedback, - }; + }); wss.slots.push(Some(walsender_state)) } diff --git a/safekeeper/src/test_utils.rs b/safekeeper/src/test_utils.rs index c40a8bae5a..4e851c5b3d 100644 --- a/safekeeper/src/test_utils.rs +++ b/safekeeper/src/test_utils.rs @@ -1,13 +1,19 @@ use std::sync::Arc; use crate::rate_limit::RateLimiter; -use crate::safekeeper::{ProposerAcceptorMessage, ProposerElected, SafeKeeper, TermHistory}; +use crate::receive_wal::WalAcceptor; +use crate::safekeeper::{ + AcceptorProposerMessage, AppendRequest, AppendRequestHeader, ProposerAcceptorMessage, + ProposerElected, SafeKeeper, TermHistory, +}; +use crate::send_wal::EndWatch; use crate::state::{TimelinePersistentState, TimelineState}; use crate::timeline::{get_timeline_dir, SharedState, StateSK, Timeline}; use crate::timelines_set::TimelinesSet; use crate::wal_backup::remote_timeline_path; -use crate::{control_file, wal_storage, SafeKeeperConf}; +use crate::{control_file, receive_wal, wal_storage, SafeKeeperConf}; use camino_tempfile::Utf8TempDir; +use postgres_ffi::v17::wal_generator::{LogicalMessageGenerator, WalGenerator}; use tokio::fs::create_dir_all; use utils::id::{NodeId, TenantTimelineId}; use utils::lsn::Lsn; @@ -107,4 +113,59 @@ impl Env { ); Ok(timeline) } + + // This will be dead code when building a non-benchmark target with the + // benchmarking feature enabled. 
+ #[allow(dead_code)] + pub(crate) async fn write_wal( + tli: Arc, + start_lsn: Lsn, + msg_size: usize, + msg_count: usize, + ) -> anyhow::Result { + let (msg_tx, msg_rx) = tokio::sync::mpsc::channel(receive_wal::MSG_QUEUE_SIZE); + let (reply_tx, mut reply_rx) = tokio::sync::mpsc::channel(receive_wal::REPLY_QUEUE_SIZE); + + let end_watch = EndWatch::Commit(tli.get_commit_lsn_watch_rx()); + + WalAcceptor::spawn(tli.wal_residence_guard().await?, msg_rx, reply_tx, Some(0)); + + let prefix = c"p"; + let prefixlen = prefix.to_bytes_with_nul().len(); + assert!(msg_size >= prefixlen); + let message = vec![0; msg_size - prefixlen]; + + let walgen = + &mut WalGenerator::new(LogicalMessageGenerator::new(prefix, &message), start_lsn); + for _ in 0..msg_count { + let (lsn, record) = walgen.next().unwrap(); + + let req = AppendRequest { + h: AppendRequestHeader { + term: 1, + term_start_lsn: start_lsn, + begin_lsn: lsn, + end_lsn: lsn + record.len() as u64, + commit_lsn: lsn, + truncate_lsn: Lsn(0), + proposer_uuid: [0; 16], + }, + wal_data: record, + }; + + let end_lsn = req.h.end_lsn; + + let msg = ProposerAcceptorMessage::AppendRequest(req); + msg_tx.send(msg).await?; + while let Some(reply) = reply_rx.recv().await { + if let AcceptorProposerMessage::AppendResponse(resp) = reply { + if resp.flush_lsn >= end_lsn { + break; + } + } + } + } + + Ok(end_watch) + } } diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 2882391074..5eb0bd7146 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -35,7 +35,7 @@ use crate::control_file; use crate::rate_limit::RateLimiter; use crate::receive_wal::WalReceivers; use crate::safekeeper::{AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, TermLsn}; -use crate::send_wal::WalSenders; +use crate::send_wal::{WalSenders, WalSendersTimelineMetricValues}; use crate::state::{EvictionState, TimelineMemState, TimelinePersistentState, TimelineState}; use crate::timeline_guard::ResidenceGuard; use crate::timeline_manager::{AtomicStatus, ManagerCtl}; @@ -712,16 +712,22 @@ impl Timeline { return None; } - let (ps_feedback_count, last_ps_feedback) = self.walsenders.get_ps_feedback_stats(); + let WalSendersTimelineMetricValues { + ps_feedback_counter, + last_ps_feedback, + interpreted_wal_reader_tasks, + } = self.walsenders.info_for_metrics(); + let state = self.read_shared_state().await; Some(FullTimelineInfo { ttid: self.ttid, - ps_feedback_count, + ps_feedback_count: ps_feedback_counter, last_ps_feedback, wal_backup_active: self.wal_backup_active.load(Ordering::Relaxed), timeline_is_active: self.broker_active.load(Ordering::Relaxed), num_computes: self.walreceivers.get_num() as u32, last_removed_segno: self.last_removed_segno.load(Ordering::Relaxed), + interpreted_wal_reader_tasks, epoch_start_lsn: state.sk.term_start_lsn(), mem_state: state.sk.state().inmem.clone(), persisted_state: TimelinePersistentState::clone(state.sk.state()), @@ -740,7 +746,7 @@ impl Timeline { debug_dump::Memory { is_cancelled: self.is_cancelled(), peers_info_len: state.peers_info.0.len(), - walsenders: self.walsenders.get_all(), + walsenders: self.walsenders.get_all_public(), wal_backup_active: self.wal_backup_active.load(Ordering::Relaxed), active: self.broker_active.load(Ordering::Relaxed), num_computes: self.walreceivers.get_num() as u32, diff --git a/safekeeper/src/wal_reader_stream.rs b/safekeeper/src/wal_reader_stream.rs index aea628c208..adac6067da 100644 --- a/safekeeper/src/wal_reader_stream.rs +++ b/safekeeper/src/wal_reader_stream.rs @@ 
-1,34 +1,16 @@ -use std::sync::Arc; - -use async_stream::try_stream; -use bytes::Bytes; -use futures::Stream; -use postgres_backend::CopyStreamHandlerEnd; -use safekeeper_api::Term; -use std::time::Duration; -use tokio::time::timeout; -use utils::lsn::Lsn; - -use crate::{ - send_wal::{EndWatch, WalSenderGuard}, - timeline::WalResidentTimeline, +use std::{ + pin::Pin, + task::{Context, Poll}, }; -pub(crate) struct WalReaderStreamBuilder { - pub(crate) tli: WalResidentTimeline, - pub(crate) start_pos: Lsn, - pub(crate) end_pos: Lsn, - pub(crate) term: Option, - pub(crate) end_watch: EndWatch, - pub(crate) wal_sender_guard: Arc, -} +use bytes::Bytes; +use futures::{stream::BoxStream, Stream, StreamExt}; +use utils::lsn::Lsn; -impl WalReaderStreamBuilder { - pub(crate) fn start_pos(&self) -> Lsn { - self.start_pos - } -} +use crate::{send_wal::EndWatch, timeline::WalResidentTimeline, wal_storage::WalReader}; +use safekeeper_api::Term; +#[derive(PartialEq, Eq, Debug)] pub(crate) struct WalBytes { /// Raw PG WAL pub(crate) wal: Bytes, @@ -44,106 +26,270 @@ pub(crate) struct WalBytes { pub(crate) available_wal_end_lsn: Lsn, } -impl WalReaderStreamBuilder { - /// Builds a stream of Postgres WAL starting from [`Self::start_pos`]. - /// The stream terminates when the receiver (pageserver) is fully caught up - /// and there's no active computes. - pub(crate) async fn build( - self, - buffer_size: usize, - ) -> anyhow::Result>> { - // TODO(vlad): The code below duplicates functionality from [`crate::send_wal`]. - // We can make the raw WAL sender use this stream too and remove the duplication. - let Self { - tli, - mut start_pos, - mut end_pos, - term, - mut end_watch, - wal_sender_guard, - } = self; - let mut wal_reader = tli.get_walreader(start_pos).await?; - let mut buffer = vec![0; buffer_size]; +struct PositionedWalReader { + start: Lsn, + end: Lsn, + reader: Option, +} - const POLL_STATE_TIMEOUT: Duration = Duration::from_secs(1); +/// A streaming WAL reader wrapper which can be reset while running +pub(crate) struct StreamingWalReader { + stream: BoxStream<'static, WalOrReset>, + start_changed_tx: tokio::sync::watch::Sender, +} - Ok(try_stream! { - loop { - let have_something_to_send = end_pos > start_pos; +pub(crate) enum WalOrReset { + Wal(anyhow::Result), + Reset(Lsn), +} - if !have_something_to_send { - // wait for lsn - let res = timeout(POLL_STATE_TIMEOUT, end_watch.wait_for_lsn(start_pos, term)).await; - match res { - Ok(ok) => { - end_pos = ok?; - }, - Err(_) => { - if let EndWatch::Commit(_) = end_watch { - if let Some(remote_consistent_lsn) = wal_sender_guard - .walsenders() - .get_ws_remote_consistent_lsn(wal_sender_guard.id()) - { - if tli.should_walsender_stop(remote_consistent_lsn).await { - // Stop streaming if the receivers are caught up and - // there's no active compute. This causes the loop in - // [`crate::send_interpreted_wal::InterpretedWalSender::run`] - // to exit and terminate the WAL stream. - return; - } - } - } - - continue; - } - } - } - - - assert!( - end_pos > start_pos, - "nothing to send after waiting for WAL" - ); - - // try to send as much as available, capped by the buffer size - let mut chunk_end_pos = start_pos + buffer_size as u64; - // if we went behind available WAL, back off - if chunk_end_pos >= end_pos { - chunk_end_pos = end_pos; - } else { - // If sending not up to end pos, round down to page boundary to - // avoid breaking WAL record not at page boundary, as protocol - // demands. See walsender.c (XLogSendPhysical). 
- chunk_end_pos = chunk_end_pos - .checked_sub(chunk_end_pos.block_offset()) - .unwrap(); - } - let send_size = (chunk_end_pos.0 - start_pos.0) as usize; - let buffer = &mut buffer[..send_size]; - let send_size: usize; - { - // If uncommitted part is being pulled, check that the term is - // still the expected one. - let _term_guard = if let Some(t) = term { - Some(tli.acquire_term(t).await?) - } else { - None - }; - // Read WAL into buffer. send_size can be additionally capped to - // segment boundary here. - send_size = wal_reader.read(buffer).await? - }; - let wal = Bytes::copy_from_slice(&buffer[..send_size]); - - yield WalBytes { - wal, - wal_start_lsn: start_pos, - wal_end_lsn: start_pos + send_size as u64, - available_wal_end_lsn: end_pos - }; - - start_pos += send_size as u64; - } - }) +impl WalOrReset { + pub(crate) fn get_wal(self) -> Option> { + match self { + WalOrReset::Wal(wal) => Some(wal), + WalOrReset::Reset(_) => None, + } + } +} + +impl StreamingWalReader { + pub(crate) fn new( + tli: WalResidentTimeline, + term: Option, + start: Lsn, + end: Lsn, + end_watch: EndWatch, + buffer_size: usize, + ) -> Self { + let (start_changed_tx, start_changed_rx) = tokio::sync::watch::channel(start); + + let state = WalReaderStreamState { + tli, + wal_reader: PositionedWalReader { + start, + end, + reader: None, + }, + term, + end_watch, + buffer: vec![0; buffer_size], + buffer_size, + }; + + // When a change notification is received while polling the internal + // reader, stop polling the read future and service the change. + let stream = futures::stream::unfold( + (state, start_changed_rx), + |(mut state, mut rx)| async move { + let wal_or_reset = tokio::select! { + read_res = state.read() => { WalOrReset::Wal(read_res) }, + changed_res = rx.changed() => { + if changed_res.is_err() { + return None; + } + + let new_start_pos = rx.borrow_and_update(); + WalOrReset::Reset(*new_start_pos) + } + }; + + if let WalOrReset::Reset(lsn) = wal_or_reset { + state.wal_reader.start = lsn; + state.wal_reader.reader = None; + } + + Some((wal_or_reset, (state, rx))) + }, + ) + .boxed(); + + Self { + stream, + start_changed_tx, + } + } + + /// Reset the stream to a given position. + pub(crate) async fn reset(&mut self, start: Lsn) { + self.start_changed_tx.send(start).unwrap(); + while let Some(wal_or_reset) = self.stream.next().await { + match wal_or_reset { + WalOrReset::Reset(at) => { + // Stream confirmed the reset. + // There may only one ongoing reset at any given time, + // hence the assertion. 
+ assert_eq!(at, start); + break; + } + WalOrReset::Wal(_) => { + // Ignore wal generated before reset was handled + } + } + } + } +} + +impl Stream for StreamingWalReader { + type Item = WalOrReset; + + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + Pin::new(&mut self.stream).poll_next(cx) + } +} + +struct WalReaderStreamState { + tli: WalResidentTimeline, + wal_reader: PositionedWalReader, + term: Option, + end_watch: EndWatch, + buffer: Vec, + buffer_size: usize, +} + +impl WalReaderStreamState { + async fn read(&mut self) -> anyhow::Result { + // Create reader if needed + if self.wal_reader.reader.is_none() { + self.wal_reader.reader = Some(self.tli.get_walreader(self.wal_reader.start).await?); + } + + let have_something_to_send = self.wal_reader.end > self.wal_reader.start; + if !have_something_to_send { + tracing::debug!( + "Waiting for wal: start={}, end={}", + self.wal_reader.end, + self.wal_reader.start + ); + self.wal_reader.end = self + .end_watch + .wait_for_lsn(self.wal_reader.start, self.term) + .await?; + tracing::debug!( + "Done waiting for wal: start={}, end={}", + self.wal_reader.end, + self.wal_reader.start + ); + } + + assert!( + self.wal_reader.end > self.wal_reader.start, + "nothing to send after waiting for WAL" + ); + + // Calculate chunk size + let mut chunk_end_pos = self.wal_reader.start + self.buffer_size as u64; + if chunk_end_pos >= self.wal_reader.end { + chunk_end_pos = self.wal_reader.end; + } else { + chunk_end_pos = chunk_end_pos + .checked_sub(chunk_end_pos.block_offset()) + .unwrap(); + } + + let send_size = (chunk_end_pos.0 - self.wal_reader.start.0) as usize; + let buffer = &mut self.buffer[..send_size]; + + // Read WAL + let send_size = { + let _term_guard = if let Some(t) = self.term { + Some(self.tli.acquire_term(t).await?) + } else { + None + }; + self.wal_reader + .reader + .as_mut() + .unwrap() + .read(buffer) + .await? 
+ }; + + let wal = Bytes::copy_from_slice(&buffer[..send_size]); + let result = WalBytes { + wal, + wal_start_lsn: self.wal_reader.start, + wal_end_lsn: self.wal_reader.start + send_size as u64, + available_wal_end_lsn: self.wal_reader.end, + }; + + self.wal_reader.start += send_size as u64; + + Ok(result) + } +} + +#[cfg(test)] +mod tests { + use std::str::FromStr; + + use futures::StreamExt; + use postgres_ffi::MAX_SEND_SIZE; + use utils::{ + id::{NodeId, TenantTimelineId}, + lsn::Lsn, + }; + + use crate::{test_utils::Env, wal_reader_stream::StreamingWalReader}; + + #[tokio::test] + async fn test_streaming_wal_reader_reset() { + let _ = env_logger::builder().is_test(true).try_init(); + + const SIZE: usize = 8 * 1024; + const MSG_COUNT: usize = 200; + + let start_lsn = Lsn::from_str("0/149FD18").unwrap(); + let env = Env::new(true).unwrap(); + let tli = env + .make_timeline(NodeId(1), TenantTimelineId::generate(), start_lsn) + .await + .unwrap(); + + let resident_tli = tli.wal_residence_guard().await.unwrap(); + let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT) + .await + .unwrap(); + let end_pos = end_watch.get(); + + tracing::info!("Doing first round of reads ..."); + + let mut streaming_wal_reader = StreamingWalReader::new( + resident_tli, + None, + start_lsn, + end_pos, + end_watch, + MAX_SEND_SIZE, + ); + + let mut before_reset = Vec::new(); + while let Some(wor) = streaming_wal_reader.next().await { + let wal = wor.get_wal().unwrap().unwrap(); + let stop = wal.available_wal_end_lsn == wal.wal_end_lsn; + before_reset.push(wal); + + if stop { + break; + } + } + + tracing::info!("Resetting the WAL stream ..."); + + streaming_wal_reader.reset(start_lsn).await; + + tracing::info!("Doing second round of reads ..."); + + let mut after_reset = Vec::new(); + while let Some(wor) = streaming_wal_reader.next().await { + let wal = wor.get_wal().unwrap().unwrap(); + let stop = wal.available_wal_end_lsn == wal.wal_end_lsn; + after_reset.push(wal); + + if stop { + break; + } + } + + assert_eq!(before_reset, after_reset); } } diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs index a99de71a04..e0d593851e 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper.rs @@ -178,6 +178,8 @@ pub fn run_server(os: NodeOs, disk: Arc) -> Result<()> { control_file_save_interval: Duration::from_secs(1), partial_backup_concurrency: 1, eviction_min_resident: Duration::ZERO, + wal_reader_fanout: false, + max_delta_for_fanout: None, }; let mut global = GlobalMap::new(disk, conf.clone())?; From 3d41069dc4002a0444c3cf6afaa87a05d95bcdb5 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Wed, 15 Jan 2025 10:26:58 -0600 Subject: [PATCH 196/583] Update pgrx in extension builds to 0.12.9 (#10372) Signed-off-by: Tristan Partin --- compute/compute-node.Dockerfile | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 299f4444a3..1ee159e5df 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -871,7 +871,7 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux chmod +x rustup-init && \ ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \ rm rustup-init && \ - cargo install --locked --version 0.12.6 cargo-pgrx && \ + cargo install --locked --version 0.12.9 cargo-pgrx && \ /bin/bash -c 'cargo pgrx init 
--pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config' USER root @@ -908,19 +908,19 @@ RUN apt update && apt install --no-install-recommends --no-install-suggests -y p mkdir pgrag-src && cd pgrag-src && tar xzf ../pgrag.tar.gz --strip-components=1 -C . && \ \ cd exts/rag && \ - sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/rag.control && \ \ cd ../rag_bge_small_en_v15 && \ - sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ ORT_LIB_LOCATION=/home/nonroot/onnxruntime-src/build/Linux \ REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/bge_small_en_v15.onnx \ cargo pgrx install --release --features remote_onnx && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/rag_bge_small_en_v15.control && \ \ cd ../rag_jina_reranker_v1_tiny_en && \ - sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ ORT_LIB_LOCATION=/home/nonroot/onnxruntime-src/build/Linux \ REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/jina_reranker_v1_tiny_en.onnx \ cargo pgrx install --release --features remote_onnx && \ @@ -945,7 +945,8 @@ RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.3.tar. # against postgres forks that decided to change their ABI name (like us). # With that we can build extensions without forking them and using stock # pgx. As this feature is new few manual version bumps were required. - sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx-tests = "0.12.6"/pgrx-tests = "0.12.9"/g' Cargo.toml && \ cargo pgrx install --release && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_jsonschema.control @@ -963,7 +964,8 @@ ARG PG_VERSION RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.5.9.tar.gz -O pg_graphql.tar.gz && \ echo "cf768385a41278be1333472204fc0328118644ae443182cf52f7b9b23277e497 pg_graphql.tar.gz" | sha256sum --check && \ mkdir pg_graphql-src && cd pg_graphql-src && tar xzf ../pg_graphql.tar.gz --strip-components=1 -C . 
&& \ - sed -i 's/pgrx = "=0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx = "=0.12.6"/pgrx = { version = "=0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx-tests = "=0.12.6"/pgrx-tests = "=0.12.9"/g' Cargo.toml && \ cargo pgrx install --release && \ # it's needed to enable extension because it uses untrusted C language sed -i 's/superuser = false/superuser = true/g' /usr/local/pgsql/share/extension/pg_graphql.control && \ @@ -984,9 +986,8 @@ ARG PG_VERSION RUN wget https://github.com/kelvich/pg_tiktoken/archive/9118dd4549b7d8c0bbc98e04322499f7bf2fa6f7.tar.gz -O pg_tiktoken.tar.gz && \ echo "a5bc447e7920ee149d3c064b8b9f0086c0e83939499753178f7d35788416f628 pg_tiktoken.tar.gz" | sha256sum --check && \ mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \ - # TODO update pgrx version in the pg_tiktoken repo and remove this line - sed -i 's/pgrx = { version = "=0.10.2",/pgrx = { version = "0.11.3",/g' Cargo.toml && \ - sed -i 's/pgrx-tests = "=0.10.2"/pgrx-tests = "0.11.3"/g' Cargo.toml && \ + sed -i 's/pgrx = { version = "=0.12.6",/pgrx = { version = "0.12.9",/g' Cargo.toml && \ + sed -i 's/pgrx-tests = "=0.12.6"/pgrx-tests = "0.12.9"/g' Cargo.toml && \ cargo pgrx install --release && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_tiktoken.control @@ -1028,7 +1029,11 @@ ARG PG_VERSION RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.2.0.tar.gz -O pg_session_jwt.tar.gz && \ echo "5ace028e591f2e000ca10afa5b1ca62203ebff014c2907c0ec3b29c36f28a1bb pg_session_jwt.tar.gz" | sha256sum --check && \ mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . && \ - sed -i 's/pgrx = "0.12.6"/pgrx = { version = "=0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/version = "0.12.6"/version = "0.12.9"/g' pgrx-tests/Cargo.toml && \ + sed -i 's/pgrx = "=0.12.6"/pgrx = { version = "=0.12.9", features = [ "unsafe-postgres" ] }/g' pgrx-tests/Cargo.toml && \ + sed -i 's/pgrx-macros = "=0.12.6"/pgrx-macros = "=0.12.9"/g' pgrx-tests/Cargo.toml && \ + sed -i 's/pgrx-pg-config = "=0.12.6"/pgrx-pg-config = "=0.12.9"/g' pgrx-tests/Cargo.toml && \ cargo pgrx install --release ######################################################################################### From efaec6cdf81f71ded5428aeccc6e9f8f607606e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 15 Jan 2025 19:15:30 +0100 Subject: [PATCH 197/583] Add endpoint and storcon cli cmd to set sk scheduling policy (#10400) Implementing the last missing endpoint of #9981, this adds support to set the scheduling policy of an individual safekeeper, as specified in the RFC. However, unlike in the RFC we call the endpoint `scheduling_policy` not `status` Closes #9981. As for why not use the upsert endpoint for this: we want to have the safekeeper upsert endpoint be used for testing and for deploying new safekeepers, but not for changes of the scheduling policy. We don't want to change any of the other fields when marking a safekeeper as decommissioned for example, so we'd have to first fetch them only to then specify them again. 
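For illustration only, with the dedicated endpoint added here the same operation becomes a single, self-describing call. A rough sketch (the storage controller address, node id, and auth token are placeholders, not part of this patch; the JSON shape follows the new `SafekeeperSchedulingPolicyRequest`):

```bash
# Set the scheduling policy of one safekeeper without touching any other field.
curl -sf -X POST "http://<storage-controller>/control/v1/safekeeper/<node_id>/scheduling_policy" \
  -H "Content-Type: application/json" \
  -d '{"scheduling_policy": "Disabled"}'
```

The same call is wrapped by the new CLI subcommand added in this patch, roughly `storcon_cli safekeeper-scheduling --node-id <id> --scheduling-policy disabled`.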
Of course one can also design an endpoint where one can omit any field and it doesn't get modified, but it's still not great for observability to put everything into one big "change something about this safekeeper" endpoint. --- control_plane/storcon_cli/src/main.rs | 47 ++++++++++++++++++- libs/pageserver_api/src/controller_api.rs | 7 ++- storage_controller/src/http.rs | 44 ++++++++++++++++- storage_controller/src/persistence.rs | 31 ++++++++++++ storage_controller/src/service.rs | 12 ++++- test_runner/fixtures/neon_fixtures.py | 12 ++++- .../regress/test_storage_controller.py | 11 +++++ 7 files changed, 155 insertions(+), 9 deletions(-) diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 2ba8f63678..96bfad4c86 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -9,8 +9,9 @@ use clap::{Parser, Subcommand}; use pageserver_api::{ controller_api::{ AvailabilityZone, NodeAvailabilityWrapper, NodeDescribeResponse, NodeShardResponse, - SafekeeperDescribeResponse, ShardSchedulingPolicy, ShardsPreferredAzsRequest, - TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest, + SafekeeperDescribeResponse, SafekeeperSchedulingPolicyRequest, ShardSchedulingPolicy, + ShardsPreferredAzsRequest, SkSchedulingPolicy, TenantCreateRequest, TenantDescribeResponse, + TenantPolicyRequest, }, models::{ EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary, @@ -231,6 +232,13 @@ enum Command { }, /// List safekeepers known to the storage controller Safekeepers {}, + /// Set the scheduling policy of the specified safekeeper + SafekeeperScheduling { + #[arg(long)] + node_id: NodeId, + #[arg(long)] + scheduling_policy: SkSchedulingPolicyArg, + }, } #[derive(Parser)] @@ -283,6 +291,24 @@ impl FromStr for PlacementPolicyArg { } } +#[derive(Debug, Clone)] +struct SkSchedulingPolicyArg(SkSchedulingPolicy); + +impl FromStr for SkSchedulingPolicyArg { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + match s { + "active" => Ok(Self(SkSchedulingPolicy::Active)), + "disabled" => Ok(Self(SkSchedulingPolicy::Disabled)), + "decomissioned" => Ok(Self(SkSchedulingPolicy::Decomissioned)), + _ => Err(anyhow::anyhow!( + "Unknown scheduling policy '{s}', try active,disabled,decomissioned" + )), + } + } +} + #[derive(Debug, Clone)] struct ShardSchedulingPolicyArg(ShardSchedulingPolicy); @@ -1202,6 +1228,23 @@ async fn main() -> anyhow::Result<()> { } println!("{table}"); } + Command::SafekeeperScheduling { + node_id, + scheduling_policy, + } => { + let scheduling_policy = scheduling_policy.0; + storcon_client + .dispatch::( + Method::POST, + format!("control/v1/safekeeper/{node_id}/scheduling_policy"), + Some(SafekeeperSchedulingPolicyRequest { scheduling_policy }), + ) + .await?; + println!( + "Scheduling policy of {node_id} set to {}", + String::from(scheduling_policy) + ); + } } Ok(()) diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index f3880cb766..08d1fa55b9 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -416,8 +416,6 @@ pub struct MetadataHealthListOutdatedResponse { } /// Publicly exposed safekeeper description -/// -/// The `active` flag which we have in the DB is not included on purpose: it is deprecated. 
#[derive(Serialize, Deserialize, Clone)] pub struct SafekeeperDescribeResponse { pub id: NodeId, @@ -433,6 +431,11 @@ pub struct SafekeeperDescribeResponse { pub scheduling_policy: SkSchedulingPolicy, } +#[derive(Serialize, Deserialize, Clone)] +pub struct SafekeeperSchedulingPolicyRequest { + pub scheduling_policy: SkSchedulingPolicy, +} + #[cfg(test)] mod test { use super::*; diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 03d8f11992..ac890b008f 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -15,7 +15,7 @@ use metrics::{BuildInfo, NeonMetrics}; use pageserver_api::controller_api::{ MetadataHealthListOutdatedRequest, MetadataHealthListOutdatedResponse, MetadataHealthListUnhealthyResponse, MetadataHealthUpdateRequest, MetadataHealthUpdateResponse, - ShardsPreferredAzsRequest, TenantCreateRequest, + SafekeeperSchedulingPolicyRequest, ShardsPreferredAzsRequest, TenantCreateRequest, }; use pageserver_api::models::{ TenantConfigPatchRequest, TenantConfigRequest, TenantLocationConfigRequest, @@ -1305,6 +1305,35 @@ async fn handle_upsert_safekeeper(mut req: Request) -> Result, +) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let body = json_request::(&mut req).await?; + let id = parse_request_param::(&req, "id")?; + + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + + let state = get_state(&req); + + state + .service + .set_safekeeper_scheduling_policy(id, body.scheduling_policy) + .await?; + + Ok(Response::builder() + .status(StatusCode::NO_CONTENT) + .body(Body::empty()) + .unwrap()) +} + /// Common wrapper for request handlers that call into Service and will operate on tenants: they must only /// be allowed to run if Service has finished its initial reconciliation. 
async fn tenant_service_handler( @@ -1873,7 +1902,18 @@ pub fn make_router( }) .post("/control/v1/safekeeper/:id", |r| { // id is in the body - named_request_span(r, handle_upsert_safekeeper, RequestName("v1_safekeeper")) + named_request_span( + r, + handle_upsert_safekeeper, + RequestName("v1_safekeeper_post"), + ) + }) + .post("/control/v1/safekeeper/:id/scheduling_policy", |r| { + named_request_span( + r, + handle_safekeeper_scheduling_policy, + RequestName("v1_safekeeper_status"), + ) }) // Tenant Shard operations .put("/control/v1/tenant/:tenant_shard_id/migrate", |r| { diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index eb0bfc879e..37bfaf1139 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -1104,6 +1104,37 @@ impl Persistence { }) .await } + + pub(crate) async fn set_safekeeper_scheduling_policy( + &self, + id_: i64, + scheduling_policy_: SkSchedulingPolicy, + ) -> Result<(), DatabaseError> { + use crate::schema::safekeepers::dsl::*; + + self.with_conn(move |conn| -> DatabaseResult<()> { + #[derive(Insertable, AsChangeset)] + #[diesel(table_name = crate::schema::safekeepers)] + struct UpdateSkSchedulingPolicy<'a> { + id: i64, + scheduling_policy: &'a str, + } + let scheduling_policy_ = String::from(scheduling_policy_); + + let rows_affected = diesel::update(safekeepers.filter(id.eq(id_))) + .set(scheduling_policy.eq(scheduling_policy_)) + .execute(conn)?; + + if rows_affected != 1 { + return Err(DatabaseError::Logical(format!( + "unexpected number of rows ({rows_affected})", + ))); + } + + Ok(()) + }) + .await + } } /// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 57f4cc8463..1d85839881 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -47,7 +47,7 @@ use pageserver_api::{ AvailabilityZone, MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, NodeShard, NodeShardResponse, PlacementPolicy, SafekeeperDescribeResponse, ShardSchedulingPolicy, ShardsPreferredAzsRequest, - ShardsPreferredAzsResponse, TenantCreateRequest, TenantCreateResponse, + ShardsPreferredAzsResponse, SkSchedulingPolicy, TenantCreateRequest, TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse, TenantDescribeResponseShard, TenantLocateResponse, TenantPolicyRequest, TenantShardMigrateRequest, TenantShardMigrateResponse, @@ -7651,6 +7651,16 @@ impl Service { self.persistence.safekeeper_upsert(record).await } + pub(crate) async fn set_safekeeper_scheduling_policy( + &self, + id: i64, + scheduling_policy: SkSchedulingPolicy, + ) -> Result<(), DatabaseError> { + self.persistence + .set_safekeeper_scheduling_policy(id, scheduling_policy) + .await + } + pub(crate) async fn update_shards_preferred_azs( &self, req: ShardsPreferredAzsRequest, diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index c47739cd81..c3950e9bf7 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2336,6 +2336,14 @@ class NeonStorageController(MetricsGetter, LogUtils): json=body, ) + def safekeeper_scheduling_policy(self, id: int, scheduling_policy: str): + self.request( + "POST", + f"{self.api}/control/v1/safekeeper/{id}/scheduling_policy", + headers=self.headers(TokenScope.ADMIN), + json={"id": id, "scheduling_policy": scheduling_policy}, + ) + 
def get_safekeeper(self, id: int) -> dict[str, Any] | None: try: response = self.request( @@ -4135,7 +4143,7 @@ class Endpoint(PgProtocol, LogUtils): # Checkpoints running endpoint and returns pg_wal size in MB. def get_pg_wal_size(self): - log.info(f'checkpointing at LSN {self.safe_psql("select pg_current_wal_lsn()")[0][0]}') + log.info(f"checkpointing at LSN {self.safe_psql('select pg_current_wal_lsn()')[0][0]}") self.safe_psql("checkpoint") assert self.pgdata_dir is not None # please mypy return get_dir_size(self.pgdata_dir / "pg_wal") / 1024 / 1024 @@ -4975,7 +4983,7 @@ def logical_replication_sync( if res: log.info(f"subscriber_lsn={res}") subscriber_lsn = Lsn(res) - log.info(f"Subscriber LSN={subscriber_lsn}, publisher LSN={ publisher_lsn}") + log.info(f"Subscriber LSN={subscriber_lsn}, publisher LSN={publisher_lsn}") if subscriber_lsn >= publisher_lsn: return subscriber_lsn time.sleep(0.5) diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index b5d109559f..b1e1fd81d6 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -3208,6 +3208,17 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder): assert eq_safekeeper_records(body, inserted_now) + # some small tests for the scheduling policy querying and returning APIs + newest_info = target.get_safekeeper(inserted["id"]) + assert newest_info + assert newest_info["scheduling_policy"] == "Disabled" + target.safekeeper_scheduling_policy(inserted["id"], "Decomissioned") + newest_info = target.get_safekeeper(inserted["id"]) + assert newest_info + assert newest_info["scheduling_policy"] == "Decomissioned" + # Ensure idempotency + target.safekeeper_scheduling_policy(inserted["id"], "Decomissioned") + def eq_safekeeper_records(a: dict[str, Any], b: dict[str, Any]) -> bool: compared = [dict(a), dict(b)] From fb0e2acb2f8333d279371da84c17e3e6db5c31f2 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 15 Jan 2025 19:07:22 +0000 Subject: [PATCH 198/583] pageserver: add `page_trace` API for debugging (#10293) ## Problem When a pageserver is receiving high rates of requests, we don't have a good way to efficiently discover what the client's access pattern is. Closes: https://github.com/neondatabase/neon/issues/10275 ## Summary of changes - Add `/v1/tenant/x/timeline/y/page_trace?size_limit_bytes=...&time_limit_secs=...` API, which returns a binary buffer. - Add `pagectl page-trace` tool to decode and analyze the output. 
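For reference, a rough way to exercise the new API and tool from a shell (pageserver address, tenant and timeline ids are placeholders, auth omitted; the query parameters shown are the defaults):

```bash
# Capture up to ~1 MiB of GetPage metadata, for at most 5 seconds.
curl -sf "http://<pageserver-http-addr>/v1/tenant/<tenant_shard_id>/timeline/<timeline_id>/page_trace?size_limit_bytes=1048576&time_limit_secs=5" \
  -o page_trace.bin

# Decode the binary trace: prints multi-read keys and reads grouped by relation.
pagectl page-trace page_trace.bin
```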
--------- Co-authored-by: Erik Grinaker --- Cargo.lock | 3 ++ libs/pageserver_api/src/key.rs | 4 +- libs/pageserver_api/src/models.rs | 19 +++++++- pageserver/Cargo.toml | 1 + pageserver/ctl/Cargo.toml | 2 + pageserver/ctl/src/main.rs | 4 ++ pageserver/ctl/src/page_trace.rs | 73 +++++++++++++++++++++++++++++++ pageserver/src/http/routes.rs | 72 ++++++++++++++++++++++++++++++ pageserver/src/page_service.rs | 15 +++++++ pageserver/src/tenant/timeline.rs | 9 +++- 10 files changed, 199 insertions(+), 3 deletions(-) create mode 100644 pageserver/ctl/src/page_trace.rs diff --git a/Cargo.lock b/Cargo.lock index afe16ff848..3f184ebe0b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3981,9 +3981,11 @@ name = "pagectl" version = "0.1.0" dependencies = [ "anyhow", + "bincode", "camino", "clap", "humantime", + "itertools 0.10.5", "pageserver", "pageserver_api", "postgres_ffi", @@ -4005,6 +4007,7 @@ dependencies = [ "arc-swap", "async-compression", "async-stream", + "bincode", "bit_field", "byteorder", "bytes", diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 328dea5dec..dbd45da314 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -24,7 +24,9 @@ pub struct Key { /// When working with large numbers of Keys in-memory, it is more efficient to handle them as i128 than as /// a struct of fields. -#[derive(Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize, Debug)] +#[derive( + Clone, Copy, Default, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize, Debug, +)] pub struct CompactKey(i128); /// The storage key size. diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 9af6c4021d..87e8df2ab6 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -29,7 +29,7 @@ use utils::{ }; use crate::{ - key::Key, + key::{CompactKey, Key}, reltag::RelTag, shard::{ShardCount, ShardStripeSize, TenantShardId}, }; @@ -1981,6 +1981,23 @@ impl PagestreamBeMessage { } } +#[derive(Debug, Serialize, Deserialize)] +pub struct PageTraceEvent { + pub key: CompactKey, + pub effective_lsn: Lsn, + pub time: SystemTime, +} + +impl Default for PageTraceEvent { + fn default() -> Self { + Self { + key: Default::default(), + effective_lsn: Default::default(), + time: std::time::UNIX_EPOCH, + } + } +} + #[cfg(test)] mod tests { use serde_json::json; diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 8547746d94..9195951191 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -16,6 +16,7 @@ arc-swap.workspace = true async-compression.workspace = true async-stream.workspace = true bit_field.workspace = true +bincode.workspace = true byteorder.workspace = true bytes.workspace = true camino.workspace = true diff --git a/pageserver/ctl/Cargo.toml b/pageserver/ctl/Cargo.toml index 39ca47568c..7b70f0dc87 100644 --- a/pageserver/ctl/Cargo.toml +++ b/pageserver/ctl/Cargo.toml @@ -8,9 +8,11 @@ license.workspace = true [dependencies] anyhow.workspace = true +bincode.workspace = true camino.workspace = true clap = { workspace = true, features = ["string"] } humantime.workspace = true +itertools.workspace = true pageserver = { path = ".." 
} pageserver_api.workspace = true remote_storage = { path = "../../libs/remote_storage" } diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs index a0aac89dc8..353b4bd2f9 100644 --- a/pageserver/ctl/src/main.rs +++ b/pageserver/ctl/src/main.rs @@ -9,7 +9,9 @@ mod index_part; mod key; mod layer_map_analyzer; mod layers; +mod page_trace; +use page_trace::PageTraceCmd; use std::{ str::FromStr, time::{Duration, SystemTime}, @@ -64,6 +66,7 @@ enum Commands { Layer(LayerCmd), /// Debug print a hex key found from logs Key(key::DescribeKeyCommand), + PageTrace(PageTraceCmd), } /// Read and update pageserver metadata file @@ -183,6 +186,7 @@ async fn main() -> anyhow::Result<()> { .await?; } Commands::Key(dkc) => dkc.execute(), + Commands::PageTrace(cmd) => page_trace::main(&cmd)?, }; Ok(()) } diff --git a/pageserver/ctl/src/page_trace.rs b/pageserver/ctl/src/page_trace.rs new file mode 100644 index 0000000000..da0de72fd9 --- /dev/null +++ b/pageserver/ctl/src/page_trace.rs @@ -0,0 +1,73 @@ +use std::collections::HashMap; +use std::io::BufReader; + +use camino::Utf8PathBuf; +use clap::Parser; +use itertools::Itertools as _; +use pageserver_api::key::{CompactKey, Key}; +use pageserver_api::models::PageTraceEvent; +use pageserver_api::reltag::RelTag; + +/// Parses a page trace (as emitted by the `page_trace` timeline API), and outputs stats. +#[derive(Parser)] +pub(crate) struct PageTraceCmd { + /// Trace input file. + path: Utf8PathBuf, +} + +pub(crate) fn main(cmd: &PageTraceCmd) -> anyhow::Result<()> { + let mut file = BufReader::new(std::fs::OpenOptions::new().read(true).open(&cmd.path)?); + let mut events: Vec = Vec::new(); + loop { + match bincode::deserialize_from(&mut file) { + Ok(event) => events.push(event), + Err(err) => { + if let bincode::ErrorKind::Io(ref err) = *err { + if err.kind() == std::io::ErrorKind::UnexpectedEof { + break; + } + } + return Err(err.into()); + } + } + } + + let mut reads_by_relation: HashMap = HashMap::new(); + let mut reads_by_key: HashMap = HashMap::new(); + + for event in events { + let key = Key::from_compact(event.key); + let reltag = RelTag { + spcnode: key.field2, + dbnode: key.field3, + relnode: key.field4, + forknum: key.field5, + }; + + *reads_by_relation.entry(reltag).or_default() += 1; + *reads_by_key.entry(event.key).or_default() += 1; + } + + let multi_read_keys = reads_by_key + .into_iter() + .filter(|(_, count)| *count > 1) + .sorted_by_key(|(key, count)| (-*count, *key)) + .collect_vec(); + + println!("Multi-read keys: {}", multi_read_keys.len()); + for (key, count) in multi_read_keys { + println!(" {key}: {count}"); + } + + let reads_by_relation = reads_by_relation + .into_iter() + .sorted_by_key(|(rel, count)| (-*count, *rel)) + .collect_vec(); + + println!("Reads by relation:"); + for (reltag, count) in reads_by_relation { + println!(" {reltag}: {count}"); + } + + Ok(()) +} diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 94e0b101bd..33b2d04588 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -27,6 +27,7 @@ use pageserver_api::models::LocationConfigMode; use pageserver_api::models::LsnLease; use pageserver_api::models::LsnLeaseRequest; use pageserver_api::models::OffloadedTimelineInfo; +use pageserver_api::models::PageTraceEvent; use pageserver_api::models::ShardParameters; use pageserver_api::models::TenantConfigPatchRequest; use pageserver_api::models::TenantDetails; @@ -51,7 +52,9 @@ use pageserver_api::shard::TenantShardId; use remote_storage::DownloadError; 
use remote_storage::GenericRemoteStorage; use remote_storage::TimeTravelError; +use scopeguard::defer; use tenant_size_model::{svg::SvgBranchKind, SizeResult, StorageModel}; +use tokio::time::Instant; use tokio_util::io::StreamReader; use tokio_util::sync::CancellationToken; use tracing::*; @@ -1521,6 +1524,71 @@ async fn timeline_gc_unblocking_handler( block_or_unblock_gc(request, false).await } +/// Traces GetPage@LSN requests for a timeline, and emits metadata in an efficient binary encoding. +/// Use the `pagectl page-trace` command to decode and analyze the output. +async fn timeline_page_trace_handler( + request: Request, + cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + let state = get_state(&request); + check_permission(&request, None)?; + + let size_limit: usize = parse_query_param(&request, "size_limit_bytes")?.unwrap_or(1024 * 1024); + let time_limit_secs: u64 = parse_query_param(&request, "time_limit_secs")?.unwrap_or(5); + + // Convert size limit to event limit based on the serialized size of an event. The event size is + // fixed, as the default bincode serializer uses fixed-width integer encoding. + let event_size = bincode::serialize(&PageTraceEvent::default()) + .map_err(|err| ApiError::InternalServerError(err.into()))? + .len(); + let event_limit = size_limit / event_size; + + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; + + // Install a page trace, unless one is already in progress. We just use a buffered channel, + // which may 2x the memory usage in the worst case, but it's still bounded. + let (trace_tx, mut trace_rx) = tokio::sync::mpsc::channel(event_limit); + let cur = timeline.page_trace.load(); + let installed = cur.is_none() + && timeline + .page_trace + .compare_and_swap(cur, Some(Arc::new(trace_tx))) + .is_none(); + if !installed { + return Err(ApiError::Conflict("page trace already active".to_string())); + } + defer!(timeline.page_trace.store(None)); // uninstall on return + + // Collect the trace and return it to the client. We could stream the response, but this is + // simple and fine. + let mut body = Vec::with_capacity(size_limit); + let deadline = Instant::now() + Duration::from_secs(time_limit_secs); + + while body.len() < size_limit { + tokio::select! { + event = trace_rx.recv() => { + let Some(event) = event else { + break; // shouldn't happen (sender doesn't close, unless timeline dropped) + }; + bincode::serialize_into(&mut body, &event) + .map_err(|err| ApiError::InternalServerError(err.into()))?; + } + _ = tokio::time::sleep_until(deadline) => break, // time limit reached + _ = cancel.cancelled() => return Err(ApiError::Cancelled), + } + } + + Ok(Response::builder() + .status(StatusCode::OK) + .header(header::CONTENT_TYPE, "application/octet-stream") + .body(hyper::Body::from(body)) + .unwrap()) +} + /// Adding a block is `POST ../block_gc`, removing a block is `POST ../unblock_gc`. /// /// Both are technically unsafe because they might fire off index uploads, thus they are POST. 
@@ -3479,6 +3547,10 @@ pub fn make_router( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/unblock_gc", |r| api_handler(r, timeline_gc_unblocking_handler), ) + .get( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/page_trace", + |r| api_handler(r, timeline_page_trace_handler), + ) .post("/v1/tenant/:tenant_shard_id/heatmap_upload", |r| { api_handler(r, secondary_upload_handler) }) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index b3e18fed99..da4180a927 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -67,6 +67,7 @@ use crate::tenant::PageReconstructError; use crate::tenant::Timeline; use crate::{basebackup, timed_after_cancellation}; use pageserver_api::key::rel_block_to_key; +use pageserver_api::models::PageTraceEvent; use pageserver_api::reltag::SlruKind; use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID; use postgres_ffi::BLCKSZ; @@ -1718,6 +1719,20 @@ impl PageServerHandler { .query_metrics .observe_getpage_batch_start(requests.len()); + // If a page trace is running, submit an event for this request. + if let Some(page_trace) = timeline.page_trace.load().as_ref() { + let time = SystemTime::now(); + for batch in &requests { + let key = rel_block_to_key(batch.req.rel, batch.req.blkno).to_compact(); + // Ignore error (trace buffer may be full or tracer may have disconnected). + _ = page_trace.try_send(PageTraceEvent { + key, + effective_lsn, + time, + }); + } + } + let results = timeline .get_rel_page_at_lsn_batched( requests.iter().map(|p| (&p.req.rel, &p.req.blkno)), diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 4aa6b7a05a..d6ae11e67d 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -14,7 +14,7 @@ pub mod uninit; mod walreceiver; use anyhow::{anyhow, bail, ensure, Context, Result}; -use arc_swap::ArcSwap; +use arc_swap::{ArcSwap, ArcSwapOption}; use bytes::Bytes; use camino::Utf8Path; use chrono::{DateTime, Utc}; @@ -23,6 +23,7 @@ use fail::fail_point; use handle::ShardTimelineId; use offload::OffloadError; use once_cell::sync::Lazy; +use pageserver_api::models::PageTraceEvent; use pageserver_api::{ config::tenant_conf_defaults::DEFAULT_COMPACTION_THRESHOLD, key::{ @@ -42,6 +43,7 @@ use rand::Rng; use remote_storage::DownloadError; use serde_with::serde_as; use storage_broker::BrokerClientChannel; +use tokio::sync::mpsc::Sender; use tokio::{ runtime::Handle, sync::{oneshot, watch}, @@ -433,6 +435,9 @@ pub struct Timeline { /// Cf. [`crate::tenant::CreateTimelineIdempotency`]. pub(crate) create_idempotency: crate::tenant::CreateTimelineIdempotency, + + /// If Some, collects GetPage metadata for an ongoing PageTrace. + pub(crate) page_trace: ArcSwapOption>, } pub type TimelineDeleteProgress = Arc>; @@ -2380,6 +2385,8 @@ impl Timeline { attach_wal_lag_cooldown, create_idempotency, + + page_trace: Default::default(), }; result.repartition_threshold = From 55a68b28a27b27810255212994e601050b73917e Mon Sep 17 00:00:00 2001 From: Gleb Novikov Date: Wed, 15 Jan 2025 20:51:09 +0000 Subject: [PATCH 199/583] fast import: restore to neondb (not postgres) database (#10251) ## Problem `postgres` is system database at neon, so we need to do `pg_restore` into `neondb` instead https://github.com/neondatabase/cloud/issues/22100 ## Summary of changes Changed fast_import a little bit: 1. After succesfull connection creating `neondb` in postgres instance 2. Changed restore connstring to use new db 3. 
Added optional `source_connection_string`, which allows to skip `s3_prefix` and just connect directly. 4. Added `-i` that stops process until sigterm ## TODO - [x] test image in cplane e2e - [ ] Change import job image back to latest after this merged (partial revert of https://github.com/neondatabase/cloud/pull/22338) --- compute_tools/src/bin/fast_import.rs | 227 +++++++++++++++++---------- 1 file changed, 147 insertions(+), 80 deletions(-) diff --git a/compute_tools/src/bin/fast_import.rs b/compute_tools/src/bin/fast_import.rs index f554362751..5b008f8182 100644 --- a/compute_tools/src/bin/fast_import.rs +++ b/compute_tools/src/bin/fast_import.rs @@ -31,7 +31,7 @@ use camino::{Utf8Path, Utf8PathBuf}; use clap::Parser; use compute_tools::extension_server::{get_pg_version, PostgresMajorVersion}; use nix::unistd::Pid; -use tracing::{info, info_span, warn, Instrument}; +use tracing::{error, info, info_span, warn, Instrument}; use utils::fs_ext::is_directory_empty; #[path = "fast_import/aws_s3_sync.rs"] @@ -41,12 +41,19 @@ mod child_stdio_to_log; #[path = "fast_import/s3_uri.rs"] mod s3_uri; +const PG_WAIT_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(600); +const PG_WAIT_RETRY_INTERVAL: std::time::Duration = std::time::Duration::from_millis(300); + #[derive(clap::Parser)] struct Args { #[clap(long)] working_directory: Utf8PathBuf, #[clap(long, env = "NEON_IMPORTER_S3_PREFIX")] - s3_prefix: s3_uri::S3Uri, + s3_prefix: Option, + #[clap(long)] + source_connection_string: Option, + #[clap(short, long)] + interactive: bool, #[clap(long)] pg_bin_dir: Utf8PathBuf, #[clap(long)] @@ -77,30 +84,70 @@ pub(crate) async fn main() -> anyhow::Result<()> { info!("starting"); - let Args { - working_directory, - s3_prefix, - pg_bin_dir, - pg_lib_dir, - } = Args::parse(); + let args = Args::parse(); - let aws_config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await; + // Validate arguments + if args.s3_prefix.is_none() && args.source_connection_string.is_none() { + anyhow::bail!("either s3_prefix or source_connection_string must be specified"); + } + if args.s3_prefix.is_some() && args.source_connection_string.is_some() { + anyhow::bail!("only one of s3_prefix or source_connection_string can be specified"); + } - let spec: Spec = { - let spec_key = s3_prefix.append("/spec.json"); - let s3_client = aws_sdk_s3::Client::new(&aws_config); - let object = s3_client - .get_object() - .bucket(&spec_key.bucket) - .key(spec_key.key) - .send() - .await - .context("get spec from s3")? - .body - .collect() - .await - .context("download spec body")?; - serde_json::from_slice(&object.into_bytes()).context("parse spec as json")? + let working_directory = args.working_directory; + let pg_bin_dir = args.pg_bin_dir; + let pg_lib_dir = args.pg_lib_dir; + + // Initialize AWS clients only if s3_prefix is specified + let (aws_config, kms_client) = if args.s3_prefix.is_some() { + let config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await; + let kms = aws_sdk_kms::Client::new(&config); + (Some(config), Some(kms)) + } else { + (None, None) + }; + + // Get source connection string either from S3 spec or direct argument + let source_connection_string = if let Some(s3_prefix) = &args.s3_prefix { + let spec: Spec = { + let spec_key = s3_prefix.append("/spec.json"); + let s3_client = aws_sdk_s3::Client::new(aws_config.as_ref().unwrap()); + let object = s3_client + .get_object() + .bucket(&spec_key.bucket) + .key(spec_key.key) + .send() + .await + .context("get spec from s3")? 
+ .body + .collect() + .await + .context("download spec body")?; + serde_json::from_slice(&object.into_bytes()).context("parse spec as json")? + }; + + match spec.encryption_secret { + EncryptionSecret::KMS { key_id } => { + let mut output = kms_client + .unwrap() + .decrypt() + .key_id(key_id) + .ciphertext_blob(aws_sdk_s3::primitives::Blob::new( + spec.source_connstring_ciphertext_base64, + )) + .send() + .await + .context("decrypt source connection string")?; + let plaintext = output + .plaintext + .take() + .context("get plaintext source connection string")?; + String::from_utf8(plaintext.into_inner()) + .context("parse source connection string as utf8")? + } + } + } else { + args.source_connection_string.unwrap() }; match tokio::fs::create_dir(&working_directory).await { @@ -123,15 +170,6 @@ pub(crate) async fn main() -> anyhow::Result<()> { .await .context("create pgdata directory")?; - // - // Setup clients - // - let aws_config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await; - let kms_client = aws_sdk_kms::Client::new(&aws_config); - - // - // Initialize pgdata - // let pgbin = pg_bin_dir.join("postgres"); let pg_version = match get_pg_version(pgbin.as_ref()) { PostgresMajorVersion::V14 => 14, @@ -170,7 +208,13 @@ pub(crate) async fn main() -> anyhow::Result<()> { .args(["-c", &format!("max_parallel_workers={nproc}")]) .args(["-c", &format!("max_parallel_workers_per_gather={nproc}")]) .args(["-c", &format!("max_worker_processes={nproc}")]) - .args(["-c", "effective_io_concurrency=100"]) + .args([ + "-c", + &format!( + "effective_io_concurrency={}", + if cfg!(target_os = "macos") { 0 } else { 100 } + ), + ]) .env_clear() .stdout(std::process::Stdio::piped()) .stderr(std::process::Stdio::piped()) @@ -185,44 +229,58 @@ pub(crate) async fn main() -> anyhow::Result<()> { ) .instrument(info_span!("postgres")), ); + + // Create neondb database in the running postgres let restore_pg_connstring = format!("host=localhost port=5432 user={superuser} dbname=postgres"); + + let start_time = std::time::Instant::now(); + loop { - let res = tokio_postgres::connect(&restore_pg_connstring, tokio_postgres::NoTls).await; - if res.is_ok() { - info!("postgres is ready, could connect to it"); - break; + if start_time.elapsed() > PG_WAIT_TIMEOUT { + error!( + "timeout exceeded: failed to poll postgres and create database within 10 minutes" + ); + std::process::exit(1); + } + + match tokio_postgres::connect(&restore_pg_connstring, tokio_postgres::NoTls).await { + Ok((client, connection)) => { + // Spawn the connection handling task to maintain the connection + tokio::spawn(async move { + if let Err(e) = connection.await { + warn!("connection error: {}", e); + } + }); + + match client.simple_query("CREATE DATABASE neondb;").await { + Ok(_) => { + info!("created neondb database"); + break; + } + Err(e) => { + warn!( + "failed to create database: {}, retying in {}s", + e, + PG_WAIT_RETRY_INTERVAL.as_secs_f32() + ); + tokio::time::sleep(PG_WAIT_RETRY_INTERVAL).await; + continue; + } + } + } + Err(_) => { + info!( + "postgres not ready yet, retrying in {}s", + PG_WAIT_RETRY_INTERVAL.as_secs_f32() + ); + tokio::time::sleep(PG_WAIT_RETRY_INTERVAL).await; + continue; + } } } - // - // Decrypt connection string - // - let source_connection_string = { - match spec.encryption_secret { - EncryptionSecret::KMS { key_id } => { - let mut output = kms_client - .decrypt() - .key_id(key_id) - .ciphertext_blob(aws_sdk_s3::primitives::Blob::new( - spec.source_connstring_ciphertext_base64, - )) - .send() - 
.await - .context("decrypt source connection string")?; - let plaintext = output - .plaintext - .take() - .context("get plaintext source connection string")?; - String::from_utf8(plaintext.into_inner()) - .context("parse source connection string as utf8")? - } - } - }; - - // - // Start the work - // + let restore_pg_connstring = restore_pg_connstring.replace("dbname=postgres", "dbname=neondb"); let dumpdir = working_directory.join("dumpdir"); @@ -310,6 +368,12 @@ pub(crate) async fn main() -> anyhow::Result<()> { } } + // If interactive mode, wait for Ctrl+C + if args.interactive { + info!("Running in interactive mode. Press Ctrl+C to shut down."); + tokio::signal::ctrl_c().await.context("wait for ctrl-c")?; + } + info!("shutdown postgres"); { nix::sys::signal::kill( @@ -325,21 +389,24 @@ pub(crate) async fn main() -> anyhow::Result<()> { .context("wait for postgres to shut down")?; } - info!("upload pgdata"); - aws_s3_sync::sync(Utf8Path::new(&pgdata_dir), &s3_prefix.append("/pgdata/")) - .await - .context("sync dump directory to destination")?; - - info!("write status"); - { - let status_dir = working_directory.join("status"); - std::fs::create_dir(&status_dir).context("create status directory")?; - let status_file = status_dir.join("pgdata"); - std::fs::write(&status_file, serde_json::json!({"done": true}).to_string()) - .context("write status file")?; - aws_s3_sync::sync(&status_dir, &s3_prefix.append("/status/")) + // Only sync if s3_prefix was specified + if let Some(s3_prefix) = args.s3_prefix { + info!("upload pgdata"); + aws_s3_sync::sync(Utf8Path::new(&pgdata_dir), &s3_prefix.append("/pgdata/")) .await - .context("sync status directory to destination")?; + .context("sync dump directory to destination")?; + + info!("write status"); + { + let status_dir = working_directory.join("status"); + std::fs::create_dir(&status_dir).context("create status directory")?; + let status_file = status_dir.join("pgdata"); + std::fs::write(&status_file, serde_json::json!({"done": true}).to_string()) + .context("write status file")?; + aws_s3_sync::sync(&status_dir, &s3_prefix.append("/status/")) + .await + .context("sync status directory to destination")?; + } } Ok(()) From a753349cb0d8a76b7b5884c216236f6e57357d30 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Wed, 15 Jan 2025 17:04:06 -0500 Subject: [PATCH 200/583] feat(pageserver): validate data integrity during gc-compaction (#10131) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem part of https://github.com/neondatabase/neon/issues/9114 part of investigation of https://github.com/neondatabase/neon/issues/10049 ## Summary of changes * If `cfg!(test) or cfg!(feature = testing)`, then we will always try generating an image to ensure the history is replayable, but not put the image layer into the final layer results, therefore discovering wrong key history before we hit a read error. * I suspect it's easier to trigger some races if gc-compaction is continuously run on a timeline, so I increased the frequency to twice per 10 churns. * Also, create branches in gc-compaction smoke tests to get more test coverage. 
--------- Signed-off-by: Alex Chi Z Co-authored-by: Arpad Müller --- pageserver/src/tenant/timeline/compaction.rs | 77 +++++++++++++++----- test_runner/fixtures/workload.py | 16 ++++ test_runner/regress/test_compaction.py | 20 ++++- 3 files changed, 89 insertions(+), 24 deletions(-) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 05f8d476f9..2042a18e96 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -1776,7 +1776,10 @@ impl Timeline { base_img_from_ancestor: Option<(Key, Lsn, Bytes)>, ) -> anyhow::Result { // Pre-checks for the invariants - if cfg!(debug_assertions) { + + let debug_mode = cfg!(debug_assertions) || cfg!(feature = "testing"); + + if debug_mode { for (log_key, _, _) in full_history { assert_eq!(log_key, &key, "mismatched key"); } @@ -1922,15 +1925,19 @@ impl Timeline { output } + let mut key_exists = false; for (i, split_for_lsn) in split_history.into_iter().enumerate() { // TODO: there could be image keys inside the splits, and we can compute records_since_last_image accordingly. records_since_last_image += split_for_lsn.len(); - let generate_image = if i == 0 && !has_ancestor { + // Whether to produce an image into the final layer files + let produce_image = if i == 0 && !has_ancestor { // We always generate images for the first batch (below horizon / lowest retain_lsn) true } else if i == batch_cnt - 1 { // Do not generate images for the last batch (above horizon) false + } else if records_since_last_image == 0 { + false } else if records_since_last_image >= delta_threshold_cnt { // Generate images when there are too many records true @@ -1945,29 +1952,45 @@ impl Timeline { break; } } - if let Some((_, _, val)) = replay_history.first() { - if !val.will_init() { - return Err(anyhow::anyhow!("invalid history, no base image")).with_context( - || { - generate_debug_trace( - Some(&replay_history), - full_history, - retain_lsn_below_horizon, - horizon, - ) - }, - ); - } + if replay_history.is_empty() && !key_exists { + // The key does not exist at earlier LSN, we can skip this iteration. + retention.push(Vec::new()); + continue; + } else { + key_exists = true; } - if generate_image && records_since_last_image > 0 { + let Some((_, _, val)) = replay_history.first() else { + unreachable!("replay history should not be empty once it exists") + }; + if !val.will_init() { + return Err(anyhow::anyhow!("invalid history, no base image")).with_context(|| { + generate_debug_trace( + Some(&replay_history), + full_history, + retain_lsn_below_horizon, + horizon, + ) + }); + } + // Whether to reconstruct the image. In debug mode, we will generate an image + // at every retain_lsn to ensure data is not corrupted, but we won't put the + // image into the final layer. 
+ let generate_image = produce_image || debug_mode; + if produce_image { records_since_last_image = 0; - let replay_history_for_debug = if cfg!(debug_assertions) { + } + let img_and_lsn = if generate_image { + let replay_history_for_debug = if debug_mode { Some(replay_history.clone()) } else { None }; let replay_history_for_debug_ref = replay_history_for_debug.as_deref(); - let history = std::mem::take(&mut replay_history); + let history = if produce_image { + std::mem::take(&mut replay_history) + } else { + replay_history.clone() + }; let mut img = None; let mut records = Vec::with_capacity(history.len()); if let (_, lsn, Value::Image(val)) = history.first().as_ref().unwrap() { @@ -2004,8 +2027,20 @@ impl Timeline { } records.reverse(); let state = ValueReconstructState { img, records }; - let request_lsn = lsn_split_points[i]; // last batch does not generate image so i is always in range + // last batch does not generate image so i is always in range, unless we force generate + // an image during testing + let request_lsn = if i >= lsn_split_points.len() { + Lsn::MAX + } else { + lsn_split_points[i] + }; let img = self.reconstruct_value(key, request_lsn, state).await?; + Some((request_lsn, img)) + } else { + None + }; + if produce_image { + let (request_lsn, img) = img_and_lsn.unwrap(); replay_history.push((key, request_lsn, Value::Image(img.clone()))); retention.push(vec![(request_lsn, Value::Image(img))]); } else { @@ -2273,6 +2308,8 @@ impl Timeline { let compact_key_range = job.compact_key_range; let compact_lsn_range = job.compact_lsn_range; + let debug_mode = cfg!(debug_assertions) || cfg!(feature = "testing"); + info!("running enhanced gc bottom-most compaction, dry_run={dry_run}, compact_key_range={}..{}, compact_lsn_range={}..{}", compact_key_range.start, compact_key_range.end, compact_lsn_range.start, compact_lsn_range.end); scopeguard::defer! 
{ @@ -2398,7 +2435,7 @@ impl Timeline { .first() .copied() .unwrap_or(job_desc.gc_cutoff); - if cfg!(debug_assertions) { + if debug_mode { assert_eq!( res, job_desc diff --git a/test_runner/fixtures/workload.py b/test_runner/fixtures/workload.py index 1b8c9fef44..eea0ec2b95 100644 --- a/test_runner/fixtures/workload.py +++ b/test_runner/fixtures/workload.py @@ -53,6 +53,22 @@ class Workload: self._endpoint: Endpoint | None = None self._endpoint_opts = endpoint_opts or {} + def branch( + self, + timeline_id: TimelineId, + branch_name: str | None = None, + endpoint_opts: dict[str, Any] | None = None, + ) -> Workload: + """ + Checkpoint the current status of the workload in case of branching + """ + branch_workload = Workload( + self.env, self.tenant_id, timeline_id, branch_name, endpoint_opts + ) + branch_workload.expect_rows = self.expect_rows + branch_workload.churn_cursor = self.churn_cursor + return branch_workload + def reconfigure(self) -> None: """ Request the endpoint to reconfigure based on location reported by storage controller diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index fe0422088a..d0a2349ccf 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -112,7 +112,11 @@ page_cache_size=10 @skip_in_debug_build("only run with release build") -def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder): +@pytest.mark.parametrize( + "with_branches", + ["with_branches", "no_branches"], +) +def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder, with_branches: str): SMOKE_CONF = { # Run both gc and gc-compaction. "gc_period": "5s", @@ -143,12 +147,17 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder): log.info("Writing initial data ...") workload.write_rows(row_count, env.pageserver.id) + child_workloads: list[Workload] = [] + for i in range(1, churn_rounds + 1): if i % 10 == 0: log.info(f"Running churn round {i}/{churn_rounds} ...") - - if (i - 1) % 10 == 0: - # Run gc-compaction every 10 rounds to ensure the test doesn't take too long time. + if i % 10 == 5 and with_branches == "with_branches": + branch_name = f"child-{i}" + branch_timeline_id = env.create_branch(branch_name) + child_workloads.append(workload.branch(branch_timeline_id, branch_name)) + if (i - 1) % 10 == 0 or (i - 1) % 10 == 1: + # Run gc-compaction twice every 10 rounds to ensure the test doesn't take too long time. ps_http.timeline_compact( tenant_id, timeline_id, @@ -179,6 +188,9 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder): log.info("Validating at workload end ...") workload.validate(env.pageserver.id) + for child_workload in child_workloads: + log.info(f"Validating at branch {child_workload.branch_name}") + child_workload.validate(env.pageserver.id) # Run a legacy compaction+gc to ensure gc-compaction can coexist with legacy compaction. 
ps_http.timeline_checkpoint(tenant_id, timeline_id, wait_until_uploaded=True) From c7429af8a0765b516bd3f6a49c8bc6632deffbf7 Mon Sep 17 00:00:00 2001 From: Mikhail Kot Date: Wed, 15 Jan 2025 22:29:18 +0000 Subject: [PATCH 201/583] Enable dblink (#10358) Update compute image to include dblink #3720 --- compute/compute-node.Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 1ee159e5df..f56a8358d2 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -66,6 +66,7 @@ RUN cd postgres && \ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install && \ # Enable some of contrib extensions echo 'trusted = true' >> /usr/local/pgsql/share/extension/autoinc.control && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/dblink.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/bloom.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/earthdistance.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/insert_username.control && \ From 2eda484ef6dac15e0e7a72955e9d194ccfe3326f Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Thu, 16 Jan 2025 03:43:47 +0100 Subject: [PATCH 202/583] prefetch: Read more frequently from TCP buffer (#10394) This reduces pressure on the OS TCP read buffer by increasing the moments we read data out of the receive buffer, and increasing the number of bytes we can pull from that buffer when we do reads. ## Problem A backend may not always consume its prefetch data quickly enough. ## Summary of changes We add a new function `prefetch_pump_state` which pulls as many prefetch responses from the OS TCP receive buffer as possible, but without blocking. This reduces pressure on OS-level TCP buffers and thus increases throughput by limiting throttling caused by full TCP buffers.
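The non-blocking receive added below leans on libpq's asynchronous COPY interface, as seen in the `PQgetCopyData(..., 1 /* async = true */)` call in `pageserver_try_receive`. A minimal sketch of that generic libpq pattern follows; it is not the patch's code (`drain_buffered_copy_data` is a hypothetical name, and the real `pageserver_try_receive`/`prefetch_pump_state` additionally manage per-shard connection state and parse Neon messages on top of it):

```c
/*
 * Minimal sketch (plain libpq, no Neon-specific state) of the non-blocking
 * drain pattern. drain_buffered_copy_data() is a hypothetical helper, not
 * code from this patch.
 */
#include <libpq-fe.h>

static int
drain_buffered_copy_data(PGconn *conn)
{
    int   drained = 0;
    char *buf;

    /* Move any bytes the OS has already delivered into libpq's buffer. */
    if (!PQconsumeInput(conn))
        return -1;                      /* connection-level failure */

    for (;;)
    {
        /* async = 1: return 0 immediately if no complete message is buffered. */
        int rc = PQgetCopyData(conn, &buf, 1);

        if (rc > 0)
        {
            /* One complete CopyData message; a real caller would parse it here. */
            drained++;
            PQfreemem(buf);
        }
        else if (rc == 0)
            break;                      /* nothing more buffered right now */
        else if (rc == -1)
            break;                      /* COPY stream ended */
        else
            return -1;                  /* rc == -2: read error */
    }
    return drained;
}
```

The key property is that `PQgetCopyData` with `async = 1` returns 0 instead of waiting, so the pump can be called opportunistically from hot smgr paths without ever stalling the backend.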
--- pgxn/neon/libpagestore.c | 70 +++++++++++++++++++++++++++++++++++- pgxn/neon/pagestore_client.h | 20 +++++++++++ pgxn/neon/pagestore_smgr.c | 66 ++++++++++++++++++++++++++++++++++ 3 files changed, 155 insertions(+), 1 deletion(-) diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 769befb4e5..4460e3b40c 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -911,7 +911,74 @@ pageserver_receive(shardno_t shard_no) } PG_CATCH(); { - neon_shard_log(shard_no, LOG, "pageserver_receive: disconnect due malformatted response"); + neon_shard_log(shard_no, LOG, "pageserver_receive: disconnect due to failure while parsing response"); + pageserver_disconnect(shard_no); + PG_RE_THROW(); + } + PG_END_TRY(); + + if (message_level_is_interesting(PageStoreTrace)) + { + char *msg = nm_to_string((NeonMessage *) resp); + + neon_shard_log(shard_no, PageStoreTrace, "got response: %s", msg); + pfree(msg); + } + } + else if (rc == -1) + { + neon_shard_log(shard_no, LOG, "pageserver_receive disconnect: psql end of copy data: %s", pchomp(PQerrorMessage(pageserver_conn))); + pageserver_disconnect(shard_no); + resp = NULL; + } + else if (rc == -2) + { + char *msg = pchomp(PQerrorMessage(pageserver_conn)); + + pageserver_disconnect(shard_no); + neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: could not read COPY data: %s", msg); + } + else + { + pageserver_disconnect(shard_no); + neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: unexpected PQgetCopyData return value: %d", rc); + } + + shard->nresponses_received++; + return (NeonResponse *) resp; +} + +static NeonResponse * +pageserver_try_receive(shardno_t shard_no) +{ + StringInfoData resp_buff; + NeonResponse *resp; + PageServer *shard = &page_servers[shard_no]; + PGconn *pageserver_conn = shard->conn; + /* read response */ + int rc; + + if (shard->state != PS_Connected) + return NULL; + + Assert(pageserver_conn); + + rc = PQgetCopyData(shard->conn, &resp_buff.data, 1 /* async = true */); + + if (rc == 0) + return NULL; + else if (rc > 0) + { + PG_TRY(); + { + resp_buff.len = rc; + resp_buff.cursor = 0; + resp = nm_unpack_response(&resp_buff); + PQfreemem(resp_buff.data); + } + PG_CATCH(); + { + neon_shard_log(shard_no, LOG, "pageserver_receive: disconnect due to failure while parsing response"); pageserver_disconnect(shard_no); PG_RE_THROW(); } @@ -980,6 +1047,7 @@ page_server_api api = .send = pageserver_send, .flush = pageserver_flush, .receive = pageserver_receive, + .try_receive = pageserver_try_receive, .disconnect = pageserver_disconnect_shard }; diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 37bc4f7886..b751235595 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -192,9 +192,29 @@ typedef uint16 shardno_t; typedef struct { + /* + * Send this request to the PageServer associated with this shard. + */ bool (*send) (shardno_t shard_no, NeonRequest * request); + /* + * Blocking read for the next response of this shard. + * + * When a CANCEL signal is handled, the connection state will be + * unmodified. + */ NeonResponse *(*receive) (shardno_t shard_no); + /* + * Try get the next response from the TCP buffers, if any. + * Returns NULL when the data is not yet available. + */ + NeonResponse *(*try_receive) (shardno_t shard_no); + /* + * Make sure all requests are sent to PageServer. + */ bool (*flush) (shardno_t shard_no); + /* + * Disconnect from this pageserver shard. 
+ */ void (*disconnect) (shardno_t shard_no); } page_server_api; diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 7a4c0ef487..54cacea984 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -405,6 +405,56 @@ compact_prefetch_buffers(void) return false; } +/* + * If there might be responses still in the TCP buffer, then + * we should try to use those, so as to reduce any TCP backpressure + * on the OS/PS side. + * + * This procedure handles that. + * + * Note that this is only valid as long as the only pipelined + * operations in the TCP buffer are getPage@Lsn requests. + */ +static void +prefetch_pump_state(void) +{ + while (MyPState->ring_receive != MyPState->ring_flush) + { + NeonResponse *response; + PrefetchRequest *slot; + MemoryContext old; + + slot = GetPrfSlot(MyPState->ring_receive); + + old = MemoryContextSwitchTo(MyPState->errctx); + response = page_server->try_receive(slot->shard_no); + MemoryContextSwitchTo(old); + + if (response == NULL) + break; + + /* The slot should still be valid */ + if (slot->status != PRFS_REQUESTED || + slot->response != NULL || + slot->my_ring_index != MyPState->ring_receive) + neon_shard_log(slot->shard_no, ERROR, + "Incorrect prefetch slot state after receive: status=%d response=%p my=%lu receive=%lu", + slot->status, slot->response, + (long) slot->my_ring_index, (long) MyPState->ring_receive); + + /* update prefetch state */ + MyPState->n_responses_buffered += 1; + MyPState->n_requests_inflight -= 1; + MyPState->ring_receive += 1; + MyNeonCounters->getpage_prefetches_buffered = + MyPState->n_responses_buffered; + + /* update slot state */ + slot->status = PRFS_RECEIVED; + slot->response = response; + } +} + void readahead_buffer_resize(int newsize, void *extra) { @@ -2808,6 +2858,8 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, MyPState->ring_last <= ring_index); } + prefetch_pump_state(); + return false; } @@ -2849,6 +2901,8 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) Assert(ring_index < MyPState->ring_unused && MyPState->ring_last <= ring_index); + prefetch_pump_state(); + return false; } #endif /* PG_MAJORVERSION_NUM < 17 */ @@ -2891,6 +2945,8 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum, */ neon_log(SmgrTrace, "writeback noop"); + prefetch_pump_state(); + #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) mdwriteback(reln, forknum, blocknum, nblocks); @@ -3145,6 +3201,8 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1, NULL); neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer); + prefetch_pump_state(); + #ifdef DEBUG_COMPARE_LOCAL if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) { @@ -3282,6 +3340,8 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, neon_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns, buffers, nblocks, read); + prefetch_pump_state(); + #ifdef DEBUG_COMPARE_LOCAL if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) { @@ -3450,6 +3510,8 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer); + prefetch_pump_state(); + #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) #if PG_MAJORVERSION_NUM >= 17 @@ -3503,6 +3565,8 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, 
buffers, nblocks); + prefetch_pump_state(); + #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync); @@ -3792,6 +3856,8 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum) neon_log(SmgrTrace, "[NEON_SMGR] immedsync noop"); + prefetch_pump_state(); + #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) mdimmedsync(reln, forknum); From 6fe4c6798f76aaf103f1f44166970f4677ff6c9d Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 16 Jan 2025 11:01:19 +0300 Subject: [PATCH 203/583] Add START_WAL_PUSH proto_version and allow_timeline_creation options. (#10406) ## Problem As part of https://github.com/neondatabase/neon/issues/8614 we need to pass options to START_WAL_PUSH. ## Summary of changes Add two options. `allow_timeline_creation`, default true, disables implicit timeline creation in the connection from compute. Eventually such creation will be forbidden completely, but as we migrate to configurations we need to support both: current mode and configurations enabled where creation by compute is disabled. `proto_version` specifies compute <-> sk protocol version. We have it currently in the first greeting package also, but I plan to change tag size from u64 to u8, which would make it hard to use. Command is more appropriate place for it anyway. --- safekeeper/src/handler.rs | 107 ++++++++++++++++-- safekeeper/src/receive_wal.rs | 57 +++++++--- safekeeper/src/safekeeper.rs | 11 +- .../tests/walproposer_sim/safekeeper.rs | 6 +- 4 files changed, 152 insertions(+), 29 deletions(-) diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index bb639bfb32..e77eeb4130 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -52,16 +52,70 @@ pub struct SafekeeperPostgresHandler { /// Parsed Postgres command. enum SafekeeperPostgresCommand { - StartWalPush, - StartReplication { start_lsn: Lsn, term: Option }, + StartWalPush { + proto_version: u32, + // Eventually timelines will be always created explicitly by storcon. + // This option allows legacy behaviour for compute to do that until we + // fully migrate. + allow_timeline_creation: bool, + }, + StartReplication { + start_lsn: Lsn, + term: Option, + }, IdentifySystem, TimelineStatus, - JSONCtrl { cmd: AppendLogicalMessage }, + JSONCtrl { + cmd: AppendLogicalMessage, + }, } fn parse_cmd(cmd: &str) -> anyhow::Result { if cmd.starts_with("START_WAL_PUSH") { - Ok(SafekeeperPostgresCommand::StartWalPush) + // Allow additional options in postgres START_REPLICATION style like + // START_WAL_PUSH (proto_version '3', allow_timeline_creation 'false'). + // Parsing here is very naive and breaks in case of commas or + // whitespaces in values, but enough for our purposes. 
+ let re = Regex::new(r"START_WAL_PUSH(\s+?\((.*)\))?").unwrap(); + let caps = re + .captures(cmd) + .context(format!("failed to parse START_WAL_PUSH command {}", cmd))?; + // capture () content + let options = caps.get(2).map(|m| m.as_str()).unwrap_or(""); + // default values + let mut proto_version = 2; + let mut allow_timeline_creation = true; + for kvstr in options.split(",") { + if kvstr.is_empty() { + continue; + } + let mut kvit = kvstr.split_whitespace(); + let key = kvit.next().context(format!( + "failed to parse key in kv {} in command {}", + kvstr, cmd + ))?; + let value = kvit.next().context(format!( + "failed to parse value in kv {} in command {}", + kvstr, cmd + ))?; + let value_trimmed = value.trim_matches('\''); + if key == "proto_version" { + proto_version = value_trimmed.parse::().context(format!( + "failed to parse proto_version value {} in command {}", + value, cmd + ))?; + } + if key == "allow_timeline_creation" { + allow_timeline_creation = value_trimmed.parse::().context(format!( + "failed to parse allow_timeline_creation value {} in command {}", + value, cmd + ))?; + } + } + Ok(SafekeeperPostgresCommand::StartWalPush { + proto_version, + allow_timeline_creation, + }) } else if cmd.starts_with("START_REPLICATION") { let re = Regex::new( // We follow postgres START_REPLICATION LOGICAL options to pass term. @@ -95,7 +149,7 @@ fn parse_cmd(cmd: &str) -> anyhow::Result { fn cmd_to_string(cmd: &SafekeeperPostgresCommand) -> &str { match cmd { - SafekeeperPostgresCommand::StartWalPush => "START_WAL_PUSH", + SafekeeperPostgresCommand::StartWalPush { .. } => "START_WAL_PUSH", SafekeeperPostgresCommand::StartReplication { .. } => "START_REPLICATION", SafekeeperPostgresCommand::TimelineStatus => "TIMELINE_STATUS", SafekeeperPostgresCommand::IdentifySystem => "IDENTIFY_SYSTEM", @@ -293,8 +347,11 @@ impl postgres_backend::Handler self.ttid = TenantTimelineId::new(tenant_id, timeline_id); match cmd { - SafekeeperPostgresCommand::StartWalPush => { - self.handle_start_wal_push(pgb) + SafekeeperPostgresCommand::StartWalPush { + proto_version, + allow_timeline_creation, + } => { + self.handle_start_wal_push(pgb, proto_version, allow_timeline_creation) .instrument(info_span!("WAL receiver")) .await } @@ -467,3 +524,39 @@ impl SafekeeperPostgresHandler { } } } + +#[cfg(test)] +mod tests { + use super::SafekeeperPostgresCommand; + + /// Test parsing of START_WAL_PUSH command + #[test] + fn test_start_wal_push_parse() { + let cmd = "START_WAL_PUSH"; + let parsed = super::parse_cmd(cmd).expect("failed to parse"); + match parsed { + SafekeeperPostgresCommand::StartWalPush { + proto_version, + allow_timeline_creation, + } => { + assert_eq!(proto_version, 2); + assert!(allow_timeline_creation); + } + _ => panic!("unexpected command"), + } + + let cmd = + "START_WAL_PUSH (proto_version '3', allow_timeline_creation 'false', unknown 'hoho')"; + let parsed = super::parse_cmd(cmd).expect("failed to parse"); + match parsed { + SafekeeperPostgresCommand::StartWalPush { + proto_version, + allow_timeline_creation, + } => { + assert_eq!(proto_version, 3); + assert!(!allow_timeline_creation); + } + _ => panic!("unexpected command"), + } + } +} diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index daaa8a253d..cb42f6f414 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -200,9 +200,14 @@ impl SafekeeperPostgresHandler { pub async fn handle_start_wal_push( &mut self, pgb: &mut PostgresBackend, + proto_version: u32, + allow_timeline_creation: bool, 
) -> Result<(), QueryError> { let mut tli: Option = None; - if let Err(end) = self.handle_start_wal_push_guts(pgb, &mut tli).await { + if let Err(end) = self + .handle_start_wal_push_guts(pgb, &mut tli, proto_version, allow_timeline_creation) + .await + { // Log the result and probably send it to the client, closing the stream. let handle_end_fut = pgb.handle_copy_stream_end(end); // If we managed to create the timeline, augment logging with current LSNs etc. @@ -222,6 +227,8 @@ impl SafekeeperPostgresHandler { &mut self, pgb: &mut PostgresBackend, tli: &mut Option, + proto_version: u32, + allow_timeline_creation: bool, ) -> Result<(), CopyStreamHandlerEnd> { // The `tli` parameter is only used for passing _out_ a timeline, one should // not have been passed in. @@ -250,12 +257,17 @@ impl SafekeeperPostgresHandler { conn_id: self.conn_id, pgb_reader: &mut pgb_reader, peer_addr, + proto_version, acceptor_handle: &mut acceptor_handle, global_timelines: self.global_timelines.clone(), }; - // Read first message and create timeline if needed. - let res = network_reader.read_first_message().await; + // Read first message and create timeline if needed and allowed. This + // won't be when timelines will be always created by storcon and + // allow_timeline_creation becomes false. + let res = network_reader + .read_first_message(allow_timeline_creation) + .await; let network_res = if let Ok((timeline, next_msg)) = res { let pageserver_feedback_rx: tokio::sync::broadcast::Receiver = @@ -313,6 +325,7 @@ struct NetworkReader<'a, IO> { conn_id: ConnectionId, pgb_reader: &'a mut PostgresBackendReader, peer_addr: SocketAddr, + proto_version: u32, // WalAcceptor is spawned when we learn server info from walproposer and // create timeline; handle is put here. acceptor_handle: &'a mut Option>>, @@ -322,9 +335,10 @@ struct NetworkReader<'a, IO> { impl NetworkReader<'_, IO> { async fn read_first_message( &mut self, + allow_timeline_creation: bool, ) -> Result<(WalResidentTimeline, ProposerAcceptorMessage), CopyStreamHandlerEnd> { // Receive information about server to create timeline, if not yet. - let next_msg = read_message(self.pgb_reader).await?; + let next_msg = read_message(self.pgb_reader, self.proto_version).await?; let tli = match next_msg { ProposerAcceptorMessage::Greeting(ref greeting) => { info!( @@ -336,17 +350,22 @@ impl NetworkReader<'_, IO> { system_id: greeting.system_id, wal_seg_size: greeting.wal_seg_size, }; - let tli = self - .global_timelines - .create( - self.ttid, - Configuration::empty(), - server_info, - Lsn::INVALID, - Lsn::INVALID, - ) - .await - .context("create timeline")?; + let tli = if allow_timeline_creation { + self.global_timelines + .create( + self.ttid, + Configuration::empty(), + server_info, + Lsn::INVALID, + Lsn::INVALID, + ) + .await + .context("create timeline")? + } else { + self.global_timelines + .get(self.ttid) + .context("get timeline")? + }; tli.wal_residence_guard().await? } _ => { @@ -375,7 +394,7 @@ impl NetworkReader<'_, IO> { )); // Forward all messages to WalAcceptor - read_network_loop(self.pgb_reader, msg_tx, next_msg).await + read_network_loop(self.pgb_reader, msg_tx, next_msg, self.proto_version).await } } @@ -383,9 +402,10 @@ impl NetworkReader<'_, IO> { /// TODO: Return Ok(None) on graceful termination. 
async fn read_message( pgb_reader: &mut PostgresBackendReader, + proto_version: u32, ) -> Result { let copy_data = pgb_reader.read_copy_message().await?; - let msg = ProposerAcceptorMessage::parse(copy_data)?; + let msg = ProposerAcceptorMessage::parse(copy_data, proto_version)?; Ok(msg) } @@ -393,6 +413,7 @@ async fn read_network_loop( pgb_reader: &mut PostgresBackendReader, msg_tx: Sender, mut next_msg: ProposerAcceptorMessage, + proto_version: u32, ) -> Result<(), CopyStreamHandlerEnd> { /// Threshold for logging slow WalAcceptor sends. const SLOW_THRESHOLD: Duration = Duration::from_secs(5); @@ -425,7 +446,7 @@ async fn read_network_loop( WAL_RECEIVER_QUEUE_DEPTH_TOTAL.inc(); WAL_RECEIVER_QUEUE_SIZE_TOTAL.add(size as i64); - next_msg = read_message(pgb_reader).await?; + next_msg = read_message(pgb_reader, proto_version).await?; } } diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 06403228e9..45e19c31b6 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -29,7 +29,7 @@ use utils::{ lsn::Lsn, }; -const SK_PROTOCOL_VERSION: u32 = 2; +pub const SK_PROTOCOL_VERSION: u32 = 2; pub const UNKNOWN_SERVER_VERSION: u32 = 0; #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)] @@ -317,7 +317,14 @@ pub enum ProposerAcceptorMessage { impl ProposerAcceptorMessage { /// Parse proposer message. - pub fn parse(msg_bytes: Bytes) -> Result { + pub fn parse(msg_bytes: Bytes, proto_version: u32) -> Result { + if proto_version != SK_PROTOCOL_VERSION { + bail!( + "incompatible protocol version {}, expected {}", + proto_version, + SK_PROTOCOL_VERSION + ); + } // xxx using Reader is inefficient but easy to work with bincode let mut stream = msg_bytes.reader(); // u64 is here to avoid padding; it will be removed once we stop packing C structs into the wire as is diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs index e0d593851e..0023a4d22a 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper.rs @@ -15,7 +15,9 @@ use desim::{ }; use http::Uri; use safekeeper::{ - safekeeper::{ProposerAcceptorMessage, SafeKeeper, UNKNOWN_SERVER_VERSION}, + safekeeper::{ + ProposerAcceptorMessage, SafeKeeper, SK_PROTOCOL_VERSION, UNKNOWN_SERVER_VERSION, + }, state::{TimelinePersistentState, TimelineState}, timeline::TimelineError, wal_storage::Storage, @@ -285,7 +287,7 @@ impl ConnState { bail!("finished processing START_REPLICATION") } - let msg = ProposerAcceptorMessage::parse(copy_data)?; + let msg = ProposerAcceptorMessage::parse(copy_data, SK_PROTOCOL_VERSION)?; debug!("got msg: {:?}", msg); self.process(msg, global) } else { From 7be971081aee245b0b78ff22766d8eeb020b66f2 Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Thu, 16 Jan 2025 09:34:11 +0100 Subject: [PATCH 204/583] Make sure we request pages with a known-flushed LSN. (#10413) This should fix the largest source of flakyness of test_nbtree_pagesplit_cycleid. ## Problem https://github.com/neondatabase/neon/issues/10390 ## Summary of changes By using a guaranteed-flushed LSN, we ensure that PS won't have to wait forever. 
(If it does wait forever, we know the issue can't be with Compute's WAL) --- .../regress/test_nbtree_pagesplit_cycleid.py | 20 ++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/test_runner/regress/test_nbtree_pagesplit_cycleid.py b/test_runner/regress/test_nbtree_pagesplit_cycleid.py index 558557aeba..32ec6fcb92 100644 --- a/test_runner/regress/test_nbtree_pagesplit_cycleid.py +++ b/test_runner/regress/test_nbtree_pagesplit_cycleid.py @@ -4,9 +4,19 @@ import time from fixtures.neon_fixtures import NeonEnv BTREE_NUM_CYCLEID_PAGES = """ - WITH raw_pages AS ( - SELECT blkno, get_raw_page_at_lsn('t_uidx', 'main', blkno, NULL, NULL) page - FROM generate_series(1, pg_relation_size('t_uidx'::regclass) / 8192) blkno + WITH lsns AS ( + /* + * pg_switch_wal() ensures we have an LSN that + * 1. is after any previous modifications, but also, + * 2. (critically) is flushed, preventing any issues with waiting for + * unflushed WAL in PageServer. + */ + SELECT pg_switch_wal() as lsn + ), + raw_pages AS ( + SELECT blkno, get_raw_page_at_lsn('t_uidx', 'main', blkno, lsn, lsn) page + FROM generate_series(1, pg_relation_size('t_uidx'::regclass) / 8192) AS blkno, + lsns l(lsn) ), parsed_pages AS ( /* cycle ID is the last 2 bytes of the btree page */ @@ -36,7 +46,6 @@ def test_nbtree_pagesplit_cycleid(neon_simple_env: NeonEnv): ses1.execute("CREATE UNIQUE INDEX t_uidx ON t(id);") ses1.execute("INSERT INTO t (txt) SELECT i::text FROM generate_series(1, 2035) i;") - ses1.execute("SELECT neon_xlogflush();") ses1.execute(BTREE_NUM_CYCLEID_PAGES) pages = ses1.fetchall() assert ( @@ -57,7 +66,6 @@ def test_nbtree_pagesplit_cycleid(neon_simple_env: NeonEnv): ses1.execute("DELETE FROM t WHERE id <= 610;") # Flush wal, for checking purposes - ses1.execute("SELECT neon_xlogflush();") ses1.execute(BTREE_NUM_CYCLEID_PAGES) pages = ses1.fetchall() assert len(pages) == 0, f"No back splits with cycle ID expected, got batches of {pages} instead" @@ -108,8 +116,6 @@ def test_nbtree_pagesplit_cycleid(neon_simple_env: NeonEnv): # unpin the btree page, allowing s3's vacuum to complete ses2.execute("FETCH ALL FROM foo;") ses2.execute("ROLLBACK;") - # flush WAL to make sure PS is up-to-date - ses1.execute("SELECT neon_xlogflush();") # check that our expectations are correct ses1.execute(BTREE_NUM_CYCLEID_PAGES) pages = ses1.fetchall() From 58f6af6c9ad5bbc6238999769c801cdc52495bac Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Thu, 16 Jan 2025 02:35:36 -0600 Subject: [PATCH 205/583] Clean up compute_ctl extension server code (#10417) --- compute_tools/src/http/routes/extension_server.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/compute_tools/src/http/routes/extension_server.rs b/compute_tools/src/http/routes/extension_server.rs index ee5bc675ba..5cc9b6d277 100644 --- a/compute_tools/src/http/routes/extension_server.rs +++ b/compute_tools/src/http/routes/extension_server.rs @@ -17,7 +17,8 @@ use crate::{ #[derive(Debug, Clone, Deserialize)] pub(in crate::http) struct ExtensionServerParams { - is_library: Option, + #[serde(default)] + is_library: bool, } /// Download a remote extension. 
@@ -51,7 +52,7 @@ pub(in crate::http) async fn download_extension( remote_extensions.get_ext( &filename, - params.is_library.unwrap_or(false), + params.is_library, &compute.build_tag, &compute.pgversion, ) From 86dbc44db12d1ebb5b9532a270fcfc68caffd8d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?JC=20Gr=C3=BCnhage?= Date: Thu, 16 Jan 2025 10:20:24 +0100 Subject: [PATCH 206/583] CI: Run check-codestyle-rust as part of pre-merge-checks (#10387) ## Problem When multiple changes are grouped in a merge group to be merged as part of the merge queue, the changes might individually pass `check-codestyle-rust` but not in their combined form. ## Summary of changes - Move `check-codestyle-rust` into a reusable workflow that is called from it's previous location in `build_and_test.yml`, and additionally call it from `pre_merge_checks.yml`. The additional call does not run on ARM, only x86, to ensure the merge queue continues being responsive. - Trigger `pre_merge_checks.yml` on PRs that change any of the workflows running in `pre_merge_checks.yml`, so that we get feedback on those early an not only after trying to merge those changes. --- .github/workflows/_check-codestyle-rust.yml | 91 +++++++++++++++++++++ .github/workflows/build_and_test.yml | 76 ++--------------- .github/workflows/pre-merge-checks.yml | 33 ++++++++ 3 files changed, 129 insertions(+), 71 deletions(-) create mode 100644 .github/workflows/_check-codestyle-rust.yml diff --git a/.github/workflows/_check-codestyle-rust.yml b/.github/workflows/_check-codestyle-rust.yml new file mode 100644 index 0000000000..cbc47c6406 --- /dev/null +++ b/.github/workflows/_check-codestyle-rust.yml @@ -0,0 +1,91 @@ +name: Check Codestyle Rust + +on: + workflow_call: + inputs: + build-tools-image: + description: "build-tools image" + required: true + type: string + archs: + description: "Json array of architectures to run on" + type: string + + +defaults: + run: + shell: bash -euxo pipefail {0} + +jobs: + check-codestyle-rust: + strategy: + matrix: + arch: ${{ fromJson(inputs.archs) }} + runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }} + + container: + image: ${{ inputs.build-tools-image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + options: --init + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + submodules: true + + - name: Cache cargo deps + uses: actions/cache@v4 + with: + path: | + ~/.cargo/registry + !~/.cargo/registry/src + ~/.cargo/git + target + key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust + + # Some of our rust modules use FFI and need those to be checked + - name: Get postgres headers + run: make postgres-headers -j$(nproc) + + # cargo hack runs the given cargo subcommand (clippy in this case) for all feature combinations. + # This will catch compiler & clippy warnings in all feature combinations. + # TODO: use cargo hack for build and test as well, but, that's quite expensive. + # NB: keep clippy args in sync with ./run_clippy.sh + # + # The only difference between "clippy --debug" and "clippy --release" is that in --release mode, + # #[cfg(debug_assertions)] blocks are not built. It's not worth building everything for second + # time just for that, so skip "clippy --release". 
+ - run: | + CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")" + if [ "$CLIPPY_COMMON_ARGS" = "" ]; then + echo "No clippy args found in .neon_clippy_args" + exit 1 + fi + echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV + - name: Run cargo clippy (debug) + run: cargo hack --features default --ignore-unknown-features --feature-powerset clippy $CLIPPY_COMMON_ARGS + + - name: Check documentation generation + run: cargo doc --workspace --no-deps --document-private-items + env: + RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links" + + # Use `${{ !cancelled() }}` to run quck tests after the longer clippy run + - name: Check formatting + if: ${{ !cancelled() }} + run: cargo fmt --all -- --check + + # https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci + - name: Check rust dependencies + if: ${{ !cancelled() }} + run: | + cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date + cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack + + # https://github.com/EmbarkStudios/cargo-deny + - name: Check rust licenses/bans/advisories/sources + if: ${{ !cancelled() }} + run: cargo deny check --hide-inclusion-graph diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 489a93f46d..9ec5273af7 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -164,77 +164,11 @@ jobs: check-codestyle-rust: needs: [ check-permissions, build-build-tools-image ] - strategy: - matrix: - arch: [ x64, arm64 ] - runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }} - - container: - image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm - credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - options: --init - - steps: - - name: Checkout - uses: actions/checkout@v4 - with: - submodules: true - - - name: Cache cargo deps - uses: actions/cache@v4 - with: - path: | - ~/.cargo/registry - !~/.cargo/registry/src - ~/.cargo/git - target - key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust - - # Some of our rust modules use FFI and need those to be checked - - name: Get postgres headers - run: make postgres-headers -j$(nproc) - - # cargo hack runs the given cargo subcommand (clippy in this case) for all feature combinations. - # This will catch compiler & clippy warnings in all feature combinations. - # TODO: use cargo hack for build and test as well, but, that's quite expensive. - # NB: keep clippy args in sync with ./run_clippy.sh - # - # The only difference between "clippy --debug" and "clippy --release" is that in --release mode, - # #[cfg(debug_assertions)] blocks are not built. It's not worth building everything for second - # time just for that, so skip "clippy --release". 
- - run: | - CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")" - if [ "$CLIPPY_COMMON_ARGS" = "" ]; then - echo "No clippy args found in .neon_clippy_args" - exit 1 - fi - echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV - - name: Run cargo clippy (debug) - run: cargo hack --features default --ignore-unknown-features --feature-powerset clippy $CLIPPY_COMMON_ARGS - - - name: Check documentation generation - run: cargo doc --workspace --no-deps --document-private-items - env: - RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links" - - # Use `${{ !cancelled() }}` to run quck tests after the longer clippy run - - name: Check formatting - if: ${{ !cancelled() }} - run: cargo fmt --all -- --check - - # https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci - - name: Check rust dependencies - if: ${{ !cancelled() }} - run: | - cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date - cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack - - # https://github.com/EmbarkStudios/cargo-deny - - name: Check rust licenses/bans/advisories/sources - if: ${{ !cancelled() }} - run: cargo deny check --hide-inclusion-graph + uses: ./.github/workflows/_check-codestyle-rust.yml + with: + build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm + archs: '["x64", "arm64"]' + secrets: inherit build-and-test-locally: needs: [ tag, build-build-tools-image ] diff --git a/.github/workflows/pre-merge-checks.yml b/.github/workflows/pre-merge-checks.yml index b2e00d94f7..e6dfbaeed8 100644 --- a/.github/workflows/pre-merge-checks.yml +++ b/.github/workflows/pre-merge-checks.yml @@ -1,6 +1,12 @@ name: Pre-merge checks on: + pull_request: + paths: + - .github/workflows/_check-codestyle-python.yml + - .github/workflows/_check-codestyle-rust.yml + - .github/workflows/build-build-tools-image.yml + - .github/workflows/pre-merge-checks.yml merge_group: branches: - main @@ -17,8 +23,10 @@ jobs: runs-on: ubuntu-22.04 outputs: python-changed: ${{ steps.python-src.outputs.any_changed }} + rust-changed: ${{ steps.rust-src.outputs.any_changed }} steps: - uses: actions/checkout@v4 + - uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf # v45.0.4 id: python-src with: @@ -30,11 +38,25 @@ jobs: poetry.lock pyproject.toml + - uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf # v45.0.4 + id: rust-src + with: + files: | + .github/workflows/_check-codestyle-rust.yml + .github/workflows/build-build-tools-image.yml + .github/workflows/pre-merge-checks.yml + **/**.rs + **/Cargo.toml + Cargo.toml + Cargo.lock + - name: PRINT ALL CHANGED FILES FOR DEBUG PURPOSES env: PYTHON_CHANGED_FILES: ${{ steps.python-src.outputs.all_changed_files }} + RUST_CHANGED_FILES: ${{ steps.rust-src.outputs.all_changed_files }} run: | echo "${PYTHON_CHANGED_FILES}" + echo "${RUST_CHANGED_FILES}" build-build-tools-image: if: needs.get-changed-files.outputs.python-changed == 'true' @@ -55,6 +77,16 @@ jobs: build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm-x64 secrets: inherit + check-codestyle-rust: + if: needs.get-changed-files.outputs.rust-changed == 'true' + needs: [ get-changed-files, build-build-tools-image ] + uses: ./.github/workflows/_check-codestyle-rust.yml + with: + # `-bookworm-x64` suffix should match the combination in `build-build-tools-image` + build-tools-image: ${{ 
needs.build-build-tools-image.outputs.image }}-bookworm-x64 + archs: '["x64"]' + secrets: inherit + # To get items from the merge queue merged into main we need to satisfy "Status checks that are required". # Currently we require 2 jobs (checks with exact name): # - conclusion @@ -67,6 +99,7 @@ jobs: needs: - get-changed-files - check-codestyle-python + - check-codestyle-rust runs-on: ubuntu-22.04 steps: - name: Create fake `neon-cloud-e2e` check From 21d7b6a258e16542cf025790072850e80e40fdff Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 16 Jan 2025 14:11:33 +0000 Subject: [PATCH 207/583] tests: refactor test_tenant_delete_races_timeline_creation (#10425) ## Problem Threads spawned in `test_tenant_delete_races_timeline_creation` are not joined before the test ends, and can generate `PytestUnhandledThreadExceptionWarning` in other tests. https://neon-github-public-dev.s3.amazonaws.com/reports/pr-10419/12805365523/index.html#/testresult/53a72568acd04dbd ## Summary of changes - Wrap threads in ThreadPoolExecutor which will join them before the test ends - Remove a spurious deletion call -- the background thread doing deletion ought to succeed. --- test_runner/regress/test_tenant_delete.py | 71 ++++++++++++----------- 1 file changed, 38 insertions(+), 33 deletions(-) diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index 48e55c1ab1..3720f653c5 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -1,6 +1,7 @@ from __future__ import annotations import json +from concurrent.futures import ThreadPoolExecutor from threading import Thread import pytest @@ -253,29 +254,8 @@ def test_tenant_delete_races_timeline_creation(neon_env_builder: NeonEnvBuilder) ps_http.configure_failpoints((BEFORE_INITDB_UPLOAD_FAILPOINT, "pause")) def timeline_create(): - try: - ps_http.timeline_create(env.pg_version, tenant_id, TimelineId.generate(), timeout=1) - raise RuntimeError("creation succeeded even though it shouldn't") - except ReadTimeout: - pass - - Thread(target=timeline_create).start() - - def hit_initdb_upload_failpoint(): - env.pageserver.assert_log_contains(f"at failpoint {BEFORE_INITDB_UPLOAD_FAILPOINT}") - - wait_until(hit_initdb_upload_failpoint) - - def creation_connection_timed_out(): - env.pageserver.assert_log_contains( - "POST.*/timeline.* request was dropped before completing" - ) - - # Wait so that we hit the timeout and the connection is dropped - # (But timeline creation still continues) - wait_until(creation_connection_timed_out) - - ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "pause")) + ps_http.timeline_create(env.pg_version, tenant_id, TimelineId.generate(), timeout=1) + raise RuntimeError("creation succeeded even though it shouldn't") def tenant_delete(): def tenant_delete_inner(): @@ -283,21 +263,46 @@ def test_tenant_delete_races_timeline_creation(neon_env_builder: NeonEnvBuilder) wait_until(tenant_delete_inner) - Thread(target=tenant_delete).start() + # We will spawn background threads for timeline creation and tenant deletion. They will both + # get blocked on our failpoint. 
+ with ThreadPoolExecutor(max_workers=1) as executor: + create_fut = executor.submit(timeline_create) - def deletion_arrived(): - env.pageserver.assert_log_contains( - f"cfg failpoint: {DELETE_BEFORE_CLEANUP_FAILPOINT} pause" - ) + def hit_initdb_upload_failpoint(): + env.pageserver.assert_log_contains(f"at failpoint {BEFORE_INITDB_UPLOAD_FAILPOINT}") - wait_until(deletion_arrived) + wait_until(hit_initdb_upload_failpoint) - ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "off")) + def creation_connection_timed_out(): + env.pageserver.assert_log_contains( + "POST.*/timeline.* request was dropped before completing" + ) - # Disable the failpoint and wait for deletion to finish - ps_http.configure_failpoints((BEFORE_INITDB_UPLOAD_FAILPOINT, "off")) + # Wait so that we hit the timeout and the connection is dropped + # (But timeline creation still continues) + wait_until(creation_connection_timed_out) - ps_http.tenant_delete(tenant_id) + with pytest.raises(ReadTimeout): + # Our creation failed from the client's point of view. + create_fut.result() + + ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "pause")) + + delete_fut = executor.submit(tenant_delete) + + def deletion_arrived(): + env.pageserver.assert_log_contains( + f"cfg failpoint: {DELETE_BEFORE_CLEANUP_FAILPOINT} pause" + ) + + wait_until(deletion_arrived) + + ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "off")) + + # Disable the failpoint and wait for deletion to finish + ps_http.configure_failpoints((BEFORE_INITDB_UPLOAD_FAILPOINT, "off")) + + delete_fut.result() # Physical deletion should have happened assert_prefix_empty( From e436dcad57d22f6d5fcb84ff2118b5f2ec96656f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 16 Jan 2025 15:30:49 +0100 Subject: [PATCH 208/583] Rename "disabled" safekeeper scheduling policy to "pause" (#10410) Rename the safekeeper scheduling policy "disabled" to "pause". A rename was requested in https://github.com/neondatabase/neon/pull/10400#discussion_r1916259124, as the "disabled" policy is meant to be analogous to the "pause" policy for pageservers. Also simplify the `SkSchedulingPolicyArg::from_str` function, relying on the `from_str` implementation of `SkSchedulingPolicy`. Latter is used for the database format as well, so it is quite stable. If we ever want to change the UI, we'll need to duplicate the function again but this is cheap. 
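For illustration only, a minimal sketch of the delegation and of the round-trip property it relies on. The enum below is a simplified mirror of the one changed in the diff further down (serde derives and the real error types are omitted); names and signatures here are assumptions for the sketch, not the actual crate API.

```rust
use std::str::FromStr;

// Simplified mirror of the scheduling-policy enum in the diff below.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum SkSchedulingPolicy {
    Active,
    Pause,
    Decomissioned,
}

impl FromStr for SkSchedulingPolicy {
    type Err = String;
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        Ok(match s {
            "active" => Self::Active,
            "pause" => Self::Pause,
            "decomissioned" => Self::Decomissioned,
            _ => return Err(format!("Unknown scheduling policy '{s}'")),
        })
    }
}

impl From<SkSchedulingPolicy> for String {
    fn from(value: SkSchedulingPolicy) -> String {
        match value {
            SkSchedulingPolicy::Active => "active",
            SkSchedulingPolicy::Pause => "pause",
            SkSchedulingPolicy::Decomissioned => "decomissioned",
        }
        .to_string()
    }
}

// The CLI newtype just delegates, so the CLI accepts exactly the strings
// that are persisted in the database -- and vice versa.
struct SkSchedulingPolicyArg(SkSchedulingPolicy);

impl FromStr for SkSchedulingPolicyArg {
    type Err = String;
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        SkSchedulingPolicy::from_str(s).map(Self)
    }
}

fn main() {
    // Round-trip property: the stored string form parses back to the same policy.
    for p in [
        SkSchedulingPolicy::Active,
        SkSchedulingPolicy::Pause,
        SkSchedulingPolicy::Decomissioned,
    ] {
        assert_eq!(SkSchedulingPolicy::from_str(&String::from(p)).unwrap(), p);
        assert!(SkSchedulingPolicyArg::from_str(&String::from(p)).is_ok());
    }
}
```

The point of delegating is that the string forms live in one place, so the CLI cannot drift from what the database stores.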
--- control_plane/storcon_cli/src/main.rs | 9 +-------- libs/pageserver_api/src/controller_api.rs | 12 ++++++++---- .../down.sql | 2 ++ .../up.sql | 2 ++ test_runner/regress/test_storage_controller.py | 2 +- 5 files changed, 14 insertions(+), 13 deletions(-) create mode 100644 storage_controller/migrations/2025-01-15-181207_safekeepers_disabled_to_pause/down.sql create mode 100644 storage_controller/migrations/2025-01-15-181207_safekeepers_disabled_to_pause/up.sql diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 96bfad4c86..d9b76b9600 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -298,14 +298,7 @@ impl FromStr for SkSchedulingPolicyArg { type Err = anyhow::Error; fn from_str(s: &str) -> Result { - match s { - "active" => Ok(Self(SkSchedulingPolicy::Active)), - "disabled" => Ok(Self(SkSchedulingPolicy::Disabled)), - "decomissioned" => Ok(Self(SkSchedulingPolicy::Decomissioned)), - _ => Err(anyhow::anyhow!( - "Unknown scheduling policy '{s}', try active,disabled,decomissioned" - )), - } + SkSchedulingPolicy::from_str(s).map(Self) } } diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index 08d1fa55b9..78e080981a 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -324,7 +324,7 @@ impl From for String { #[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)] pub enum SkSchedulingPolicy { Active, - Disabled, + Pause, Decomissioned, } @@ -334,9 +334,13 @@ impl FromStr for SkSchedulingPolicy { fn from_str(s: &str) -> Result { Ok(match s { "active" => Self::Active, - "disabled" => Self::Disabled, + "pause" => Self::Pause, "decomissioned" => Self::Decomissioned, - _ => return Err(anyhow::anyhow!("Unknown scheduling state '{s}'")), + _ => { + return Err(anyhow::anyhow!( + "Unknown scheduling policy '{s}', try active,pause,decomissioned" + )) + } }) } } @@ -346,7 +350,7 @@ impl From for String { use SkSchedulingPolicy::*; match value { Active => "active", - Disabled => "disabled", + Pause => "pause", Decomissioned => "decomissioned", } .to_string() diff --git a/storage_controller/migrations/2025-01-15-181207_safekeepers_disabled_to_pause/down.sql b/storage_controller/migrations/2025-01-15-181207_safekeepers_disabled_to_pause/down.sql new file mode 100644 index 0000000000..3c7126e343 --- /dev/null +++ b/storage_controller/migrations/2025-01-15-181207_safekeepers_disabled_to_pause/down.sql @@ -0,0 +1,2 @@ +ALTER TABLE safekeepers ALTER COLUMN scheduling_policy SET DEFAULT 'disabled'; +UPDATE safekeepers SET scheduling_policy = 'disabled' WHERE scheduling_policy = 'pause'; diff --git a/storage_controller/migrations/2025-01-15-181207_safekeepers_disabled_to_pause/up.sql b/storage_controller/migrations/2025-01-15-181207_safekeepers_disabled_to_pause/up.sql new file mode 100644 index 0000000000..9ff75444f3 --- /dev/null +++ b/storage_controller/migrations/2025-01-15-181207_safekeepers_disabled_to_pause/up.sql @@ -0,0 +1,2 @@ +ALTER TABLE safekeepers ALTER COLUMN scheduling_policy SET DEFAULT 'pause'; +UPDATE safekeepers SET scheduling_policy = 'pause' WHERE scheduling_policy = 'disabled'; diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index b1e1fd81d6..ff479e8fe2 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -3211,7 +3211,7 @@ def 
test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder): # some small tests for the scheduling policy querying and returning APIs newest_info = target.get_safekeeper(inserted["id"]) assert newest_info - assert newest_info["scheduling_policy"] == "Disabled" + assert newest_info["scheduling_policy"] == "Pause" target.safekeeper_scheduling_policy(inserted["id"], "Decomissioned") newest_info = target.get_safekeeper(inserted["id"]) assert newest_info From cccc1968487c1f29c6ed190a538178b7c79410ce Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Thu, 16 Jan 2025 10:33:37 -0500 Subject: [PATCH 209/583] refactor(pageserver): make partitioning an ArcSwap (#10377) ## Problem gc-compaction needs the partitioning data to decide the job split. This refactor allows concurrent access/computing the partitioning. ## Summary of changes Make `partitioning` an ArcSwap so that others can access the partitioning while we compute it. Fully eliminate the `repartition is called concurrently` warning when gc-compaction is going on. --------- Signed-off-by: Alex Chi Z --- libs/utils/src/guard_arc_swap.rs | 54 ++++++++++++++++++++ libs/utils/src/lib.rs | 2 + pageserver/src/tenant/timeline.rs | 26 +++++----- pageserver/src/tenant/timeline/compaction.rs | 7 +-- 4 files changed, 70 insertions(+), 19 deletions(-) create mode 100644 libs/utils/src/guard_arc_swap.rs diff --git a/libs/utils/src/guard_arc_swap.rs b/libs/utils/src/guard_arc_swap.rs new file mode 100644 index 0000000000..cec5202460 --- /dev/null +++ b/libs/utils/src/guard_arc_swap.rs @@ -0,0 +1,54 @@ +//! A wrapper around `ArcSwap` that ensures there is only one writer at a time and writes +//! don't block reads. + +use arc_swap::ArcSwap; +use std::sync::Arc; +use tokio::sync::TryLockError; + +pub struct GuardArcSwap { + inner: ArcSwap, + guard: tokio::sync::Mutex<()>, +} + +pub struct Guard<'a, T> { + _guard: tokio::sync::MutexGuard<'a, ()>, + inner: &'a ArcSwap, +} + +impl GuardArcSwap { + pub fn new(inner: T) -> Self { + Self { + inner: ArcSwap::new(Arc::new(inner)), + guard: tokio::sync::Mutex::new(()), + } + } + + pub fn read(&self) -> Arc { + self.inner.load_full() + } + + pub async fn write_guard(&self) -> Guard<'_, T> { + Guard { + _guard: self.guard.lock().await, + inner: &self.inner, + } + } + + pub fn try_write_guard(&self) -> Result, TryLockError> { + let guard = self.guard.try_lock()?; + Ok(Guard { + _guard: guard, + inner: &self.inner, + }) + } +} + +impl Guard<'_, T> { + pub fn read(&self) -> Arc { + self.inner.load_full() + } + + pub fn write(&mut self, value: T) { + self.inner.store(Arc::new(value)); + } +} diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 2c56dd750f..1fb18e9e9a 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -98,6 +98,8 @@ pub mod try_rcu; pub mod pprof; +pub mod guard_arc_swap; + // Re-export used in macro. Avoids adding git-version as dep in target crates. 
#[doc(hidden)] pub use git_version; diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index d6ae11e67d..f24611e1d8 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -51,7 +51,9 @@ use tokio::{ use tokio_util::sync::CancellationToken; use tracing::*; use utils::{ - fs_ext, pausable_failpoint, + fs_ext, + guard_arc_swap::GuardArcSwap, + pausable_failpoint, postgres_client::PostgresClientProtocol, sync::gate::{Gate, GateGuard}, }; @@ -353,8 +355,8 @@ pub struct Timeline { // though let's keep them both for better error visibility. pub initdb_lsn: Lsn, - /// When did we last calculate the partitioning? Make it pub to test cases. - pub(super) partitioning: tokio::sync::Mutex<((KeyPartitioning, SparseKeyPartitioning), Lsn)>, + /// The repartitioning result. Allows a single writer and multiple readers. + pub(crate) partitioning: GuardArcSwap<((KeyPartitioning, SparseKeyPartitioning), Lsn)>, /// Configuration: how often should the partitioning be recalculated. repartition_threshold: u64, @@ -2340,7 +2342,8 @@ impl Timeline { // initial logical size is 0. LogicalSize::empty_initial() }, - partitioning: tokio::sync::Mutex::new(( + + partitioning: GuardArcSwap::new(( (KeyPartitioning::new(), KeyPartitioning::new().into_sparse()), Lsn(0), )), @@ -4028,18 +4031,15 @@ impl Timeline { flags: EnumSet, ctx: &RequestContext, ) -> Result<((KeyPartitioning, SparseKeyPartitioning), Lsn), CompactionError> { - let Ok(mut partitioning_guard) = self.partitioning.try_lock() else { + let Ok(mut guard) = self.partitioning.try_write_guard() else { // NB: there are two callers, one is the compaction task, of which there is only one per struct Tenant and hence Timeline. // The other is the initdb optimization in flush_frozen_layer, used by `boostrap_timeline`, which runs before `.activate()` // and hence before the compaction task starts. - // Note that there are a third "caller" that will take the `partitioning` lock. It is `gc_compaction_split_jobs` for - // gc-compaction where it uses the repartition data to determine the split jobs. In the future, it might use its own - // heuristics, but for now, we should allow concurrent access to it and let the caller retry compaction. return Err(CompactionError::Other(anyhow!( - "repartition() called concurrently, this is rare and a retry should be fine" + "repartition() called concurrently" ))); }; - let ((dense_partition, sparse_partition), partition_lsn) = &*partitioning_guard; + let ((dense_partition, sparse_partition), partition_lsn) = &*guard.read(); if lsn < *partition_lsn { return Err(CompactionError::Other(anyhow!( "repartition() called with LSN going backwards, this should not happen" @@ -4067,9 +4067,9 @@ impl Timeline { let sparse_partitioning = SparseKeyPartitioning { parts: vec![sparse_ks], }; // no partitioning for metadata keys for now - *partitioning_guard = ((dense_partitioning, sparse_partitioning), lsn); - - Ok((partitioning_guard.0.clone(), partitioning_guard.1)) + let result = ((dense_partitioning, sparse_partitioning), lsn); + guard.write(result.clone()); + Ok(result) } // Is it time to create a new image layer for the given partition? 
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 2042a18e96..06a21f6b3c 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -2146,12 +2146,7 @@ impl Timeline { let mut compact_jobs = Vec::new(); // For now, we simply use the key partitioning information; we should do a more fine-grained partitioning // by estimating the amount of files read for a compaction job. We should also partition on LSN. - let ((dense_ks, sparse_ks), _) = { - let Ok(partition) = self.partitioning.try_lock() else { - bail!("failed to acquire partition lock during gc-compaction"); - }; - partition.clone() - }; + let ((dense_ks, sparse_ks), _) = self.partitioning.read().as_ref().clone(); // Truncate the key range to be within user specified compaction range. fn truncate_to( source_start: &Key, From 2e13a3aa7ab117870b0b46525f65aa5cff0c3e2c Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 16 Jan 2025 16:56:44 +0000 Subject: [PATCH 210/583] storage controller: handle legacy TenantConf in consistency_check (#10422) ## Problem We were comparing serialized configs from the database with serialized configs from memory. If fields have been added/removed to TenantConfig, this generates spurious consistency errors. This is fine in test environments, but limits the usefulness of this debug API in the field. Closes: https://github.com/neondatabase/neon/issues/10369 ## Summary of changes - Do a decode/encode cycle on the config before comparing it, so that it will have exactly the expected fields. --- storage_controller/src/service.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 1d85839881..f56b683b9f 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -5411,6 +5411,15 @@ impl Service { expect_shards.sort_by_key(|tsp| (tsp.tenant_id.clone(), tsp.shard_number, tsp.shard_count)); + // Because JSON contents of persistent tenants might disagree with the fields in current `TenantConfig` + // definition, we will do an encode/decode cycle to ensure any legacy fields are dropped and any new + // fields are added, before doing a comparison. + for tsp in &mut persistent_shards { + let config: TenantConfig = serde_json::from_str(&tsp.config) + .map_err(|e| ApiError::InternalServerError(e.into()))?; + tsp.config = serde_json::to_string(&config).expect("Encoding config is infallible"); + } + if persistent_shards != expect_shards { tracing::error!("Consistency check failed on shards."); From da1315479173bed01804f72ad93594dbf36ee979 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 16 Jan 2025 17:33:46 +0000 Subject: [PATCH 211/583] storcon: revise fill logic to prioritize AZ (#10411) ## Problem Node fills were limited to moving (total shards / node_count) shards. In systems that aren't perfectly balanced already, that leads us to skip migrating some of the shards that belong on this node, generating work for the optimizer later to gradually move them back. 
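To make the intended behaviour concrete ahead of the summary below, the new fill plan can be sketched as two passes. This is a simplified, hypothetical model (the `ShardInfo` type and its fields are invented for the sketch); the real logic is `fill_node_plan` in the diff further down.

```rust
// Hypothetical, simplified stand-in for the real scheduler state.
struct ShardInfo {
    id: u64,
    home_az: Option<String>,
    attached_az: String,
    has_secondary_on_fill_node: bool,
}

/// Sketch of the two-pass fill plan for a node whose AZ is `fill_az`.
fn fill_plan(shards: &[ShardInfo], fill_az: &str, fill_requirement: usize) -> Vec<u64> {
    let mut plan = Vec::new();
    let mut az_agnostic = Vec::new();

    for shard in shards.iter().filter(|s| s.has_secondary_on_fill_node) {
        match &shard.home_az {
            // Pass 1: a shard homed in this AZ but attached elsewhere is always
            // promoted back, irrespective of the target fill count.
            Some(az) if az == fill_az => {
                if shard.attached_az != *az {
                    plan.push(shard.id);
                }
            }
            // A shard homed in another AZ is never pulled here by a fill.
            Some(_) => {}
            // AZ-agnostic shards are only used to top up in pass 2.
            None => az_agnostic.push(shard.id),
        }
    }

    // Pass 2: top up with AZ-agnostic shards until the fill requirement is met.
    while plan.len() < fill_requirement {
        match az_agnostic.pop() {
            Some(id) => plan.push(id),
            None => break,
        }
    }

    plan
}
```

Pass 1 is unconditional for home-AZ shards, which is the behavioural change in this PR; pass 2 keeps the old fill-requirement top-up for shards without an AZ preference.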
## Summary of changes - Where a shard has a preferred AZ and is currently attached outside this AZ, then always promote it during fill, irrespective of target fill count --- storage_controller/src/service.rs | 121 ++++++++++++++++++------------ 1 file changed, 71 insertions(+), 50 deletions(-) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index f56b683b9f..9ac9ee17ca 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -7279,19 +7279,14 @@ impl Service { Ok(()) } - /// Create a node fill plan (pick secondaries to promote) that meets the following requirements: - /// 1. The node should be filled until it reaches the expected cluster average of - /// attached shards. If there are not enough secondaries on the node, the plan stops early. - /// 2. Select tenant shards to promote such that the number of attached shards is balanced - /// throughout the cluster. We achieve this by picking tenant shards from each node, - /// starting from the ones with the largest number of attached shards, until the node - /// reaches the expected cluster average. - /// 3. Avoid promoting more shards of the same tenant than required. The upper bound - /// for the number of tenants from the same shard promoted to the node being filled is: - /// shard count for the tenant divided by the number of nodes in the cluster. + /// Create a node fill plan (pick secondaries to promote), based on: + /// 1. Shards which have a secondary on this node, and this node is in their home AZ, and are currently attached to a node + /// outside their home AZ, should be migrated back here. + /// 2. If after step 1 we have not migrated enough shards for this node to have its fair share of + /// attached shards, we will promote more shards from the nodes with the most attached shards, unless + /// those shards have a home AZ that doesn't match the node we're filling. fn fill_node_plan(&self, node_id: NodeId) -> Vec { let mut locked = self.inner.write().unwrap(); - let fill_requirement = locked.scheduler.compute_fill_requirement(node_id); let (nodes, tenants, _scheduler) = locked.parts_mut(); let node_az = nodes @@ -7300,53 +7295,79 @@ impl Service { .get_availability_zone_id() .clone(); - let mut tids_by_node = tenants - .iter_mut() - .filter_map(|(tid, tenant_shard)| { - if !matches!( - tenant_shard.get_scheduling_policy(), - ShardSchedulingPolicy::Active - ) { - // Only include tenants in fills if they have a normal (Active) scheduling policy. We - // even exclude Essential, because moving to fill a node is not essential to keeping this - // tenant available. - return None; - } + // The tenant shard IDs that we plan to promote from secondary to attached on this node + let mut plan = Vec::new(); - // AZ check: when filling nodes after a restart, our intent is to move _back_ the - // shards which belong on this node, not to promote shards whose scheduling preference - // would be on their currently attached node. So will avoid promoting shards whose - // home AZ doesn't match the AZ of the node we're filling. - match tenant_shard.preferred_az() { - None => { - // Shard doesn't have an AZ preference: it is elegible to be moved. - } - Some(az) if az == &node_az => { - // This shard's home AZ is equal to the node we're filling: it is - // elegible to be moved: fall through; - } - Some(_) => { - // This shard's home AZ is somewhere other than the node we're filling: - // do not include it in the fill plan. 
- return None; - } - } + // Collect shards which do not have a preferred AZ & are elegible for moving in stage 2 + let mut free_tids_by_node: HashMap> = HashMap::new(); - if tenant_shard.intent.get_secondary().contains(&node_id) { + // Don't respect AZ preferences if there is only one AZ. This comes up in tests, but it could + // conceivably come up in real life if deploying a single-AZ region intentionally. + let respect_azs = nodes + .values() + .map(|n| n.get_availability_zone_id()) + .unique() + .count() + > 1; + + // Step 1: collect all shards that we are required to migrate back to this node because their AZ preference + // requires it. + for (tsid, tenant_shard) in tenants { + if !tenant_shard.intent.get_secondary().contains(&node_id) { + // Shard doesn't have a secondary on this node, ignore it. + continue; + } + + // AZ check: when filling nodes after a restart, our intent is to move _back_ the + // shards which belong on this node, not to promote shards whose scheduling preference + // would be on their currently attached node. So will avoid promoting shards whose + // home AZ doesn't match the AZ of the node we're filling. + match tenant_shard.preferred_az() { + _ if !respect_azs => { if let Some(primary) = tenant_shard.intent.get_attached() { - return Some((*primary, *tid)); + free_tids_by_node.entry(*primary).or_default().push(*tsid); } } + None => { + // Shard doesn't have an AZ preference: it is elegible to be moved, but we + // will only do so if our target shard count requires it. + if let Some(primary) = tenant_shard.intent.get_attached() { + free_tids_by_node.entry(*primary).or_default().push(*tsid); + } + } + Some(az) if az == &node_az => { + // This shard's home AZ is equal to the node we're filling: it should + // be moved back to this node as part of filling, unless its currently + // attached location is also in its home AZ. + if let Some(primary) = tenant_shard.intent.get_attached() { + if nodes + .get(primary) + .expect("referenced node must exist") + .get_availability_zone_id() + != tenant_shard + .preferred_az() + .expect("tenant must have an AZ preference") + { + plan.push(*tsid) + } + } else { + plan.push(*tsid) + } + } + Some(_) => { + // This shard's home AZ is somewhere other than the node we're filling, + // it may not be moved back to this node as part of filling. 
Ignore it + } + } + } - None - }) - .into_group_map(); + // Step 2: also promote any AZ-agnostic shards as required to achieve the target number of attachments + let fill_requirement = locked.scheduler.compute_fill_requirement(node_id); let expected_attached = locked.scheduler.expected_attached_shard_count(); let nodes_by_load = locked.scheduler.nodes_by_attached_shard_count(); let mut promoted_per_tenant: HashMap = HashMap::new(); - let mut plan = Vec::new(); for (node_id, attached) in nodes_by_load { let available = locked.nodes.get(&node_id).is_some_and(|n| n.is_available()); @@ -7355,7 +7376,7 @@ impl Service { } if plan.len() >= fill_requirement - || tids_by_node.is_empty() + || free_tids_by_node.is_empty() || attached <= expected_attached { break; @@ -7367,7 +7388,7 @@ impl Service { let mut remove_node = false; while take > 0 { - match tids_by_node.get_mut(&node_id) { + match free_tids_by_node.get_mut(&node_id) { Some(tids) => match tids.pop() { Some(tid) => { let max_promote_for_tenant = std::cmp::max( @@ -7393,7 +7414,7 @@ impl Service { } if remove_node { - tids_by_node.remove(&node_id); + free_tids_by_node.remove(&node_id); } } From 3a285a046b03954eb4ceb706d045510c70a41905 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 16 Jan 2025 18:51:56 +0000 Subject: [PATCH 212/583] pageserver: include node id when subscribing to SK (#10432) ## Problem All pageserver have the same application name which makes it hard to distinguish them. ## Summary of changes Include the node id in the application name sent to the safekeeper. This should gives us more visibility in logs. There's a few metrics that will increase in cardinality by `pageserver_count`, but that's fine. --- .../src/tenant/timeline/walreceiver/walreceiver_connection.rs | 2 +- test_runner/regress/test_tenants.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 129b987e57..01c272633c 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -140,7 +140,7 @@ pub(super) async fn handle_walreceiver_connection( let (replication_client, connection) = { let mut config = wal_source_connconf.to_tokio_postgres_config(); - config.application_name("pageserver"); + config.application_name(format!("pageserver-{}", node.0).as_str()); config.replication_mode(tokio_postgres::config::ReplicationMode::Physical); match time::timeout(connect_timeout, config.connect(postgres::NoTls)).await { Ok(client_and_conn) => client_and_conn?, diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index d31901b384..b4c968b217 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -194,7 +194,7 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder): io_metrics = query_all_safekeepers( "safekeeper_pg_io_bytes_total", { - "app_name": "pageserver", + "app_name": f"pageserver-{env.pageserver.id}", "client_az": "test_ps_az", "dir": io_direction, "same_az": "false", From 8f2ebc068420b62a9cd89292870b68bb0b74664c Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 16 Jan 2025 19:00:16 +0000 Subject: [PATCH 213/583] tests: stabilize test_storage_controller_node_deletion (#10420) ## Problem `test_storage_controller_node_deletion` sometimes failed because shards were moving around during timeline creation, and neon_local isn't 
tolerant of that. The movements were unexpected because the shards had only just been created. This was a regression from #9916 Closes: #10383 ## Summary of changes - Make this test use multiple AZs -- this makes the storage controller's scheduling reliably stable Why this works: in #9916 , I made a simplifying assumption that we would have multiple AZs to get nice stable scheduling -- it's much easier, because each tenant has a well defined primary+secondary location when they have an AZ preference and nodes have different AZs. Everything still works if you don't have multiple AZs, but you just have this quirk that sometimes the optimizer can disagree with initial scheduling, so once in a while a shard moves after being created -- annoying for tests, harmless IRL. --- test_runner/fixtures/neon_fixtures.py | 14 ++++++++++++-- test_runner/regress/test_storage_controller.py | 4 ++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index c3950e9bf7..a01cb47984 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -370,6 +370,7 @@ class NeonEnvBuilder: pageserver_config_override: str | Callable[[dict[str, Any]], None] | None = None, num_safekeepers: int = 1, num_pageservers: int = 1, + num_azs: int = 1, # Use non-standard SK ids to check for various parsing bugs safekeepers_id_start: int = 0, # fsync is disabled by default to make the tests go faster @@ -401,6 +402,7 @@ class NeonEnvBuilder: self.pageserver_config_override = pageserver_config_override self.num_safekeepers = num_safekeepers self.num_pageservers = num_pageservers + self.num_azs = num_azs self.safekeepers_id_start = safekeepers_id_start self.safekeepers_enable_fsync = safekeepers_enable_fsync self.auth_enabled = auth_enabled @@ -990,6 +992,7 @@ class NeonEnv: self.endpoints = EndpointFactory(self) self.safekeepers: list[Safekeeper] = [] self.pageservers: list[NeonPageserver] = [] + self.num_azs = config.num_azs self.broker = NeonBroker(self) self.pageserver_remote_storage = config.pageserver_remote_storage self.safekeepers_remote_storage = config.safekeepers_remote_storage @@ -1090,14 +1093,21 @@ class NeonEnv: http=self.port_distributor.get_port(), ) + # Availabilty zones may also be configured manually with `NeonEnvBuilder.pageserver_config_override` + if self.num_azs > 1: + # Round-robin assignment of AZ names like us-east-2a, us-east-2b, etc. 
+ az_prefix = DEFAULT_AZ_ID[:-1] + availability_zone = f"{az_prefix}{chr(ord('a') + (ps_id - 1) % self.num_azs)}" + else: + availability_zone = DEFAULT_AZ_ID + ps_cfg: dict[str, Any] = { "id": ps_id, "listen_pg_addr": f"localhost:{pageserver_port.pg}", "listen_http_addr": f"localhost:{pageserver_port.http}", "pg_auth_type": pg_auth_type, "http_auth_type": http_auth_type, - # Default which can be overriden with `NeonEnvBuilder.pageserver_config_override` - "availability_zone": DEFAULT_AZ_ID, + "availability_zone": availability_zone, # Disable pageserver disk syncs in tests: when running tests concurrently, this avoids # the pageserver taking a long time to start up due to syncfs flushing other tests' data "no_sync": True, diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index ff479e8fe2..350fe31099 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -2394,6 +2394,7 @@ def test_storage_controller_node_deletion( Test that deleting a node works & properly reschedules everything that was on the node. """ neon_env_builder.num_pageservers = 3 + neon_env_builder.num_azs = 3 env = neon_env_builder.init_configs() env.start() @@ -2407,6 +2408,9 @@ def test_storage_controller_node_deletion( tid, placement_policy='{"Attached":1}', shard_count=shard_count_per_tenant ) + # Sanity check: initial creations should not leave the system in an unstable scheduling state + assert env.storage_controller.reconcile_all() == 0 + victim = env.pageservers[-1] # The procedure a human would follow is: From b0838a68e50e7caf3ae107cfa647093e0f62bb64 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Thu, 16 Jan 2025 13:49:04 -0600 Subject: [PATCH 214/583] Enable pgx_ulid on Postgres 17 (#10397) The extension now supports Postgres 17. The release also seems to be binary compatible with the previous version. Signed-off-by: Tristan Partin --- compute/compute-node.Dockerfile | 43 +++++++++++++++++++++++++++------ 1 file changed, 35 insertions(+), 8 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index f56a8358d2..8c7200c5cb 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -995,24 +995,50 @@ RUN wget https://github.com/kelvich/pg_tiktoken/archive/9118dd4549b7d8c0bbc98e04 ######################################################################################### # # Layer "pg-pgx-ulid-build" -# Compile "pgx_ulid" extension +# Compile "pgx_ulid" extension for v16 and below # ######################################################################################### FROM rust-extensions-build AS pg-pgx-ulid-build ARG PG_VERSION -# doesn't support v17 yet -# https://github.com/pksunkara/pgx_ulid/pull/52 -RUN case "${PG_VERSION}" in "v17") \ - echo "pgx_ulid does not support pg17 as of the latest version (0.1.5)" && exit 0;; \ +RUN case "${PG_VERSION}" in \ + "v14" | "v15" | "v16") \ + ;; \ + *) \ + echo "skipping the version of pgx_ulid for $PG_VERSION" && exit 0 \ + ;; \ esac && \ wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.5.tar.gz -O pgx_ulid.tar.gz && \ - echo "9d1659a2da65af0133d5451c454de31b37364e3502087dadf579f790bc8bef17 pgx_ulid.tar.gz" | sha256sum --check && \ + echo "9d1659a2da65af0133d5451c454de31b37364e3502087dadf579f790bc8bef17 pgx_ulid.tar.gz" | sha256sum --check && \ mkdir pgx_ulid-src && cd pgx_ulid-src && tar xzf ../pgx_ulid.tar.gz --strip-components=1 -C . 
&& \ - sed -i 's/pgrx = "^0.11.2"/pgrx = { version = "=0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx = "^0.11.2"/pgrx = { version = "0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release && \ - echo "trusted = true" >> /usr/local/pgsql/share/extension/ulid.control + echo 'trusted = true' >> /usr/local/pgsql/share/extension/ulid.control + +######################################################################################### +# +# Layer "pg-pgx-ulid-pgrx12-build" +# Compile "pgx_ulid" extension for v17 and up +# +######################################################################################### + +FROM rust-extensions-build-pgrx12 AS pg-pgx-ulid-pgrx12-build +ARG PG_VERSION + +RUN case "${PG_VERSION}" in \ + "v17") \ + ;; \ + *) \ + echo "skipping the version of pgx_ulid for $PG_VERSION" && exit 0 \ + ;; \ + esac && \ + wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.2.0.tar.gz -O pgx_ulid.tar.gz && \ + echo "cef6a9a2e5e7bd1a10a18989286586ee9e6c1c06005a4055cff190de41bf3e9f pgx_ulid.tar.gz" | sha256sum --check && \ + mkdir pgx_ulid-src && cd pgx_ulid-src && tar xzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \ + sed -i 's/pgrx = "^0.12.7"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + cargo pgrx install --release && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgx_ulid.control ######################################################################################### # @@ -1157,6 +1183,7 @@ COPY --from=timescaledb-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-hint-plan-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-cron-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg-pgx-ulid-pgrx12-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-session-jwt-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-uuidv7-pg-build /usr/local/pgsql/ /usr/local/pgsql/ From c47c5f4acefa1cd2edbe0f39dac8cefd2fc95284 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 16 Jan 2025 21:34:02 +0100 Subject: [PATCH 215/583] fix(page_service pipelining): tenant cannot shut down because gate kept open while flushing responses (#10386) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # Refs - fixes https://github.com/neondatabase/neon/issues/10309 - fixup of batching design, first introduced in https://github.com/neondatabase/neon/pull/9851 - refinement of https://github.com/neondatabase/neon/pull/8339 # Problem `Tenant::shutdown` was occasionally taking many minutes (sometimes up to 20) in staging and prod if the `page_service_pipelining.mode="concurrent-futures"` is enabled. # Symptoms The issue happens during shard migration between pageservers. There is page_service unavailability and hence effectively downtime for customers in the following case: 1. The source (state `AttachedStale`) gets stuck in `Tenant::shutdown`, waiting for the gate to close. 2. Cplane/Storcon decides to transition the target `AttachedMulti` to `AttachedSingle`. 3. That transition comes with a bump of the generation number, causing the `PUT .../location_config` endpoint to do a full `Tenant::shutdown` / `Tenant::attach` cycle for the target location. 4. That `Tenant::shutdown` on the target gets stuck, waiting for the gate to close. 5. 
Eventually the gate closes (`close completed`), correlating with a `page_service` connection handler logging that it's exiting because of a network error (`Connection reset by peer` or `Broken pipe`). While in (4): - `Tenant::shutdown` is stuck waiting for all `Timeline::shutdown` calls to complete. So, really, this is a `Timeline::shutdown` bug. - retries from Cplane/Storcon to complete above state transitions, fail with errors related to the tenant mgr slot being in state `TenantSlot::InProgress`, the tenant state being `TenantState::Stopping`, and the timelines being in `TimelineState::Stopping`, and the `Timeline::cancel` being cancelled. - Existing (and/or new?) page_service connections log errors `error reading relation or page version: Not found: Timed out waiting 30s for tenant active state. Latest state: None` # Root-Cause After a lengthy investigation ([internal write-up](https://www.notion.so/neondatabase/2025-01-09-batching-deadlock-Slow-Log-Analysis-in-Staging-176f189e00478050bc21c1a072157ca4?pvs=4)) I arrived at the following root cause. The `spsc_fold` channel (`batch_tx`/`batch_rx`) that connects the Batcher and Executor stages of the pipelined mode was storing a `Handle` and thus `GateGuard` of the Timeline that was not shutting down. The design assumption with pipelining was that this would always be a short transient state. However, that was incorrect: the Executor was stuck on writing/flushing an earlier response into the connection to the client, i.e., socket write being slow because of TCP backpressure. The probable scenario of how we end up in that case: 1. Compute backend process sends a continuous stream of getpage prefetch requests into the connection, but never reads the responses (why this happens: see Appendix section). 2. Batch N is processed by Batcher and Executor, up to the point where Executor starts flushing the response. 3. Batch N+1 is procssed by Batcher and queued in the `spsc_fold`. 4. Executor is still waiting for batch N flush to finish. 5. Batcher eventually hits the `TimeoutReader` error (10min). From here on it waits on the `spsc_fold.send(Err(QueryError(TimeoutReader_error)))` which will never finish because the batch already inside the `spsc_fold` is not being read by the Executor, because the Executor is still stuck in the flush. (This state is not observable at our default `info` log level) 6. Eventually, Compute backend process is killed (`close()` on the socket) or Compute as a whole gets killed (probably no clean TCP shutdown happening in that case). 7. Eventually, Pageserver TCP stack learns about (6) through RST packets and the Executor's flush() call fails with an error. 8. The Executor exits, dropping `cancel_batcher` and its end of the spsc_fold. This wakes Batcher, causing the `spsc_fold.send` to fail. Batcher exits. The pipeline shuts down as intended. We return from `process_query` and log the `Connection reset by peer` or `Broken pipe` error. The following diagram visualizes the wait-for graph at (5) ```mermaid flowchart TD Batcher --spsc_fold.send(TimeoutReader_error)--> Executor Executor --flush batch N responses--> socket.write_end socket.write_end --wait for TCP window to move forward--> Compute ``` # Analysis By holding the GateGuard inside the `spsc_fold` open, the pipelining implementation violated the principle established in (https://github.com/neondatabase/neon/pull/8339). That is, that `Handle`s must only be held across an await point if that await point is sensitive to the `>::cancel` token. 
In this case, we were holding the Handle inside the `spsc_fold` while awaiting the `pgb_writer.flush()` future. One may jump to the conclusion that we should simply peek into the spsc_fold to get that Timeline cancel token and be sensitive to it during flush, then. But that violates another principle of the design from https://github.com/neondatabase/neon/pull/8339. That is, that the page_service connection lifecycle and the Timeline lifecycles must be completely decoupled. Tt must be possible to shut down one shard without shutting down the page_service connection, because on that single connection we might be serving other shards attached to this pageserver. (The current compute client opens separate connections per shard, but, there are plans to change that.) # Solution This PR adds a `handle::WeakHandle` struct that does _not_ hold the timeline gate open. It must be `upgrade()`d to get a `handle::Handle`. That `handle::Handle` _does_ hold the timeline gate open. The batch queued inside the `spsc_fold` only holds a `WeakHandle`. We only upgrade it while calling into the various `handle_` methods, i.e., while interacting with the `Timeline` via `>`. All that code has always been required to be (and is!) sensitive to `Timeline::cancel`, and therefore we're guaranteed to bail from it quickly when `Timeline::shutdown` starts. We will drop the `Handle` immediately, before we start `pgb_writer.flush()`ing the responses. Thereby letting go of our hold on the `GateGuard`, allowing the timeline shutdown to complete while the page_service handler remains intact. # Code Changes * Reproducer & Regression Test * Developed and proven to reproduce the issue in https://github.com/neondatabase/neon/pull/10399 * Add a `Test` message to the pagestream protocol (`cfg(feature = "testing")`). * Drive-by minimal improvement to the parsing code, we now have a `PagestreamFeMessageTag`. * Refactor `pageserver/client` to allow sending and receiving `page_service` requests independently. * Add a Rust helper binary to produce situation (4) from above * Rationale: (4) and (5) are the same bug class, we're holding a gate open while `flush()`ing. * Add a Python regression test that uses the helper binary to demonstrate the problem. * Fix * Introduce and use `WeakHandle` as explained earlier. * Replace the `shut_down` atomic with two enum states for `HandleInner`, wrapped in a `Mutex`. * To make `WeakHandle::upgrade()` and `Handle::downgrade()` cache-efficient: * Wrap the `Types::Timeline` in an `Arc` * Wrap the `GateGuard` in an `Arc` * The separate `Arc`s enable uncontended cloning of the timeline reference in `upgrade()` and `downgrade()`. If instead we were `Arc::clone`, different connection handlers would be hitting the same cache line on every upgrade()/downgrade(), causing contention. * Please read the udpated module-level comment in `mod handle` module-level comment for details. # Testing & Performance The reproducer test that failed before the changes now passes, and obviously other tests are passing as well. We'll do more testing in staging, where the issue happens every ~4h if chaos migrations are enabled in storcon. Existing perf testing will be sufficient, no perf degradation is expected. It's a few more alloctations due to the added Arc's, but, they're low frequency. # Appendix: Why Compute Sometimes Doesn't Read Responses Remember, the whole problem surfaced because flush() was slow because Compute was not reading responses. Why is that? 
In short, the way the compute works, it only advances the page_service protocol processing when it has an interest in data, i.e., when the pagestore smgr is called to return pages. Thus, if compute issues a bunch of requests as part of prefetch but then it turns out it can service the query without reading those pages, it may very well happen that these messages stay in the TCP until the next smgr read happens, either in that session, or possibly in another session. If there’s too many unread responses in the TCP, the pageserver kernel is going to backpressure into userspace, resulting in our stuck flush(). All of this stems from the way vanilla Postgres does prefetching and "async IO": it issues `fadvise()` to make the kernel do the IO in the background, buffering results in the kernel page cache. It then consumes the results through synchronous `read()` system calls, which hopefully will be fast because of the `fadvise()`. If it turns out that some / all of the prefetch results are not needed, Postgres will not be issuing those `read()` system calls. The kernel will eventually react to that by reusing page cache pages that hold completed prefetched data. Uncompleted prefetch requests may or may not be processed -- it's up to the kernel. In Neon, the smgr + Pageserver together take on the role of the kernel in above paragraphs. In the current implementation, all prefetches are sent as GetPage requests to Pageserver. The responses are only processed in the places where vanilla Postgres would do the synchronous `read()` system call. If we never get to that, the responses are queued inside the TCP connection, which, once buffers run full, will backpressure into Pageserver's sending code, i.e., the `pgb_writer.flush()` that was the root cause of the problems we're fixing in this PR. --- libs/pageserver_api/src/models.rs | 246 +++++++-- pageserver/Cargo.toml | 6 +- pageserver/client/Cargo.toml | 3 + pageserver/client/src/page_service.rs | 139 ++++- .../src/bin/test_helper_slow_client_reads.rs | 65 +++ pageserver/src/metrics.rs | 2 + pageserver/src/page_service.rs | 184 ++++++- .../src/tenant/remote_timeline_client.rs | 6 + pageserver/src/tenant/timeline.rs | 7 +- pageserver/src/tenant/timeline/handle.rs | 518 ++++++++++++------ pgxn/neon/pagestore_client.h | 4 + .../test_page_service_batching_regressions.py | 60 ++ 12 files changed, 966 insertions(+), 274 deletions(-) create mode 100644 pageserver/src/bin/test_helper_slow_client_reads.rs create mode 100644 test_runner/regress/test_page_service_batching_regressions.py diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 87e8df2ab6..c38af9cb80 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -33,7 +33,6 @@ use crate::{ reltag::RelTag, shard::{ShardCount, ShardStripeSize, TenantShardId}, }; -use anyhow::bail; use bytes::{Buf, BufMut, Bytes, BytesMut}; /// The state of a tenant in this pageserver. 
@@ -1400,6 +1399,8 @@ pub enum PagestreamFeMessage { GetPage(PagestreamGetPageRequest), DbSize(PagestreamDbSizeRequest), GetSlruSegment(PagestreamGetSlruSegmentRequest), + #[cfg(feature = "testing")] + Test(PagestreamTestRequest), } // Wrapped in libpq CopyData @@ -1411,6 +1412,22 @@ pub enum PagestreamBeMessage { Error(PagestreamErrorResponse), DbSize(PagestreamDbSizeResponse), GetSlruSegment(PagestreamGetSlruSegmentResponse), + #[cfg(feature = "testing")] + Test(PagestreamTestResponse), +} + +// Keep in sync with `pagestore_client.h` +#[repr(u8)] +enum PagestreamFeMessageTag { + Exists = 0, + Nblocks = 1, + GetPage = 2, + DbSize = 3, + GetSlruSegment = 4, + /* future tags above this line */ + /// For testing purposes, not available in production. + #[cfg(feature = "testing")] + Test = 99, } // Keep in sync with `pagestore_client.h` @@ -1422,7 +1439,28 @@ enum PagestreamBeMessageTag { Error = 103, DbSize = 104, GetSlruSegment = 105, + /* future tags above this line */ + /// For testing purposes, not available in production. + #[cfg(feature = "testing")] + Test = 199, } + +impl TryFrom for PagestreamFeMessageTag { + type Error = u8; + fn try_from(value: u8) -> Result { + match value { + 0 => Ok(PagestreamFeMessageTag::Exists), + 1 => Ok(PagestreamFeMessageTag::Nblocks), + 2 => Ok(PagestreamFeMessageTag::GetPage), + 3 => Ok(PagestreamFeMessageTag::DbSize), + 4 => Ok(PagestreamFeMessageTag::GetSlruSegment), + #[cfg(feature = "testing")] + 99 => Ok(PagestreamFeMessageTag::Test), + _ => Err(value), + } + } +} + impl TryFrom for PagestreamBeMessageTag { type Error = u8; fn try_from(value: u8) -> Result { @@ -1433,6 +1471,8 @@ impl TryFrom for PagestreamBeMessageTag { 103 => Ok(PagestreamBeMessageTag::Error), 104 => Ok(PagestreamBeMessageTag::DbSize), 105 => Ok(PagestreamBeMessageTag::GetSlruSegment), + #[cfg(feature = "testing")] + 199 => Ok(PagestreamBeMessageTag::Test), _ => Err(value), } } @@ -1550,6 +1590,20 @@ pub struct PagestreamDbSizeResponse { pub db_size: i64, } +#[cfg(feature = "testing")] +#[derive(Debug, PartialEq, Eq, Clone)] +pub struct PagestreamTestRequest { + pub hdr: PagestreamRequest, + pub batch_key: u64, + pub message: String, +} + +#[cfg(feature = "testing")] +#[derive(Debug)] +pub struct PagestreamTestResponse { + pub req: PagestreamTestRequest, +} + // This is a cut-down version of TenantHistorySize from the pageserver crate, omitting fields // that require pageserver-internal types. It is sufficient to get the total size. 
#[derive(Serialize, Deserialize, Debug)] @@ -1569,7 +1623,7 @@ impl PagestreamFeMessage { match self { Self::Exists(req) => { - bytes.put_u8(0); + bytes.put_u8(PagestreamFeMessageTag::Exists as u8); bytes.put_u64(req.hdr.reqid); bytes.put_u64(req.hdr.request_lsn.0); bytes.put_u64(req.hdr.not_modified_since.0); @@ -1580,7 +1634,7 @@ impl PagestreamFeMessage { } Self::Nblocks(req) => { - bytes.put_u8(1); + bytes.put_u8(PagestreamFeMessageTag::Nblocks as u8); bytes.put_u64(req.hdr.reqid); bytes.put_u64(req.hdr.request_lsn.0); bytes.put_u64(req.hdr.not_modified_since.0); @@ -1591,7 +1645,7 @@ impl PagestreamFeMessage { } Self::GetPage(req) => { - bytes.put_u8(2); + bytes.put_u8(PagestreamFeMessageTag::GetPage as u8); bytes.put_u64(req.hdr.reqid); bytes.put_u64(req.hdr.request_lsn.0); bytes.put_u64(req.hdr.not_modified_since.0); @@ -1603,7 +1657,7 @@ impl PagestreamFeMessage { } Self::DbSize(req) => { - bytes.put_u8(3); + bytes.put_u8(PagestreamFeMessageTag::DbSize as u8); bytes.put_u64(req.hdr.reqid); bytes.put_u64(req.hdr.request_lsn.0); bytes.put_u64(req.hdr.not_modified_since.0); @@ -1611,13 +1665,24 @@ impl PagestreamFeMessage { } Self::GetSlruSegment(req) => { - bytes.put_u8(4); + bytes.put_u8(PagestreamFeMessageTag::GetSlruSegment as u8); bytes.put_u64(req.hdr.reqid); bytes.put_u64(req.hdr.request_lsn.0); bytes.put_u64(req.hdr.not_modified_since.0); bytes.put_u8(req.kind); bytes.put_u32(req.segno); } + #[cfg(feature = "testing")] + Self::Test(req) => { + bytes.put_u8(PagestreamFeMessageTag::Test as u8); + bytes.put_u64(req.hdr.reqid); + bytes.put_u64(req.hdr.request_lsn.0); + bytes.put_u64(req.hdr.not_modified_since.0); + bytes.put_u64(req.batch_key); + let message = req.message.as_bytes(); + bytes.put_u64(message.len() as u64); + bytes.put_slice(message); + } } bytes.into() @@ -1645,56 +1710,66 @@ impl PagestreamFeMessage { ), }; - match msg_tag { - 0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest { - hdr: PagestreamRequest { - reqid, - request_lsn, - not_modified_since, - }, - rel: RelTag { - spcnode: body.read_u32::()?, + match PagestreamFeMessageTag::try_from(msg_tag) + .map_err(|tag: u8| anyhow::anyhow!("invalid tag {tag}"))? 
+ { + PagestreamFeMessageTag::Exists => { + Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, + rel: RelTag { + spcnode: body.read_u32::()?, + dbnode: body.read_u32::()?, + relnode: body.read_u32::()?, + forknum: body.read_u8()?, + }, + })) + } + PagestreamFeMessageTag::Nblocks => { + Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, + rel: RelTag { + spcnode: body.read_u32::()?, + dbnode: body.read_u32::()?, + relnode: body.read_u32::()?, + forknum: body.read_u8()?, + }, + })) + } + PagestreamFeMessageTag::GetPage => { + Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, + rel: RelTag { + spcnode: body.read_u32::()?, + dbnode: body.read_u32::()?, + relnode: body.read_u32::()?, + forknum: body.read_u8()?, + }, + blkno: body.read_u32::()?, + })) + } + PagestreamFeMessageTag::DbSize => { + Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, dbnode: body.read_u32::()?, - relnode: body.read_u32::()?, - forknum: body.read_u8()?, - }, - })), - 1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest { - hdr: PagestreamRequest { - reqid, - request_lsn, - not_modified_since, - }, - rel: RelTag { - spcnode: body.read_u32::()?, - dbnode: body.read_u32::()?, - relnode: body.read_u32::()?, - forknum: body.read_u8()?, - }, - })), - 2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest { - hdr: PagestreamRequest { - reqid, - request_lsn, - not_modified_since, - }, - rel: RelTag { - spcnode: body.read_u32::()?, - dbnode: body.read_u32::()?, - relnode: body.read_u32::()?, - forknum: body.read_u8()?, - }, - blkno: body.read_u32::()?, - })), - 3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest { - hdr: PagestreamRequest { - reqid, - request_lsn, - not_modified_since, - }, - dbnode: body.read_u32::()?, - })), - 4 => Ok(PagestreamFeMessage::GetSlruSegment( + })) + } + PagestreamFeMessageTag::GetSlruSegment => Ok(PagestreamFeMessage::GetSlruSegment( PagestreamGetSlruSegmentRequest { hdr: PagestreamRequest { reqid, @@ -1705,7 +1780,21 @@ impl PagestreamFeMessage { segno: body.read_u32::()?, }, )), - _ => bail!("unknown smgr message tag: {:?}", msg_tag), + #[cfg(feature = "testing")] + PagestreamFeMessageTag::Test => Ok(PagestreamFeMessage::Test(PagestreamTestRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, + batch_key: body.read_u64::()?, + message: { + let len = body.read_u64::()?; + let mut buf = vec![0; len as usize]; + body.read_exact(&mut buf)?; + String::from_utf8(buf)? 
+ }, + })), } } } @@ -1748,6 +1837,15 @@ impl PagestreamBeMessage { bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32); bytes.put(&resp.segment[..]); } + + #[cfg(feature = "testing")] + Self::Test(resp) => { + bytes.put_u8(Tag::Test as u8); + bytes.put_u64(resp.req.batch_key); + let message = resp.req.message.as_bytes(); + bytes.put_u64(message.len() as u64); + bytes.put_slice(message); + } } } PagestreamProtocolVersion::V3 => { @@ -1816,6 +1914,18 @@ impl PagestreamBeMessage { bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32); bytes.put(&resp.segment[..]); } + + #[cfg(feature = "testing")] + Self::Test(resp) => { + bytes.put_u8(Tag::Test as u8); + bytes.put_u64(resp.req.hdr.reqid); + bytes.put_u64(resp.req.hdr.request_lsn.0); + bytes.put_u64(resp.req.hdr.not_modified_since.0); + bytes.put_u64(resp.req.batch_key); + let message = resp.req.message.as_bytes(); + bytes.put_u64(message.len() as u64); + bytes.put_slice(message); + } } } } @@ -1958,6 +2068,28 @@ impl PagestreamBeMessage { segment: segment.into(), }) } + #[cfg(feature = "testing")] + Tag::Test => { + let reqid = buf.read_u64::()?; + let request_lsn = Lsn(buf.read_u64::()?); + let not_modified_since = Lsn(buf.read_u64::()?); + let batch_key = buf.read_u64::()?; + let len = buf.read_u64::()?; + let mut msg = vec![0; len as usize]; + buf.read_exact(&mut msg)?; + let message = String::from_utf8(msg)?; + Self::Test(PagestreamTestResponse { + req: PagestreamTestRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, + batch_key, + message, + }, + }) + } }; let remaining = buf.into_inner(); if !remaining.is_empty() { @@ -1977,6 +2109,8 @@ impl PagestreamBeMessage { Self::Error(_) => "Error", Self::DbSize(_) => "DbSize", Self::GetSlruSegment(_) => "GetSlruSegment", + #[cfg(feature = "testing")] + Self::Test(_) => "Test", } } } diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 9195951191..9c835c956b 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -8,7 +8,7 @@ license.workspace = true default = [] # Enables test-only APIs, incuding failpoints. 
In particular, enables the `fail_point!` macro, # which adds some runtime cost to run tests on outage conditions -testing = ["fail/failpoints", "pageserver_api/testing", "wal_decoder/testing"] +testing = ["fail/failpoints", "pageserver_api/testing", "wal_decoder/testing", "pageserver_client/testing"] [dependencies] anyhow.workspace = true @@ -114,3 +114,7 @@ harness = false [[bench]] name = "upload_queue" harness = false + +[[bin]] +name = "test_helper_slow_client_reads" +required-features = [ "testing" ] diff --git a/pageserver/client/Cargo.toml b/pageserver/client/Cargo.toml index d9b36bf3d4..f582d307a7 100644 --- a/pageserver/client/Cargo.toml +++ b/pageserver/client/Cargo.toml @@ -4,6 +4,9 @@ version = "0.1.0" edition.workspace = true license.workspace = true +[features] +testing = [ "pageserver_api/testing" ] + [dependencies] pageserver_api.workspace = true thiserror.workspace = true diff --git a/pageserver/client/src/page_service.rs b/pageserver/client/src/page_service.rs index 207ec4166c..27280912b4 100644 --- a/pageserver/client/src/page_service.rs +++ b/pageserver/client/src/page_service.rs @@ -1,6 +1,9 @@ -use std::pin::Pin; +use std::sync::{Arc, Mutex}; -use futures::SinkExt; +use futures::{ + stream::{SplitSink, SplitStream}, + SinkExt, StreamExt, +}; use pageserver_api::{ models::{ PagestreamBeMessage, PagestreamFeMessage, PagestreamGetPageRequest, @@ -10,7 +13,6 @@ use pageserver_api::{ }; use tokio::task::JoinHandle; use tokio_postgres::CopyOutStream; -use tokio_stream::StreamExt; use tokio_util::sync::CancellationToken; use utils::{ id::{TenantId, TimelineId}, @@ -62,15 +64,28 @@ impl Client { .client .copy_both_simple(&format!("pagestream_v3 {tenant_id} {timeline_id}")) .await?; + let (sink, stream) = copy_both.split(); // TODO: actually support splitting of the CopyBothDuplex so the lock inside this split adaptor goes away. let Client { cancel_on_client_drop, conn_task, client: _, } = self; + let shared = Arc::new(Mutex::new(PagestreamShared::ConnTaskRunning( + ConnTaskRunning { + cancel_on_client_drop, + conn_task, + }, + ))); Ok(PagestreamClient { - copy_both: Box::pin(copy_both), - conn_task, - cancel_on_client_drop, + sink: PagestreamSender { + shared: shared.clone(), + sink, + }, + stream: PagestreamReceiver { + shared: shared.clone(), + stream, + }, + shared, }) } @@ -97,7 +112,28 @@ impl Client { /// Create using [`Client::pagestream`]. pub struct PagestreamClient { - copy_both: Pin>>, + shared: Arc>, + sink: PagestreamSender, + stream: PagestreamReceiver, +} + +pub struct PagestreamSender { + #[allow(dead_code)] + shared: Arc>, + sink: SplitSink, bytes::Bytes>, +} + +pub struct PagestreamReceiver { + #[allow(dead_code)] + shared: Arc>, + stream: SplitStream>, +} + +enum PagestreamShared { + ConnTaskRunning(ConnTaskRunning), + ConnTaskCancelledJoinHandleReturnedOrDropped, +} +struct ConnTaskRunning { cancel_on_client_drop: Option, conn_task: JoinHandle<()>, } @@ -110,11 +146,11 @@ pub struct RelTagBlockNo { impl PagestreamClient { pub async fn shutdown(self) { let Self { - copy_both, - cancel_on_client_drop: cancel_conn_task, - conn_task, - } = self; - // The `copy_both` contains internal channel sender, the receiver of which is polled by `conn_task`. + shared, + sink, + stream, + } = { self }; + // The `copy_both` split into `sink` and `stream` contains internal channel sender, the receiver of which is polled by `conn_task`. // When `conn_task` observes the sender has been dropped, it sends a `FeMessage::CopyFail` into the connection. 
// (see https://github.com/neondatabase/rust-postgres/blob/2005bf79573b8add5cf205b52a2b208e356cc8b0/tokio-postgres/src/copy_both.rs#L56). // @@ -131,27 +167,77 @@ impl PagestreamClient { // // NB: page_service doesn't have a use case to exit the `pagestream` mode currently. // => https://github.com/neondatabase/neon/issues/6390 - let _ = cancel_conn_task.unwrap(); + let ConnTaskRunning { + cancel_on_client_drop, + conn_task, + } = { + let mut guard = shared.lock().unwrap(); + match std::mem::replace( + &mut *guard, + PagestreamShared::ConnTaskCancelledJoinHandleReturnedOrDropped, + ) { + PagestreamShared::ConnTaskRunning(conn_task_running) => conn_task_running, + PagestreamShared::ConnTaskCancelledJoinHandleReturnedOrDropped => unreachable!(), + } + }; + let _ = cancel_on_client_drop.unwrap(); conn_task.await.unwrap(); - drop(copy_both); + + // Now drop the split copy_both. + drop(sink); + drop(stream); + } + + pub fn split(self) -> (PagestreamSender, PagestreamReceiver) { + let Self { + shared: _, + sink, + stream, + } = self; + (sink, stream) } pub async fn getpage( &mut self, req: PagestreamGetPageRequest, ) -> anyhow::Result { - let req = PagestreamFeMessage::GetPage(req); - let req: bytes::Bytes = req.serialize(); - // let mut req = tokio_util::io::ReaderStream::new(&req); - let mut req = tokio_stream::once(Ok(req)); + self.getpage_send(req).await?; + self.getpage_recv().await + } - self.copy_both.send_all(&mut req).await?; + pub async fn getpage_send(&mut self, req: PagestreamGetPageRequest) -> anyhow::Result<()> { + self.sink.getpage_send(req).await + } - let next: Option> = self.copy_both.next().await; + pub async fn getpage_recv(&mut self) -> anyhow::Result { + self.stream.getpage_recv().await + } +} + +impl PagestreamSender { + // TODO: maybe make this impl Sink instead for better composability? + pub async fn send(&mut self, msg: PagestreamFeMessage) -> anyhow::Result<()> { + let msg = msg.serialize(); + self.sink.send_all(&mut tokio_stream::once(Ok(msg))).await?; + Ok(()) + } + + pub async fn getpage_send(&mut self, req: PagestreamGetPageRequest) -> anyhow::Result<()> { + self.send(PagestreamFeMessage::GetPage(req)).await + } +} + +impl PagestreamReceiver { + // TODO: maybe make this impl Stream instead for better composability? 
+ pub async fn recv(&mut self) -> anyhow::Result { + let next: Option> = self.stream.next().await; let next: bytes::Bytes = next.unwrap()?; + PagestreamBeMessage::deserialize(next) + } - let msg = PagestreamBeMessage::deserialize(next)?; - match msg { + pub async fn getpage_recv(&mut self) -> anyhow::Result { + let next: PagestreamBeMessage = self.recv().await?; + match next { PagestreamBeMessage::GetPage(p) => Ok(p), PagestreamBeMessage::Error(e) => anyhow::bail!("Error: {:?}", e), PagestreamBeMessage::Exists(_) @@ -160,7 +246,14 @@ impl PagestreamClient { | PagestreamBeMessage::GetSlruSegment(_) => { anyhow::bail!( "unexpected be message kind in response to getpage request: {}", - msg.kind() + next.kind() + ) + } + #[cfg(feature = "testing")] + PagestreamBeMessage::Test(_) => { + anyhow::bail!( + "unexpected be message kind in response to getpage request: {}", + next.kind() ) } } diff --git a/pageserver/src/bin/test_helper_slow_client_reads.rs b/pageserver/src/bin/test_helper_slow_client_reads.rs new file mode 100644 index 0000000000..c1ce332b6c --- /dev/null +++ b/pageserver/src/bin/test_helper_slow_client_reads.rs @@ -0,0 +1,65 @@ +use std::{ + io::{stdin, stdout, Read, Write}, + time::Duration, +}; + +use clap::Parser; +use pageserver_api::models::{PagestreamRequest, PagestreamTestRequest}; +use utils::{ + id::{TenantId, TimelineId}, + lsn::Lsn, +}; + +#[derive(clap::Parser)] +struct Args { + connstr: String, + tenant_id: TenantId, + timeline_id: TimelineId, +} + +#[tokio::main] +async fn main() -> anyhow::Result<()> { + let Args { + connstr, + tenant_id, + timeline_id, + } = Args::parse(); + let client = pageserver_client::page_service::Client::new(connstr).await?; + let client = client.pagestream(tenant_id, timeline_id).await?; + let (mut sender, _receiver) = client.split(); + + eprintln!("filling the pipe"); + let mut msg = 0; + loop { + msg += 1; + let fut = sender.send(pageserver_api::models::PagestreamFeMessage::Test( + PagestreamTestRequest { + hdr: PagestreamRequest { + reqid: 0, + request_lsn: Lsn(23), + not_modified_since: Lsn(23), + }, + batch_key: 42, + message: format!("message {}", msg), + }, + )); + let Ok(res) = tokio::time::timeout(Duration::from_secs(10), fut).await else { + eprintln!("pipe seems full"); + break; + }; + let _: () = res?; + } + + let n = stdout().write(b"R")?; + assert_eq!(n, 1); + stdout().flush()?; + + eprintln!("waiting for signal to tell us to exit"); + + let mut buf = [0u8; 1]; + stdin().read_exact(&mut buf)?; + + eprintln!("termination signal received, exiting"); + + anyhow::Ok(()) +} diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 5b1cbbad63..3c4830e3cd 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1463,6 +1463,8 @@ pub enum SmgrQueryType { GetPageAtLsn, GetDbSize, GetSlruSegment, + #[cfg(feature = "testing")] + Test, } pub(crate) struct SmgrQueryTimePerTimeline { diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index da4180a927..b14a44f9e3 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -555,37 +555,52 @@ struct BatchedGetPageRequest { timer: SmgrOpTimer, } +#[cfg(feature = "testing")] +struct BatchedTestRequest { + req: models::PagestreamTestRequest, + timer: SmgrOpTimer, +} + +/// NB: we only hold [`timeline::handle::WeakHandle`] inside this enum, +/// so that we don't keep the [`Timeline::gate`] open while the batch +/// is being built up inside the [`spsc_fold`] (pagestream pipelining). 
enum BatchedFeMessage { Exists { span: Span, timer: SmgrOpTimer, - shard: timeline::handle::Handle, + shard: timeline::handle::WeakHandle, req: models::PagestreamExistsRequest, }, Nblocks { span: Span, timer: SmgrOpTimer, - shard: timeline::handle::Handle, + shard: timeline::handle::WeakHandle, req: models::PagestreamNblocksRequest, }, GetPage { span: Span, - shard: timeline::handle::Handle, + shard: timeline::handle::WeakHandle, effective_request_lsn: Lsn, pages: smallvec::SmallVec<[BatchedGetPageRequest; 1]>, }, DbSize { span: Span, timer: SmgrOpTimer, - shard: timeline::handle::Handle, + shard: timeline::handle::WeakHandle, req: models::PagestreamDbSizeRequest, }, GetSlruSegment { span: Span, timer: SmgrOpTimer, - shard: timeline::handle::Handle, + shard: timeline::handle::WeakHandle, req: models::PagestreamGetSlruSegmentRequest, }, + #[cfg(feature = "testing")] + Test { + span: Span, + shard: timeline::handle::WeakHandle, + requests: Vec, + }, RespondError { span: Span, error: BatchedPageStreamError, @@ -606,6 +621,12 @@ impl BatchedFeMessage { page.timer.observe_execution_start(at); } } + #[cfg(feature = "testing")] + BatchedFeMessage::Test { requests, .. } => { + for req in requests { + req.timer.observe_execution_start(at); + } + } BatchedFeMessage::RespondError { .. } => {} } } @@ -735,7 +756,7 @@ impl PageServerHandler { BatchedFeMessage::Exists { span, timer, - shard, + shard: shard.downgrade(), req, } } @@ -754,7 +775,7 @@ impl PageServerHandler { BatchedFeMessage::Nblocks { span, timer, - shard, + shard: shard.downgrade(), req, } } @@ -773,7 +794,7 @@ impl PageServerHandler { BatchedFeMessage::DbSize { span, timer, - shard, + shard: shard.downgrade(), req, } } @@ -792,7 +813,7 @@ impl PageServerHandler { BatchedFeMessage::GetSlruSegment { span, timer, - shard, + shard: shard.downgrade(), req, } } @@ -844,6 +865,7 @@ impl PageServerHandler { ) .await?; + // We're holding the Handle let effective_request_lsn = match Self::wait_or_get_last_lsn( &shard, req.hdr.request_lsn, @@ -861,11 +883,27 @@ impl PageServerHandler { }; BatchedFeMessage::GetPage { span, - shard, + shard: shard.downgrade(), effective_request_lsn, pages: smallvec::smallvec![BatchedGetPageRequest { req, timer }], } } + #[cfg(feature = "testing")] + PagestreamFeMessage::Test(req) => { + let span = tracing::info_span!(parent: parent_span, "handle_test_request"); + let shard = timeline_handles + .get(tenant_id, timeline_id, ShardSelector::Zero) + .instrument(span.clone()) // sets `shard_id` field + .await?; + let timer = + record_op_start_and_throttle(&shard, metrics::SmgrQueryType::Test, received_at) + .await?; + BatchedFeMessage::Test { + span, + shard: shard.downgrade(), + requests: vec![BatchedTestRequest { req, timer }], + } + } }; Ok(Some(batched_msg)) } @@ -907,9 +945,7 @@ impl PageServerHandler { assert_eq!(accum_pages.len(), max_batch_size.get()); return false; } - if (accum_shard.tenant_shard_id, accum_shard.timeline_id) - != (this_shard.tenant_shard_id, this_shard.timeline_id) - { + if !accum_shard.is_same_handle_as(&this_shard) { trace!(%accum_lsn, %this_lsn, "stopping batching because timeline object mismatch"); // TODO: we _could_ batch & execute each shard seperately (and in parallel). // But the current logic for keeping responses in order does not support that. @@ -928,6 +964,44 @@ impl PageServerHandler { accum_pages.extend(this_pages); Ok(()) } + #[cfg(feature = "testing")] + ( + Ok(BatchedFeMessage::Test { + shard: accum_shard, + requests: accum_requests, + .. 
+ }), + BatchedFeMessage::Test { + shard: this_shard, + requests: this_requests, + .. + }, + ) if (|| { + assert!(this_requests.len() == 1); + if accum_requests.len() >= max_batch_size.get() { + trace!(%max_batch_size, "stopping batching because of batch size"); + assert_eq!(accum_requests.len(), max_batch_size.get()); + return false; + } + if !accum_shard.is_same_handle_as(&this_shard) { + trace!("stopping batching because timeline object mismatch"); + // TODO: we _could_ batch & execute each shard seperately (and in parallel). + // But the current logic for keeping responses in order does not support that. + return false; + } + let this_batch_key = this_requests[0].req.batch_key; + let accum_batch_key = accum_requests[0].req.batch_key; + if this_requests[0].req.batch_key != accum_requests[0].req.batch_key { + trace!(%accum_batch_key, %this_batch_key, "stopping batching because batch key changed"); + return false; + } + true + })() => + { + // ok to batch + accum_requests.extend(this_requests); + Ok(()) + } // something batched already but this message is unbatchable (_, this_msg) => { // by default, don't continue batching @@ -969,7 +1043,7 @@ impl PageServerHandler { fail::fail_point!("ps::handle-pagerequest-message::exists"); ( vec![self - .handle_get_rel_exists_request(&shard, &req, ctx) + .handle_get_rel_exists_request(&*shard.upgrade()?, &req, ctx) .instrument(span.clone()) .await .map(|msg| (msg, timer)) @@ -986,7 +1060,7 @@ impl PageServerHandler { fail::fail_point!("ps::handle-pagerequest-message::nblocks"); ( vec![self - .handle_get_nblocks_request(&shard, &req, ctx) + .handle_get_nblocks_request(&*shard.upgrade()?, &req, ctx) .instrument(span.clone()) .await .map(|msg| (msg, timer)) @@ -1007,7 +1081,7 @@ impl PageServerHandler { trace!(npages, "handling getpage request"); let res = self .handle_get_page_at_lsn_request_batched( - &shard, + &*shard.upgrade()?, effective_request_lsn, pages, ctx, @@ -1029,7 +1103,7 @@ impl PageServerHandler { fail::fail_point!("ps::handle-pagerequest-message::dbsize"); ( vec![self - .handle_db_size_request(&shard, &req, ctx) + .handle_db_size_request(&*shard.upgrade()?, &req, ctx) .instrument(span.clone()) .await .map(|msg| (msg, timer)) @@ -1046,7 +1120,7 @@ impl PageServerHandler { fail::fail_point!("ps::handle-pagerequest-message::slrusegment"); ( vec![self - .handle_get_slru_segment_request(&shard, &req, ctx) + .handle_get_slru_segment_request(&*shard.upgrade()?, &req, ctx) .instrument(span.clone()) .await .map(|msg| (msg, timer)) @@ -1054,6 +1128,27 @@ impl PageServerHandler { span, ) } + #[cfg(feature = "testing")] + BatchedFeMessage::Test { + span, + shard, + requests, + } => { + fail::fail_point!("ps::handle-pagerequest-message::test"); + ( + { + let npages = requests.len(); + trace!(npages, "handling getpage request"); + let res = self + .handle_test_request_batch(&*shard.upgrade()?, requests, ctx) + .instrument(span.clone()) + .await; + assert_eq!(res.len(), npages); + res + }, + span, + ) + } BatchedFeMessage::RespondError { span, error } => { // We've already decided to respond with an error, so we don't need to // call the handler. @@ -1791,6 +1886,51 @@ impl PageServerHandler { )) } + // NB: this impl mimics what we do for batched getpage requests. 
+ #[cfg(feature = "testing")] + #[instrument(skip_all, fields(shard_id))] + async fn handle_test_request_batch( + &mut self, + timeline: &Timeline, + requests: Vec, + _ctx: &RequestContext, + ) -> Vec> { + // real requests would do something with the timeline + let mut results = Vec::with_capacity(requests.len()); + for _req in requests.iter() { + tokio::task::yield_now().await; + + results.push({ + if timeline.cancel.is_cancelled() { + Err(PageReconstructError::Cancelled) + } else { + Ok(()) + } + }); + } + + // TODO: avoid creating the new Vec here + Vec::from_iter( + requests + .into_iter() + .zip(results.into_iter()) + .map(|(req, res)| { + res.map(|()| { + ( + PagestreamBeMessage::Test(models::PagestreamTestResponse { + req: req.req.clone(), + }), + req.timer, + ) + }) + .map_err(|e| BatchedPageStreamError { + err: PageStreamError::from(e), + req: req.req.hdr, + }) + }), + ) + } + /// Note on "fullbackup": /// Full basebackups should only be used for debugging purposes. /// Originally, it was introduced to enable breaking storage format changes, @@ -2406,6 +2546,14 @@ impl From for QueryError { } } +impl From for QueryError { + fn from(e: crate::tenant::timeline::handle::HandleUpgradeError) -> Self { + match e { + crate::tenant::timeline::handle::HandleUpgradeError::ShutDown => QueryError::Shutdown, + } + } +} + fn set_tracing_field_shard_id(timeline: &Timeline) { debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id(); tracing::Span::current().record( diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 47c4a8637d..a006647785 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -382,6 +382,12 @@ pub(crate) struct RemoteTimelineClient { cancel: CancellationToken, } +impl Drop for RemoteTimelineClient { + fn drop(&mut self) { + debug!("dropping RemoteTimelineClient"); + } +} + impl RemoteTimelineClient { /// /// Create a remote storage client for given timeline diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index f24611e1d8..2ba71416b8 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -76,6 +76,7 @@ use std::{pin::pin, sync::OnceLock}; use crate::{ aux_file::AuxFileSizeEstimator, + page_service::TenantManagerTypes, tenant::{ config::AttachmentMode, layer_map::{LayerMap, SearchResult}, @@ -431,7 +432,7 @@ pub struct Timeline { pub(crate) l0_flush_global_state: L0FlushGlobalState, - pub(crate) handles: handle::PerTimelineState, + pub(crate) handles: handle::PerTimelineState, pub(crate) attach_wal_lag_cooldown: Arc>, @@ -4625,6 +4626,10 @@ impl Drop for Timeline { } } } + info!( + "Timeline {} for tenant {} is being dropped", + self.timeline_id, self.tenant_shard_id.tenant_id + ); } } diff --git a/pageserver/src/tenant/timeline/handle.rs b/pageserver/src/tenant/timeline/handle.rs index e82559b8b3..35d8c75ce1 100644 --- a/pageserver/src/tenant/timeline/handle.rs +++ b/pageserver/src/tenant/timeline/handle.rs @@ -32,54 +32,151 @@ //! //! # Design //! +//! ## Data Structures +//! //! There are three user-facing data structures: //! - `PerTimelineState`: a struct embedded into each Timeline struct. Lifetime == Timeline lifetime. //! - `Cache`: a struct private to each connection handler; Lifetime == connection lifetime. //! - `Handle`: a smart pointer that holds the Timeline gate open and derefs to `&Timeline`. -//! 
Lifetime: for a single request dispatch on the Timeline (i.e., one getpage request) +//! - `WeakHandle`: downgrade of a `Handle` that does not keep the gate open, but allows +//! trying to ugprade back to a `Handle`, guaranteeing it's the same `Timeline` *object*. //! -//! The `Handle` is just a wrapper around an `Arc`. +//! Internally, there is 0 or 1 `HandleInner` per `(Cache,Timeline)`. +//! Since Cache:Connection is 1:1, there is 0 or 1 `HandleInner` per `(Connection,Timeline)`. //! -//! There is one long-lived `Arc`, which is stored in the `PerTimelineState`. -//! The `Cache` stores a `Weak` for each cached Timeline. +//! The `HandleInner` is allocated as a `Arc>` and +//! referenced weakly and strongly from various places which we are now illustrating. +//! For brevity, we will omit the `Arc>` part in the following and instead +//! use `strong ref` and `weak ref` when referring to the `Arc>` +//! or `Weak>`, respectively. +//! +//! - The `Handle` is a strong ref. +//! - The `WeakHandle` is a weak ref. +//! - The `PerTimelineState` contains a `HashMap`. +//! - The `Cache` is a `HashMap`. +//! +//! Lifetimes: +//! - `WeakHandle` and `Handle`: single pagestream request. +//! - `Cache`: single page service connection. +//! - `PerTimelineState`: lifetime of the Timeline object (i.e., i.e., till `Timeline::shutdown`). +//! +//! ## Request Handling Flow (= filling and using the `Cache``) //! //! To dispatch a request, the page service connection calls `Cache::get`. //! //! A cache miss means we consult the tenant manager for shard routing, -//! resulting in an `Arc`. We enter its gate _once_ and construct an -//! `Arc`. We store a `Weak` in the cache -//! and the `Arc` in the `PerTimelineState`. +//! resulting in an `Arc`. We enter its gate _once_ and store it in the the +//! `Arc>>`. A weak ref is stored in the `Cache` +//! and a strong ref in the `PerTimelineState`. +//! A strong ref is returned wrapped in a `Handle`. //! //! For subsequent requests, `Cache::get` will perform a "fast path" shard routing -//! and find the `Weak` in the cache. -//! We upgrade the `Weak` to an `Arc` and wrap it in the user-facing `Handle` type. +//! and find the weak ref in the cache. +//! We upgrade the weak ref to a strong ref and return it wrapped in a `Handle`. //! -//! The request handler dispatches the request to the right `>::$request_method`. +//! The pagestream processing is pipelined and involves a batching step. +//! While a request is batching, the `Handle` is downgraded to a `WeakHandle`. +//! When the batch is ready to be executed, the `WeakHandle` is upgraded back to a `Handle` +//! and the request handler dispatches the request to the right `>::$request_method`. //! It then drops the `Handle`, which drops the `Arc`. //! -//! # Memory Management / How The Reference Cycle Is Broken +//! # Performance //! -//! The attentive reader may have noticed the strong reference cycle -//! from `Arc` to `PerTimelineState` to `Arc`. +//! Remember from the introductory section: //! -//! This cycle is intentional: while it exists, the `Cache` can upgrade its -//! `Weak` to an `Arc` in a single atomic operation. +//! > However, we want to avoid the overhead of entering the gate for every +//! > method invocation. +//! +//! Why do we want to avoid that? +//! Because the gate is a shared location in memory and entering it involves +//! bumping refcounts, which leads to cache contention if done frequently +//! from multiple cores in parallel. +//! +//! 
So, we only acquire the `GateGuard` once on `Cache` miss, and wrap it in an `Arc`. +//! That `Arc` is private to the `HandleInner` and hence to the connection. +//! (Review the "Data Structures" section if that is unclear to you.) +//! +//! A `WeakHandle` is a weak ref to the `HandleInner`. +//! When upgrading a `WeakHandle`, we upgrade to a strong ref to the `HandleInner` and +//! further acquire an additional strong ref to the `Arc` inside it. +//! Again, this manipulation of ref counts is is cheap because `Arc` is private to the connection. +//! +//! When downgrading a `Handle` to a `WeakHandle`, we drop the `Arc`. +//! Again, this is cheap because the `Arc` is private to the connection. +//! +//! In addition to the GateGuard, we need to provide `Deref` impl. +//! For this, both `Handle` need infallible access to an `Arc`. +//! We could clone the `Arc` when upgrading a `WeakHandle`, but that would cause contention +//! on the shared memory location that trakcs the refcount of the `Arc`. +//! Instead, we wrap the `Arc` into another `Arc`. +//! so that we can clone it cheaply when upgrading a `WeakHandle`. +//! +//! # Shutdown +//! +//! The attentive reader may have noticed the following reference cycle around the `Arc`: +//! +//! ```text +//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> Timeline +//! ``` +//! +//! Further, there is this cycle: +//! +//! ```text +//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> GateGuard --keepalive--> Timeline +//! ``` +//! +//! The former cycle is a memory leak if not broken. +//! The latter cycle further prevents the Timeline from shutting down +//! because we certainly won't drop the Timeline while the GateGuard is alive. +//! Preventing shutdown is the whole point of this handle/cache system, +//! but when the Timeline needs to shut down, we need to break the cycle. //! //! The cycle is broken by either -//! - `PerTimelineState::shutdown` or -//! - dropping the `Cache`. +//! - Timeline shutdown (=> `PerTimelineState::shutdown`) +//! - Connection shutdown (=> dropping the `Cache`). //! -//! Concurrently existing `Handle`s will extend the existence of the cycle. +//! Both transition the `HandleInner` from [`HandleInner::KeepingTimelineGateOpen`] to +//! [`HandleInner::ShutDown`], which drops the only long-lived strong ref to the +//! `Arc`. +//! +//! `PerTimelineState::shutdown` drops all the `HandleInners` it contains, +//! thereby breaking the cycle. +//! It also initiates draining of already existing `Handle`s by +//! poisoning things so that no new `HandleInner`'s can be added +//! to the `PerTimelineState`, which will make subsequent `Cache::get` fail. +//! +//! Concurrently existing / already upgraded `Handle`s will extend the +//! lifetime of the `Arc>` and hence cycles. //! However, since `Handle`s are short-lived and new `Handle`s are not -//! handed out after either `PerTimelineState::shutdown` or `Cache` drop, -//! that extension of the cycle is bounded. +//! handed out from `Cache::get` or `WeakHandle::upgrade` after +//! `PerTimelineState::shutdown`, that extension of the cycle is bounded. +//! +//! Concurrently existing `WeakHandle`s will fail to `upgrade()`: +//! while they will succeed in upgrading `Weak>`, +//! they will find the inner in state `HandleInner::ShutDown` state where the +//! `Arc` and Timeline has already been dropped. +//! +//! Dropping the `Cache` undoes the registration of this `Cache`'s +//! `HandleInner`s from all the `PerTimelineState`s, i.e., it +//! 
removes the strong ref to each of its `HandleInner`s +//! from all the `PerTimelineState`. +//! +//! # Locking Rules +//! +//! To prevent deadlocks we: +//! +//! 1. Only ever hold one of the locks at a time. +//! 2. Don't add more than one Drop impl that locks on the +//! cycles above. +//! +//! As per (2), that impl is in `Drop for Cache`. //! //! # Fast Path for Shard Routing //! //! The `Cache` has a fast path for shard routing to avoid calling into //! the tenant manager for every request. //! -//! The `Cache` maintains a hash map of `ShardTimelineId` to `Weak`. +//! The `Cache` maintains a hash map of `ShardTimelineId` to `WeakHandle`s. //! //! The current implementation uses the first entry in the hash map //! to determine the `ShardParameters` and derive the correct @@ -87,18 +184,18 @@ //! //! It then looks up the hash map for that `ShardTimelineId := {ShardIndex,TimelineId}`. //! -//! If the lookup is successful and the `Weak` can be upgraded, +//! If the lookup is successful and the `WeakHandle` can be upgraded, //! it's a hit. //! //! ## Cache invalidation //! -//! The insight is that cache invalidation is sufficient and most efficiently done lazily. +//! The insight is that cache invalidation is sufficient and most efficiently if done lazily. //! The only reasons why an entry in the cache can become stale are: //! 1. The `PerTimelineState` / Timeline is shutting down e.g. because the shard is //! being detached, timeline or shard deleted, or pageserver is shutting down. //! 2. We're doing a shard split and new traffic should be routed to the child shards. //! -//! Regarding (1), we will eventually fail to upgrade the `Weak` once the +//! Regarding (1), we will eventually fail to upgrade the `WeakHandle` once the //! timeline has shut down, and when that happens, we remove the entry from the cache. //! //! Regarding (2), the insight is that it is toally fine to keep dispatching requests @@ -107,8 +204,6 @@ use std::collections::hash_map; use std::collections::HashMap; -use std::sync::atomic::AtomicBool; -use std::sync::atomic::Ordering; use std::sync::Arc; use std::sync::Mutex; use std::sync::Weak; @@ -152,7 +247,7 @@ pub(crate) struct Cache { map: Map, } -type Map = HashMap>>; +type Map = HashMap>; impl Default for Cache { fn default() -> Self { @@ -170,12 +265,22 @@ pub(crate) struct ShardTimelineId { } /// See module-level comment. -pub(crate) struct Handle(Arc>); -struct HandleInner { - shut_down: AtomicBool, - timeline: T::Timeline, - // The timeline's gate held open. - _gate_guard: utils::sync::gate::GateGuard, +pub(crate) struct Handle { + timeline: Arc, + #[allow(dead_code)] // the field exists to keep the gate open + gate_guard: Arc, + inner: Arc>>, +} +pub(crate) struct WeakHandle { + inner: Weak>>, +} +enum HandleInner { + KeepingTimelineGateOpen { + #[allow(dead_code)] + gate_guard: Arc, + timeline: Arc, + }, + ShutDown, } /// Embedded in each [`Types::Timeline`] as the anchor for the only long-lived strong ref to `HandleInner`. @@ -183,7 +288,8 @@ struct HandleInner { /// See module-level comment for details. 
pub struct PerTimelineState { // None = shutting down - handles: Mutex>>>>, + #[allow(clippy::type_complexity)] + handles: Mutex>>>>>, } impl Default for PerTimelineState { @@ -243,49 +349,24 @@ impl Cache { shard_selector: ShardSelector, tenant_manager: &T::TenantManager, ) -> Result, GetError> { - // terminates because each iteration removes an element from the map - loop { - let handle = self - .get_impl(timeline_id, shard_selector, tenant_manager) - .await?; - if handle.0.shut_down.load(Ordering::Relaxed) { - let removed = self - .map - .remove(&handle.0.timeline.shard_timeline_id()) - .expect("invariant of get_impl is that the returned handle is in the map"); - assert!( - Weak::ptr_eq(&removed, &Arc::downgrade(&handle.0)), - "shard_timeline_id() incorrect?" - ); - } else { - return Ok(handle); - } - } - } - - #[instrument(level = "trace", skip_all)] - async fn get_impl( - &mut self, - timeline_id: TimelineId, - shard_selector: ShardSelector, - tenant_manager: &T::TenantManager, - ) -> Result, GetError> { - let miss: ShardSelector = { + // terminates because when every iteration we remove an element from the map + let miss: ShardSelector = loop { let routing_state = self.shard_routing(timeline_id, shard_selector); match routing_state { RoutingResult::FastPath(handle) => return Ok(handle), RoutingResult::SlowPath(key) => match self.map.get(&key) { Some(cached) => match cached.upgrade() { - Some(upgraded) => return Ok(Handle(upgraded)), - None => { + Ok(upgraded) => return Ok(upgraded), + Err(HandleUpgradeError::ShutDown) => { + // TODO: dedup with shard_routing() trace!("handle cache stale"); self.map.remove(&key).unwrap(); - ShardSelector::Known(key.shard_index) + continue; } }, - None => ShardSelector::Known(key.shard_index), + None => break ShardSelector::Known(key.shard_index), }, - RoutingResult::NeedConsultTenantManager => shard_selector, + RoutingResult::NeedConsultTenantManager => break shard_selector, } }; self.get_miss(timeline_id, miss, tenant_manager).await @@ -302,7 +383,7 @@ impl Cache { let Some((first_key, first_handle)) = self.map.iter().next() else { return RoutingResult::NeedConsultTenantManager; }; - let Some(first_handle) = first_handle.upgrade() else { + let Ok(first_handle) = first_handle.upgrade() else { // TODO: dedup with get() trace!("handle cache stale"); let first_key_owned = *first_key; @@ -310,7 +391,7 @@ impl Cache { continue; }; - let first_handle_shard_identity = first_handle.timeline.get_shard_identity(); + let first_handle_shard_identity = first_handle.get_shard_identity(); let make_shard_index = |shard_num: ShardNumber| ShardIndex { shard_number: shard_num, shard_count: first_handle_shard_identity.count, @@ -329,11 +410,11 @@ impl Cache { }; let first_handle_shard_timeline_id = ShardTimelineId { shard_index: first_handle_shard_identity.shard_index(), - timeline_id: first_handle.timeline.shard_timeline_id().timeline_id, + timeline_id: first_handle.shard_timeline_id().timeline_id, }; if need_shard_timeline_id == first_handle_shard_timeline_id { - return RoutingResult::FastPath(Handle(first_handle)); + return RoutingResult::FastPath(first_handle); } else { return RoutingResult::SlowPath(need_shard_timeline_id); } @@ -357,23 +438,30 @@ impl Cache { ShardSelector::Known(idx) => assert_eq!(idx, &key.shard_index), } - let gate_guard = match timeline.gate().enter() { - Ok(guard) => guard, - Err(_) => { - return Err(GetError::TimelineGateClosed); - } - }; trace!("creating new HandleInner"); - let handle = Arc::new( - // TODO: global metric that keeps track of 
the number of live HandlerTimeline instances - // so we can identify reference cycle bugs. - HandleInner { - shut_down: AtomicBool::new(false), - _gate_guard: gate_guard, - timeline: timeline.clone(), - }, - ); - let handle = { + let handle_inner_arc = Arc::new(Mutex::new(HandleInner::KeepingTimelineGateOpen { + gate_guard: Arc::new( + // this enter() is expensive in production code because + // it hits the global Arc::gate refcounts + match timeline.gate().enter() { + Ok(guard) => guard, + Err(_) => { + return Err(GetError::TimelineGateClosed); + } + }, + ), + // this clone is expensive in production code because + // it hits the global Arc::clone refcounts + timeline: Arc::new(timeline.clone()), + })); + let handle_weak = WeakHandle { + inner: Arc::downgrade(&handle_inner_arc), + }; + let handle = handle_weak + .upgrade() + .ok() + .expect("we just created it and it's not linked anywhere yet"); + { let mut lock_guard = timeline .per_timeline_state() .handles @@ -381,7 +469,8 @@ impl Cache { .expect("mutex poisoned"); match &mut *lock_guard { Some(per_timeline_state) => { - let replaced = per_timeline_state.insert(self.id, Arc::clone(&handle)); + let replaced = + per_timeline_state.insert(self.id, Arc::clone(&handle_inner_arc)); assert!(replaced.is_none(), "some earlier code left a stale handle"); match self.map.entry(key) { hash_map::Entry::Occupied(_o) => { @@ -392,8 +481,7 @@ impl Cache { unreachable!() } hash_map::Entry::Vacant(v) => { - v.insert(Arc::downgrade(&handle)); - handle + v.insert(handle_weak); } } } @@ -401,14 +489,62 @@ impl Cache { return Err(GetError::PerTimelineStateShutDown); } } - }; - Ok(Handle(handle)) + } + Ok(handle) } Err(e) => Err(GetError::TenantManager(e)), } } } +pub(crate) enum HandleUpgradeError { + ShutDown, +} + +impl WeakHandle { + pub(crate) fn upgrade(&self) -> Result, HandleUpgradeError> { + let Some(inner) = Weak::upgrade(&self.inner) else { + return Err(HandleUpgradeError::ShutDown); + }; + let lock_guard = inner.lock().expect("poisoned"); + match &*lock_guard { + HandleInner::KeepingTimelineGateOpen { + timeline, + gate_guard, + } => { + let gate_guard = Arc::clone(gate_guard); + let timeline = Arc::clone(timeline); + drop(lock_guard); + Ok(Handle { + timeline, + gate_guard, + inner, + }) + } + HandleInner::ShutDown => Err(HandleUpgradeError::ShutDown), + } + } + + pub(crate) fn is_same_handle_as(&self, other: &WeakHandle) -> bool { + Weak::ptr_eq(&self.inner, &other.inner) + } +} + +impl std::ops::Deref for Handle { + type Target = T::Timeline; + fn deref(&self) -> &Self::Target { + &self.timeline + } +} + +impl Handle { + pub(crate) fn downgrade(&self) -> WeakHandle { + WeakHandle { + inner: Arc::downgrade(&self.inner), + } + } +} + impl PerTimelineState { /// After this method returns, [`Cache::get`] will never again return a [`Handle`] /// to the [`Types::Timeline`] that embeds this per-timeline state. @@ -430,43 +566,54 @@ impl PerTimelineState { trace!("already shut down"); return; }; - for handle in handles.values() { + for handle_inner_arc in handles.values() { // Make hits fail. 
- handle.shut_down.store(true, Ordering::Relaxed); + let mut lock_guard = handle_inner_arc.lock().expect("poisoned"); + lock_guard.shutdown(); } drop(handles); } } -impl std::ops::Deref for Handle { - type Target = T::Timeline; - fn deref(&self) -> &Self::Target { - &self.0.timeline - } -} - -#[cfg(test)] -impl Drop for HandleInner { - fn drop(&mut self) { - trace!("HandleInner dropped"); - } -} - // When dropping a [`Cache`], prune its handles in the [`PerTimelineState`] to break the reference cycle. impl Drop for Cache { fn drop(&mut self) { - for (_, weak) in self.map.drain() { - if let Some(strong) = weak.upgrade() { - // handle is still being kept alive in PerTimelineState - let timeline = strong.timeline.per_timeline_state(); - let mut handles = timeline.handles.lock().expect("mutex poisoned"); - if let Some(handles) = &mut *handles { - let Some(removed) = handles.remove(&self.id) else { - // There could have been a shutdown inbetween us upgrading the weak and locking the mutex. - continue; - }; - assert!(Arc::ptr_eq(&removed, &strong)); - } + for ( + _, + WeakHandle { + inner: handle_inner_weak, + }, + ) in self.map.drain() + { + let Some(handle_inner_arc) = handle_inner_weak.upgrade() else { + continue; + }; + let handle_timeline = handle_inner_arc + // locking rules: drop lock before acquiring other lock below + .lock() + .expect("poisoned") + .shutdown(); + let per_timeline_state = handle_timeline.per_timeline_state(); + let mut handles_lock_guard = per_timeline_state.handles.lock().expect("mutex poisoned"); + let Some(handles) = &mut *handles_lock_guard else { + continue; + }; + let Some(removed_handle_inner_arc) = handles.remove(&self.id) else { + // There could have been a shutdown inbetween us upgrading the weak and locking the mutex. + continue; + }; + drop(handles_lock_guard); // locking rules: remember them when! + assert!(Arc::ptr_eq(&removed_handle_inner_arc, &handle_inner_arc,)); + } + } +} + +impl HandleInner { + fn shutdown(&mut self) -> Arc { + match std::mem::replace(self, HandleInner::ShutDown) { + HandleInner::KeepingTimelineGateOpen { timeline, .. 
} => timeline, + HandleInner::ShutDown => { + unreachable!("handles are only shut down once in their lifetime"); } } } @@ -474,6 +621,8 @@ impl Drop for Cache { #[cfg(test)] mod tests { + use std::sync::Weak; + use pageserver_api::{ key::{rel_block_to_key, Key, DBDIR_KEY}, models::ShardParameters, @@ -583,39 +732,13 @@ mod tests { // // fill the cache // - assert_eq!( - (Arc::strong_count(&shard0), Arc::weak_count(&shard0)), - (2, 1), - "strong: shard0, mgr; weak: myself" - ); - let handle: Handle<_> = cache .get(timeline_id, ShardSelector::Page(key), &mgr) .await .expect("we have the timeline"); - let handle_inner_weak = Arc::downgrade(&handle.0); assert!(Weak::ptr_eq(&handle.myself, &shard0.myself)); - assert_eq!( - ( - Weak::strong_count(&handle_inner_weak), - Weak::weak_count(&handle_inner_weak) - ), - (2, 2), - "strong: handle, per_timeline_state, weak: handle_inner_weak, cache" - ); assert_eq!(cache.map.len(), 1); - - assert_eq!( - (Arc::strong_count(&shard0), Arc::weak_count(&shard0)), - (3, 1), - "strong: handleinner(per_timeline_state), shard0, mgr; weak: myself" - ); drop(handle); - assert_eq!( - (Arc::strong_count(&shard0), Arc::weak_count(&shard0)), - (3, 1), - "strong: handleinner(per_timeline_state), shard0, mgr; weak: myself" - ); // // demonstrate that Handle holds up gate closure @@ -640,21 +763,11 @@ mod tests { // SHUTDOWN shard0.per_timeline_state.shutdown(); // keeping handle alive across shutdown - assert_eq!( - 1, - Weak::strong_count(&handle_inner_weak), - "through local var handle" - ); assert_eq!( cache.map.len(), 1, "this is an implementation detail but worth pointing out: we can't clear the cache from shutdown(), it's cleared on first access after" ); - assert_eq!( - (Arc::strong_count(&shard0), Arc::weak_count(&shard0)), - (3, 1), - "strong: handleinner(via handle), shard0, mgr; weak: myself" - ); // this handle is perfectly usable handle.getpage(); @@ -678,16 +791,6 @@ mod tests { } drop(handle); - assert_eq!( - 0, - Weak::strong_count(&handle_inner_weak), - "the HandleInner destructor already ran" - ); - assert_eq!( - (Arc::strong_count(&shard0), Arc::weak_count(&shard0)), - (2, 1), - "strong: shard0, mgr; weak: myself" - ); // closing gate succeeds after dropping handle tokio::select! { @@ -706,10 +809,8 @@ mod tests { assert_eq!(cache.map.len(), 0); // ensure all refs to shard0 are gone and we're not leaking anything - let myself = Weak::clone(&shard0.myself); drop(shard0); drop(mgr); - assert_eq!(Weak::strong_count(&myself), 0); } #[tokio::test] @@ -948,15 +1049,11 @@ mod tests { handle }; handle.getpage(); - used_handles.push(Arc::downgrade(&handle.0)); + used_handles.push(Arc::downgrade(&handle.timeline)); } - // No handles exist, thus gates are closed and don't require shutdown - assert!(used_handles - .iter() - .all(|weak| Weak::strong_count(weak) == 0)); - - // ... thus the gate should close immediately, even without shutdown + // No handles exist, thus gates are closed and don't require shutdown. + // Thus the gate should close immediately, even without shutdown. tokio::select! 
{ _ = shard0.gate.close() => { } _ = tokio::time::sleep(FOREVER) => { @@ -964,4 +1061,75 @@ mod tests { } } } + + #[tokio::test(start_paused = true)] + async fn test_weak_handles() { + crate::tenant::harness::setup_logging(); + let timeline_id = TimelineId::generate(); + let shard0 = Arc::new_cyclic(|myself| StubTimeline { + gate: Default::default(), + id: timeline_id, + shard: ShardIdentity::unsharded(), + per_timeline_state: PerTimelineState::default(), + myself: myself.clone(), + }); + let mgr = StubManager { + shards: vec![shard0.clone()], + }; + + let refcount_start = Arc::strong_count(&shard0); + + let key = DBDIR_KEY; + + let mut cache = Cache::::default(); + + let handle = cache + .get(timeline_id, ShardSelector::Page(key), &mgr) + .await + .expect("we have the timeline"); + assert!(Weak::ptr_eq(&handle.myself, &shard0.myself)); + + let weak_handle = handle.downgrade(); + + drop(handle); + + let upgraded_handle = weak_handle.upgrade().ok().expect("we can upgrade it"); + + // Start shutdown + shard0.per_timeline_state.shutdown(); + + // Upgrades during shutdown don't work, even if upgraded_handle exists. + weak_handle + .upgrade() + .err() + .expect("can't upgrade weak handle as soon as shutdown started"); + + // But upgraded_handle is still alive, so the gate won't close. + tokio::select! { + _ = shard0.gate.close() => { + panic!("handle is keeping gate open"); + } + _ = tokio::time::sleep(FOREVER) => { } + } + + // Drop the last handle. + drop(upgraded_handle); + + // The gate should close now, despite there still being a weak_handle. + tokio::select! { + _ = shard0.gate.close() => { } + _ = tokio::time::sleep(FOREVER) => { + panic!("only strong handle is dropped and we shut down per-timeline-state") + } + } + + // The weak handle still can't be upgraded. + weak_handle + .upgrade() + .err() + .expect("still shouldn't be able to upgrade the weak handle"); + + // There should be no strong references to the timeline object except the one on "stack". + assert_eq!(Arc::strong_count(&shard0), refcount_start); + } } diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index b751235595..7b748d7252 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -34,6 +34,8 @@ typedef enum T_NeonGetPageRequest, T_NeonDbSizeRequest, T_NeonGetSlruSegmentRequest, + /* future tags above this line */ + T_NeonTestRequest = 99, /* only in cfg(feature = "testing") */ /* pagestore -> pagestore_client */ T_NeonExistsResponse = 100, @@ -42,6 +44,8 @@ typedef enum T_NeonErrorResponse, T_NeonDbSizeResponse, T_NeonGetSlruSegmentResponse, + /* future tags above this line */ + T_NeonTestResponse = 199, /* only in cfg(feature = "testing") */ } NeonMessageTag; typedef uint64 NeonRequestId; diff --git a/test_runner/regress/test_page_service_batching_regressions.py b/test_runner/regress/test_page_service_batching_regressions.py new file mode 100644 index 0000000000..fa85e1210b --- /dev/null +++ b/test_runner/regress/test_page_service_batching_regressions.py @@ -0,0 +1,60 @@ +# NB: there are benchmarks that double-serve as tests inside the `performance` directory. 
+ +import subprocess +from pathlib import Path + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder + + +@pytest.mark.timeout(30) # test takes <20s if pageserver impl is correct +@pytest.mark.parametrize("kind", ["pageserver-stop", "tenant-detach"]) +def test_slow_flush(neon_env_builder: NeonEnvBuilder, neon_binpath: Path, kind: str): + def patch_pageserver_toml(config): + config["page_service_pipelining"] = { + "mode": "pipelined", + "max_batch_size": 32, + "execution": "concurrent-futures", + } + + neon_env_builder.pageserver_config_override = patch_pageserver_toml + env = neon_env_builder.init_start() + + log.info("make flush appear slow") + + log.info("sending requests until pageserver accepts no more") + # TODO: extract this into a helper, like subprocess_capture, + # so that we capture the stderr from the helper somewhere. + child = subprocess.Popen( + [ + neon_binpath / "test_helper_slow_client_reads", + env.pageserver.connstr(), + str(env.initial_tenant), + str(env.initial_timeline), + ], + bufsize=0, # unbuffered + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + ) + assert child.stdout is not None + buf = child.stdout.read(1) + if len(buf) != 1: + raise Exception("unexpected EOF") + if buf != b"R": + raise Exception(f"unexpected data: {buf!r}") + log.info("helper reports pageserver accepts no more requests") + log.info( + "assuming pageserver connection handle is in a state where TCP has backpressured pageserver=>client response flush() into userspace" + ) + + if kind == "pageserver-stop": + log.info("try to shut down the pageserver cleanly") + env.pageserver.stop() + elif kind == "tenant-detach": + log.info("try to shut down the tenant") + env.pageserver.tenant_detach(env.initial_tenant) + else: + raise ValueError(f"unexpected kind: {kind}") + + log.info("shutdown did not time out, test passed") From 871e8b325f1509c0ec5cba03537297847345c02e Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Thu, 16 Jan 2025 14:46:53 -0600 Subject: [PATCH 216/583] Use the request ID given by the control plane in compute_ctl (#10418) Instead of generating our own request ID, we can just use the one provided by the control plane. In the event, we get a request from a client which doesn't set X-Request-ID, then we just generate one which is useful for tracing purposes. 
Signed-off-by: Tristan Partin --- Cargo.lock | 1 + compute_tools/Cargo.toml | 1 + compute_tools/src/http/server.rs | 44 ++++++++++++-------------------- 3 files changed, 19 insertions(+), 27 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3f184ebe0b..02b02a09c1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1312,6 +1312,7 @@ dependencies = [ "tracing-utils", "url", "utils", + "uuid", "vm_monitor", "workspace_hack", "zstd", diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 33892813c4..b04f364cbb 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -51,6 +51,7 @@ tracing-subscriber.workspace = true tracing-utils.workspace = true thiserror.workspace = true url.workspace = true +uuid.workspace = true prometheus.workspace = true postgres_initdb.workspace = true diff --git a/compute_tools/src/http/server.rs b/compute_tools/src/http/server.rs index 33d4b489a0..40fb1f4b4d 100644 --- a/compute_tools/src/http/server.rs +++ b/compute_tools/src/http/server.rs @@ -1,15 +1,14 @@ use std::{ net::{IpAddr, Ipv6Addr, SocketAddr}, - sync::{ - atomic::{AtomicU64, Ordering}, - Arc, - }, + sync::Arc, thread, time::Duration, }; use anyhow::Result; use axum::{ + extract::Request, + middleware::{self, Next}, response::{IntoResponse, Response}, routing::{get, post}, Router, @@ -17,11 +16,9 @@ use axum::{ use http::StatusCode; use tokio::net::TcpListener; use tower::ServiceBuilder; -use tower_http::{ - request_id::{MakeRequestId, PropagateRequestIdLayer, RequestId, SetRequestIdLayer}, - trace::TraceLayer, -}; +use tower_http::{request_id::PropagateRequestIdLayer, trace::TraceLayer}; use tracing::{debug, error, info, Span}; +use uuid::Uuid; use super::routes::{ check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions, @@ -34,30 +31,24 @@ async fn handle_404() -> Response { StatusCode::NOT_FOUND.into_response() } -#[derive(Clone, Default)] -struct ComputeMakeRequestId(Arc); +const X_REQUEST_ID: &str = "x-request-id"; -impl MakeRequestId for ComputeMakeRequestId { - fn make_request_id( - &mut self, - _request: &http::Request, - ) -> Option { - let request_id = self - .0 - .fetch_add(1, Ordering::SeqCst) - .to_string() - .parse() - .unwrap(); +/// This middleware function allows compute_ctl to generate its own request ID +/// if one isn't supplied. The control plane will always send one as a UUID. The +/// neon Postgres extension on the other hand does not send one. +async fn maybe_add_request_id_header(mut request: Request, next: Next) -> Response { + let headers = request.headers_mut(); - Some(RequestId::new(request_id)) + if headers.get(X_REQUEST_ID).is_none() { + headers.append(X_REQUEST_ID, Uuid::new_v4().to_string().parse().unwrap()); } + + next.run(request).await } /// Run the HTTP server and wait on it forever. 
 #[tokio::main]
 async fn serve(port: u16, compute: Arc) {
-    const X_REQUEST_ID: &str = "x-request-id";
-
     let mut app = Router::new()
         .route("/check_writability", post(check_writability::is_writable))
         .route("/configure", post(configure::configure))
@@ -82,9 +73,8 @@ async fn serve(port: u16, compute: Arc) {
         .fallback(handle_404)
         .layer(
             ServiceBuilder::new()
-                .layer(SetRequestIdLayer::x_request_id(
-                    ComputeMakeRequestId::default(),
-                ))
+                // Add this middleware since we assume the request ID exists
+                .layer(middleware::from_fn(maybe_add_request_id_header))
                 .layer(
                     TraceLayer::new_for_http()
                         .on_request(|request: &http::Request<_>, _span: &Span| {

From 053abff71f41a2d3eefaef4c94ac7f65b6956c47 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?JC=20Gr=C3=BCnhage?=
Date: Fri, 17 Jan 2025 15:21:30 +0100
Subject: [PATCH 217/583] Fix dependency on neon-image in promote-images-dev (#10437)

## Problem

871e8b325f1509c0ec5cba03537297847345c02e failed CI on main because a job
ran too soon. This was caused by ea84ec357fa4caa5a48ec65a0aab9e37d1a9fda4.
While `promote-images-dev` does not inherently need `neon-image`, a few
jobs depending on `promote-images-dev` do need it, and previously had it
when it was `promote-images`, which depended on `test-images`, which in
turn depended on `neon-image`.

## Summary of changes

To ensure that jobs depending on `docker.io/neondatabase/neon` images get
them, `promote-images-dev` gets its dependency on `neon-image` back, which
it previously had transitively through `test-images`.
---
 .github/workflows/build_and_test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 9ec5273af7..b0e07535b3 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -824,7 +824,7 @@ jobs:
           docker compose -f ./docker-compose/docker-compose.yml down
 
   promote-images-dev:
-    needs: [ check-permissions, tag, vm-compute-node-image ]
+    needs: [ check-permissions, tag, vm-compute-node-image, neon-image ]
     runs-on: ubuntu-22.04
 
     permissions:

From 6975228a766bc2e5df36559a49fee0ef3417283a Mon Sep 17 00:00:00 2001
From: Vlad Lazar
Date: Fri, 17 Jan 2025 14:51:33 +0000
Subject: [PATCH 218/583] pageserver: add initdb metrics (#10434)

## Problem

Initdb observability is poor.

## Summary of changes

Add some metrics so we can figure out which part, if any, is slow.
Closes https://github.com/neondatabase/neon/issues/10423 --- pageserver/src/metrics.rs | 26 ++++++++++++++++++++++++++ pageserver/src/tenant.rs | 14 +++++++++++++- 2 files changed, 39 insertions(+), 1 deletion(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 3c4830e3cd..4758aaf230 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -100,6 +100,32 @@ pub(crate) static VEC_READ_NUM_LAYERS_VISITED: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +pub(crate) static CONCURRENT_INITDBS: Lazy = Lazy::new(|| { + register_uint_gauge!( + "pageserver_concurrent_initdb", + "Number of initdb processes running" + ) + .expect("failed to define a metric") +}); + +pub(crate) static INITDB_SEMAPHORE_ACQUISITION_TIME: Lazy = Lazy::new(|| { + register_histogram!( + "pageserver_initdb_semaphore_seconds_global", + "Time spent getting a permit from the global initdb semaphore", + STORAGE_OP_BUCKETS.into() + ) + .expect("failed to define metric") +}); + +pub(crate) static INITDB_RUN_TIME: Lazy = Lazy::new(|| { + register_histogram!( + "pageserver_initdb_seconds_global", + "Time spent performing initdb", + STORAGE_OP_BUCKETS.into() + ) + .expect("failed to define metric") +}); + // Metrics collected on operations on the storage repository. #[derive( Clone, Copy, enum_map::Enum, strum_macros::EnumString, strum_macros::Display, IntoStaticStr, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index f6d758ad22..bb1b36aed6 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -95,6 +95,9 @@ use crate::deletion_queue::DeletionQueueError; use crate::import_datadir; use crate::is_uninit_mark; use crate::l0_flush::L0FlushGlobalState; +use crate::metrics::CONCURRENT_INITDBS; +use crate::metrics::INITDB_RUN_TIME; +use crate::metrics::INITDB_SEMAPHORE_ACQUISITION_TIME; use crate::metrics::TENANT; use crate::metrics::{ remove_tenant_metrics, BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN, @@ -5347,8 +5350,17 @@ async fn run_initdb( initdb_bin_path, initdb_target_dir, initdb_lib_dir, ); - let _permit = INIT_DB_SEMAPHORE.acquire().await; + let _permit = { + let _timer = INITDB_SEMAPHORE_ACQUISITION_TIME.start_timer(); + INIT_DB_SEMAPHORE.acquire().await + }; + CONCURRENT_INITDBS.inc(); + scopeguard::defer! { + CONCURRENT_INITDBS.dec(); + } + + let _timer = INITDB_RUN_TIME.start_timer(); let res = postgres_initdb::do_run_initdb(postgres_initdb::RunInitdbArgs { superuser: &conf.superuser, locale: &conf.locale, From b0f34099f90cfa08223ed653a7c7460943f34f0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 17 Jan 2025 22:43:52 +0100 Subject: [PATCH 219/583] Add safekeeper utilization endpoint (#10429) Add an endpoint to obtain the utilization of a safekeeper. Future changes to the storage controller can use this endpoint to find the most suitable safekeepers for newly created timelines, analogously to how it's done for pageservers already. Initially we just want to assign by timeline count, then we can iterate from there. 
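For illustration only, here is a rough sketch of how a future storage controller scheduler might consume this endpoint through the client call added in this patch. The helper name, the plain slice of clients, and the `safekeeper_client` crate path are assumptions for the sketch, not part of the change itself:

```rust
// Hypothetical helper, not in this patch: pick the safekeeper with the
// fewest timelines, as reported by the new /v1/utilization endpoint.
use safekeeper_client::mgmt_api::Client;

async fn least_loaded_safekeeper(clients: &[Client]) -> Option<usize> {
    let mut best: Option<(usize, u64)> = None;
    for (i, client) in clients.iter().enumerate() {
        // `utilization()` is the client method added by this patch; it
        // returns `SafekeeperUtilization { timeline_count }`.
        let Ok(util) = client.utilization().await else {
            continue; // skip safekeepers we cannot reach
        };
        if best.map_or(true, |(_, count)| util.timeline_count < count) {
            best = Some((i, util.timeline_count));
        }
    }
    best.map(|(i, _)| i)
}
```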
Part of https://github.com/neondatabase/neon/issues/9011
---
 libs/safekeeper_api/src/models.rs      |  5 +++++
 safekeeper/client/src/mgmt_api.rs      |  5 +++++
 safekeeper/src/http/routes.rs          |  8 ++++++++
 safekeeper/src/timelines_global_map.rs | 15 +++++++++++++++
 4 files changed, 33 insertions(+)

diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs
index b5fa903820..30418b0efd 100644
--- a/libs/safekeeper_api/src/models.rs
+++ b/libs/safekeeper_api/src/models.rs
@@ -277,3 +277,8 @@ pub struct TimelineTermBumpResponse {
     pub previous_term: u64,
     pub current_term: u64,
 }
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct SafekeeperUtilization {
+    pub timeline_count: u64,
+}
diff --git a/safekeeper/client/src/mgmt_api.rs b/safekeeper/client/src/mgmt_api.rs
index f78745043a..5727f32509 100644
--- a/safekeeper/client/src/mgmt_api.rs
+++ b/safekeeper/client/src/mgmt_api.rs
@@ -102,6 +102,11 @@ impl Client {
         self.get(&uri).await
     }
 
+    pub async fn utilization(&self) -> Result {
+        let uri = format!("{}/v1/utilization/", self.mgmt_api_endpoint);
+        self.get(&uri).await
+    }
+
     async fn get(&self, uri: U) -> Result {
         self.request(Method::GET, uri, ()).await
     }
diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs
index 4b9fb9eb67..7ec08ecf9a 100644
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -127,6 +127,13 @@ async fn timeline_create_handler(mut request: Request) -> Result) -> Result, ApiError> {
+    check_permission(&request, None)?;
+    let global_timelines = get_global_timelines(&request);
+    let utilization = global_timelines.get_timeline_counts();
+    json_response(StatusCode::OK, utilization)
+}
+
 /// List all (not deleted) timelines.
 /// Note: it is possible to do the same with debug_dump.
 async fn timeline_list_handler(request: Request) -> Result, ApiError> {
@@ -620,6 +627,7 @@ pub fn make_router(
                 failpoints_handler(r, cancel).await
             })
         })
+        .get("/v1/utilization", |r| request_span(r, utilization_handler))
         .delete("/v1/tenant/:tenant_id", |r| {
             request_span(r, tenant_delete_handler)
         })
diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs
index a701534f65..01c6aff6c3 100644
--- a/safekeeper/src/timelines_global_map.rs
+++ b/safekeeper/src/timelines_global_map.rs
@@ -13,6 +13,7 @@ use anyhow::{bail, Context, Result};
 use camino::Utf8PathBuf;
 use camino_tempfile::Utf8TempDir;
 use safekeeper_api::membership::Configuration;
+use safekeeper_api::models::SafekeeperUtilization;
 use safekeeper_api::ServerInfo;
 use serde::Serialize;
 use std::collections::HashMap;
@@ -416,6 +417,20 @@ impl GlobalTimelines {
            .collect()
    }
 
+    /// Returns statistics about timeline counts
+    pub fn get_timeline_counts(&self) -> SafekeeperUtilization {
+        let global_lock = self.state.lock().unwrap();
+        let timeline_count = global_lock
+            .timelines
+            .values()
+            .filter(|t| match t {
+                GlobalMapTimeline::CreationInProgress => false,
+                GlobalMapTimeline::Timeline(t) => !t.is_cancelled(),
+            })
+            .count() as u64;
+        SafekeeperUtilization { timeline_count }
+    }
+
     /// Returns all timelines belonging to a given tenant. Used for deleting all timelines of a tenant,
     /// and that's why it can return cancelled timelines, to retry deleting them.
fn get_all_for_tenant(&self, tenant_id: TenantId) -> Vec> { From 8bdaee35f3dec86b37bb6b91be57a88a86d9ad33 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 20 Jan 2025 09:20:31 +0000 Subject: [PATCH 220/583] pageserver: safety checks on validity of uploaded indices (#10403) ## Problem Occasionally, we encounter bugs in test environments that can be detected at the point of uploading an index, but we proceed to upload it anyway and leave a tenant in a broken state that's awkward to handle. ## Summary of changes - Validate index when submitting it for upload, so that we can see the issue quickly e.g. in an API invoking compaction - Validate index before executing the upload, so that we have a hard enforcement that any code path that tries to upload an index will not overwrite a valid index with an invalid one. --- .../src/tenant/remote_timeline_client.rs | 6 ++ .../tenant/remote_timeline_client/index.rs | 15 +++++ .../tenant/remote_timeline_client/upload.rs | 4 ++ .../src/tenant/storage_layer/layer/tests.rs | 56 ++++++++++++++++--- pageserver/src/tenant/timeline.rs | 20 ++++++- 5 files changed, 91 insertions(+), 10 deletions(-) diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index a006647785..bcba6d1f62 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -803,6 +803,12 @@ impl RemoteTimelineClient { upload_queue.dirty.metadata.apply(update); + // Defense in depth: if we somehow generated invalid metadata, do not persist it. + upload_queue + .dirty + .validate() + .map_err(|e| anyhow::anyhow!(e))?; + self.schedule_index_upload(upload_queue); Ok(()) diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 244be5bbb7..08e94ae197 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -152,6 +152,21 @@ impl IndexPart { }; is_same_remote_layer_path(name, metadata, name, index_metadata) } + + /// Check for invariants in the index: this is useful when uploading an index to ensure that if + /// we encounter a bug, we do not persist buggy metadata. + pub(crate) fn validate(&self) -> Result<(), String> { + if self.import_pgdata.is_none() + && self.metadata.ancestor_timeline().is_none() + && self.layer_metadata.is_empty() + { + // Unless we're in the middle of a raw pgdata import, or this is a child timeline,the index must + // always have at least one layer. + return Err("Index has no ancestor and no layers".to_string()); + } + + Ok(()) + } } /// Metadata gathered for each of the layer files. 
diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs index e434d24e5f..af4dbbbfb6 100644 --- a/pageserver/src/tenant/remote_timeline_client/upload.rs +++ b/pageserver/src/tenant/remote_timeline_client/upload.rs @@ -40,6 +40,10 @@ pub(crate) async fn upload_index_part( }); pausable_failpoint!("before-upload-index-pausable"); + // Safety: refuse to persist invalid index metadata, to mitigate the impact of any bug that produces this + // (this should never happen) + index_part.validate().map_err(|e| anyhow::anyhow!(e))?; + // FIXME: this error comes too late let serialized = index_part.to_json_bytes()?; let serialized = Bytes::from(serialized); diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs index 36dcc8d805..fcb73ad20d 100644 --- a/pageserver/src/tenant/storage_layer/layer/tests.rs +++ b/pageserver/src/tenant/storage_layer/layer/tests.rs @@ -1,6 +1,6 @@ use std::time::UNIX_EPOCH; -use pageserver_api::key::CONTROLFILE_KEY; +use pageserver_api::key::{Key, CONTROLFILE_KEY}; use tokio::task::JoinSet; use utils::{ completion::{self, Completion}, @@ -9,7 +9,10 @@ use utils::{ use super::failpoints::{Failpoint, FailpointKind}; use super::*; -use crate::{context::DownloadBehavior, tenant::storage_layer::LayerVisibilityHint}; +use crate::{ + context::DownloadBehavior, + tenant::{harness::test_img, storage_layer::LayerVisibilityHint}, +}; use crate::{task_mgr::TaskKind, tenant::harness::TenantHarness}; /// Used in tests to advance a future to wanted await point, and not futher. @@ -31,20 +34,51 @@ async fn smoke_test() { let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Download); + let image_layers = vec![( + Lsn(0x40), + vec![( + Key::from_hex("620000000033333333444444445500000000").unwrap(), + test_img("foo"), + )], + )]; + + // Create a test timeline with one real layer, and one synthetic test layer. The synthetic + // one is only there so that we can GC the real one without leaving the timeline's metadata + // empty, which is an illegal state (see [`IndexPart::validate`]). let timeline = tenant - .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) + .create_test_timeline_with_layers( + TimelineId::generate(), + Lsn(0x10), + 14, + &ctx, + Default::default(), + image_layers, + Lsn(0x100), + ) .await .unwrap(); - let layer = { + // Grab one of the timeline's layers to exercise in the test, and the other layer that is just + // there to avoid the timeline being illegally empty + let (layer, dummy_layer) = { let mut layers = { let layers = timeline.layers.read().await; layers.likely_resident_layers().cloned().collect::>() }; - assert_eq!(layers.len(), 1); + assert_eq!(layers.len(), 2); - layers.swap_remove(0) + layers.sort_by_key(|l| l.layer_desc().get_key_range().start); + let synthetic_layer = layers.pop().unwrap(); + let real_layer = layers.pop().unwrap(); + tracing::info!( + "real_layer={:?} ({}), synthetic_layer={:?} ({})", + real_layer, + real_layer.layer_desc().file_size, + synthetic_layer, + synthetic_layer.layer_desc().file_size + ); + (real_layer, synthetic_layer) }; // all layers created at pageserver are like `layer`, initialized with strong @@ -173,10 +207,13 @@ async fn smoke_test() { let rtc = &timeline.remote_client; + // Simulate GC removing our test layer. 
{ - let layers = &[layer]; let mut g = timeline.layers.write().await; + + let layers = &[layer]; g.open_mut().unwrap().finish_gc_timeline(layers); + // this just updates the remote_physical_size for demonstration purposes rtc.schedule_gc_update(layers).unwrap(); } @@ -191,7 +228,10 @@ async fn smoke_test() { rtc.wait_completion().await.unwrap(); - assert_eq!(rtc.get_remote_physical_size(), 0); + assert_eq!( + rtc.get_remote_physical_size(), + dummy_layer.metadata().file_size + ); assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get()) } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 2ba71416b8..5f4272fb2b 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -5678,9 +5678,17 @@ impl Timeline { info!("force created image layer {}", image_layer.local_path()); { let mut guard = self.layers.write().await; - guard.open_mut().unwrap().force_insert_layer(image_layer); + guard + .open_mut() + .unwrap() + .force_insert_layer(image_layer.clone()); } + // Update remote_timeline_client state to reflect existence of this layer + self.remote_client + .schedule_layer_file_upload(image_layer) + .unwrap(); + Ok(()) } @@ -5731,9 +5739,17 @@ impl Timeline { info!("force created delta layer {}", delta_layer.local_path()); { let mut guard = self.layers.write().await; - guard.open_mut().unwrap().force_insert_layer(delta_layer); + guard + .open_mut() + .unwrap() + .force_insert_layer(delta_layer.clone()); } + // Update remote_timeline_client state to reflect existence of this layer + self.remote_client + .schedule_layer_file_upload(delta_layer) + .unwrap(); + Ok(()) } From 7d761a9d22e0c3ca0e337af1793b65cb4d3f7203 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 20 Jan 2025 09:47:23 +0000 Subject: [PATCH 221/583] storage controller: make chaos less disruptive to AZ locality (#10438) ## Problem Since #9916 , the chaos code is actively fighting the optimizer: tenants tend to be attached in their preferred AZ, so most chaos migrations were moving them to a non-preferred AZ. ## Summary of changes - When picking migrations, prefer to migrate things _toward_ their preferred AZ when possible. Then pick shards to move the other way when necessary. The resulting behavior should be an alternating "back and forth" where the chaos code migrates thiings away from home, and then migrates them back on the next iteration. The side effect will be that the chaos code actively helps to push things into their home AZ. That's not contrary to its purpose though: we mainly just want it to continuously migrate things to exercise migration+notification code. 
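A condensed sketch of the victim-selection order described above, before the full diff. This is not the storage controller's code: it assumes only the `rand` crate (which the real implementation also uses) and reduces each shard to its id plus a flag saying whether it is currently attached outside its preferred AZ.

```rust
use rand::seq::SliceRandom;

/// Sketch of the two-phase pick: first shards whose migration moves them *toward*
/// their home AZ, then random shards to fill the batch (these are the ones that
/// will typically be migrated back on the next chaos iteration).
fn pick_chaos_victims<Id: Copy>(shards: &[(Id, bool)], batch_size: usize) -> Vec<Id> {
    // Phase 1: shards currently attached outside their preferred AZ.
    let mut victims: Vec<Id> = shards
        .iter()
        .filter(|(_, outside_home)| *outside_home)
        .map(|(id, _)| *id)
        .take(batch_size)
        .collect();

    // Phase 2: top up with random picks from the whole tenant set.
    let all_ids: Vec<Id> = shards.iter().map(|(id, _)| *id).collect();
    let remaining = batch_size.saturating_sub(victims.len());
    victims.extend(
        all_ids
            .choose_multiple(&mut rand::thread_rng(), remaining)
            .copied(),
    );
    victims
}
```

The actual implementation below does the same thing directly over the scheduler's map of shards, using `TenantShard::is_attached_outside_preferred_az` for the flag.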
--- .../src/service/chaos_injector.rs | 110 ++++++++++++------ storage_controller/src/tenant_shard.rs | 17 +++ 2 files changed, 93 insertions(+), 34 deletions(-) diff --git a/storage_controller/src/service/chaos_injector.rs b/storage_controller/src/service/chaos_injector.rs index 0e551beaa7..98034421d6 100644 --- a/storage_controller/src/service/chaos_injector.rs +++ b/storage_controller/src/service/chaos_injector.rs @@ -1,11 +1,17 @@ -use std::{sync::Arc, time::Duration}; +use std::{ + collections::{BTreeMap, HashMap}, + sync::Arc, + time::Duration, +}; use pageserver_api::controller_api::ShardSchedulingPolicy; use rand::seq::SliceRandom; use rand::thread_rng; use tokio_util::sync::CancellationToken; +use utils::id::NodeId; +use utils::shard::TenantShardId; -use super::Service; +use super::{Node, Scheduler, Service, TenantShard}; pub struct ChaosInjector { service: Arc, @@ -35,50 +41,86 @@ impl ChaosInjector { } } + /// If a shard has a secondary and attached location, then re-assign the secondary to be + /// attached and the attached to be secondary. + /// + /// Only modifies tenants if they're in Active scheduling policy. + fn maybe_migrate_to_secondary( + &self, + tenant_shard_id: TenantShardId, + nodes: &Arc>, + tenants: &mut BTreeMap, + scheduler: &mut Scheduler, + ) { + let shard = tenants + .get_mut(&tenant_shard_id) + .expect("Held lock between choosing ID and this get"); + + if !matches!(shard.get_scheduling_policy(), ShardSchedulingPolicy::Active) { + // Skip non-active scheduling policies, so that a shard with a policy like Pause can + // be pinned without being disrupted by us. + tracing::info!( + "Skipping shard {tenant_shard_id}: scheduling policy is {:?}", + shard.get_scheduling_policy() + ); + return; + } + + // Pick a secondary to promote + let Some(new_location) = shard + .intent + .get_secondary() + .choose(&mut thread_rng()) + .cloned() + else { + tracing::info!( + "Skipping shard {tenant_shard_id}: no secondary location, can't migrate" + ); + return; + }; + + let Some(old_location) = *shard.intent.get_attached() else { + tracing::info!("Skipping shard {tenant_shard_id}: currently has no attached location"); + return; + }; + + tracing::info!("Injecting chaos: migrate {tenant_shard_id} {old_location}->{new_location}"); + + shard.intent.demote_attached(scheduler, old_location); + shard.intent.promote_attached(scheduler, new_location); + self.service.maybe_reconcile_shard(shard, nodes); + } + async fn inject_chaos(&mut self) { // Pick some shards to interfere with let batch_size = 128; let mut inner = self.service.inner.write().unwrap(); let (nodes, tenants, scheduler) = inner.parts_mut(); let tenant_ids = tenants.keys().cloned().collect::>(); - let victims = tenant_ids.choose_multiple(&mut thread_rng(), batch_size); - for victim in victims { - let shard = tenants - .get_mut(victim) - .expect("Held lock between choosing ID and this get"); - - if !matches!(shard.get_scheduling_policy(), ShardSchedulingPolicy::Active) { - // Skip non-active scheduling policies, so that a shard with a policy like Pause can - // be pinned without being disrupted by us. - tracing::info!( - "Skipping shard {victim}: scheduling policy is {:?}", - shard.get_scheduling_policy() - ); - continue; + // Prefer to migrate tenants that are currently outside their home AZ. 
This avoids the chaos injector + // continuously pushing tenants outside their home AZ: instead, we'll tend to cycle between picking some + // random tenants to move, and then on next chaos iteration moving them back, then picking some new + // random tenants on the next iteration. + let mut victims = Vec::with_capacity(batch_size); + for shard in tenants.values() { + if shard.is_attached_outside_preferred_az(nodes) { + victims.push(shard.tenant_shard_id); } - // Pick a secondary to promote - let Some(new_location) = shard - .intent - .get_secondary() - .choose(&mut thread_rng()) - .cloned() - else { - tracing::info!("Skipping shard {victim}: no secondary location, can't migrate"); - continue; - }; + if victims.len() >= batch_size { + break; + } + } - let Some(old_location) = *shard.intent.get_attached() else { - tracing::info!("Skipping shard {victim}: currently has no attached location"); - continue; - }; + let choose_random = batch_size.saturating_sub(victims.len()); + tracing::info!("Injecting chaos: found {} shards to migrate back to home AZ, picking {choose_random} random shards to migrate", victims.len()); - tracing::info!("Injecting chaos: migrate {victim} {old_location}->{new_location}"); + let random_victims = tenant_ids.choose_multiple(&mut thread_rng(), choose_random); + victims.extend(random_victims); - shard.intent.demote_attached(scheduler, old_location); - shard.intent.promote_attached(scheduler, new_location); - self.service.maybe_reconcile_shard(shard, nodes); + for victim in victims { + self.maybe_migrate_to_secondary(victim, nodes, tenants, scheduler); } } } diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index 79ed628c25..cbc2696b26 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -1793,6 +1793,23 @@ impl TenantShard { } } } + + /// Returns true if the tenant shard is attached to a node that is outside the preferred AZ. + /// + /// If the shard does not have a preferred AZ, returns false. + pub(crate) fn is_attached_outside_preferred_az(&self, nodes: &HashMap) -> bool { + self.intent + .get_attached() + .map(|node_id| { + Some( + nodes + .get(&node_id) + .expect("referenced node exists") + .get_availability_zone_id(), + ) == self.intent.preferred_az_id.as_ref() + }) + .unwrap_or(false) + } } impl Drop for TenantShard { From b312a3c320695a4b528968250225dfbd40af0e2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 20 Jan 2025 13:50:44 +0100 Subject: [PATCH 222/583] Move DeleteTimelineFlow::prepare to separate function and use enum (#10334) It was requested by review in #10305 to use an enum or something like it for distinguishing the different modes instead of two parameters, because two flags allow four combinations, and two of them don't really make sense/ aren't used. follow-up of #10305 --- pageserver/src/tenant/timeline/delete.rs | 149 +++++++++++----------- pageserver/src/tenant/timeline/offload.rs | 12 +- 2 files changed, 81 insertions(+), 80 deletions(-) diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index bdc315d985..3c828c8a9e 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -112,7 +112,7 @@ pub(super) async fn delete_local_timeline_directory( } /// It is important that this gets called when DeletionGuard is being held. 
-/// For more context see comments in [`DeleteTimelineFlow::prepare`] +/// For more context see comments in [`make_timeline_delete_guard`] async fn remove_maybe_offloaded_timeline_from_tenant( tenant: &Tenant, timeline: &TimelineOrOffloaded, @@ -193,10 +193,8 @@ impl DeleteTimelineFlow { ) -> Result<(), DeleteTimelineError> { super::debug_assert_current_span_has_tenant_and_timeline_id(); - let allow_offloaded_children = false; - let set_stopping = true; let (timeline, mut guard) = - Self::prepare(tenant, timeline_id, allow_offloaded_children, set_stopping)?; + make_timeline_delete_guard(tenant, timeline_id, TimelineDeleteGuardKind::Delete)?; guard.mark_in_progress()?; @@ -333,75 +331,6 @@ impl DeleteTimelineFlow { Ok(()) } - pub(super) fn prepare( - tenant: &Tenant, - timeline_id: TimelineId, - allow_offloaded_children: bool, - set_stopping: bool, - ) -> Result<(TimelineOrOffloaded, DeletionGuard), DeleteTimelineError> { - // Note the interaction between this guard and deletion guard. - // Here we attempt to lock deletion guard when we're holding a lock on timelines. - // This is important because when you take into account `remove_timeline_from_tenant` - // we remove timeline from memory when we still hold the deletion guard. - // So here when timeline deletion is finished timeline wont be present in timelines map at all - // which makes the following sequence impossible: - // T1: get preempted right before the try_lock on `Timeline::delete_progress` - // T2: do a full deletion, acquire and drop `Timeline::delete_progress` - // T1: acquire deletion lock, do another `DeleteTimelineFlow::run` - // For more context see this discussion: `https://github.com/neondatabase/neon/pull/4552#discussion_r1253437346` - let timelines = tenant.timelines.lock().unwrap(); - let timelines_offloaded = tenant.timelines_offloaded.lock().unwrap(); - - let timeline = match timelines.get(&timeline_id) { - Some(t) => TimelineOrOffloaded::Timeline(Arc::clone(t)), - None => match timelines_offloaded.get(&timeline_id) { - Some(t) => TimelineOrOffloaded::Offloaded(Arc::clone(t)), - None => return Err(DeleteTimelineError::NotFound), - }, - }; - - // Ensure that there are no child timelines, because we are about to remove files, - // which will break child branches - let mut children = Vec::new(); - if !allow_offloaded_children { - children.extend(timelines_offloaded.iter().filter_map(|(id, entry)| { - (entry.ancestor_timeline_id == Some(timeline_id)).then_some(*id) - })); - } - children.extend(timelines.iter().filter_map(|(id, entry)| { - (entry.get_ancestor_timeline_id() == Some(timeline_id)).then_some(*id) - })); - - if !children.is_empty() { - return Err(DeleteTimelineError::HasChildren(children)); - } - - // Note that using try_lock here is important to avoid a deadlock. - // Here we take lock on timelines and then the deletion guard. - // At the end of the operation we're holding the guard and need to lock timelines map - // to remove the timeline from it. - // Always if you have two locks that are taken in different order this can result in a deadlock. - - let delete_progress = Arc::clone(timeline.delete_progress()); - let delete_lock_guard = match delete_progress.try_lock_owned() { - Ok(guard) => DeletionGuard(guard), - Err(_) => { - // Unfortunately if lock fails arc is consumed. 
- return Err(DeleteTimelineError::AlreadyInProgress(Arc::clone( - timeline.delete_progress(), - ))); - } - }; - - if set_stopping { - if let TimelineOrOffloaded::Timeline(timeline) = &timeline { - timeline.set_state(TimelineState::Stopping); - } - } - - Ok((timeline, delete_lock_guard)) - } - fn schedule_background( guard: DeletionGuard, conf: &'static PageServerConf, @@ -483,6 +412,80 @@ impl DeleteTimelineFlow { } } +#[derive(Copy, Clone, PartialEq, Eq)] +pub(super) enum TimelineDeleteGuardKind { + Offload, + Delete, +} + +pub(super) fn make_timeline_delete_guard( + tenant: &Tenant, + timeline_id: TimelineId, + guard_kind: TimelineDeleteGuardKind, +) -> Result<(TimelineOrOffloaded, DeletionGuard), DeleteTimelineError> { + // Note the interaction between this guard and deletion guard. + // Here we attempt to lock deletion guard when we're holding a lock on timelines. + // This is important because when you take into account `remove_timeline_from_tenant` + // we remove timeline from memory when we still hold the deletion guard. + // So here when timeline deletion is finished timeline wont be present in timelines map at all + // which makes the following sequence impossible: + // T1: get preempted right before the try_lock on `Timeline::delete_progress` + // T2: do a full deletion, acquire and drop `Timeline::delete_progress` + // T1: acquire deletion lock, do another `DeleteTimelineFlow::run` + // For more context see this discussion: `https://github.com/neondatabase/neon/pull/4552#discussion_r1253437346` + let timelines = tenant.timelines.lock().unwrap(); + let timelines_offloaded = tenant.timelines_offloaded.lock().unwrap(); + + let timeline = match timelines.get(&timeline_id) { + Some(t) => TimelineOrOffloaded::Timeline(Arc::clone(t)), + None => match timelines_offloaded.get(&timeline_id) { + Some(t) => TimelineOrOffloaded::Offloaded(Arc::clone(t)), + None => return Err(DeleteTimelineError::NotFound), + }, + }; + + // Ensure that there are no child timelines, because we are about to remove files, + // which will break child branches + let mut children = Vec::new(); + if guard_kind == TimelineDeleteGuardKind::Delete { + children.extend(timelines_offloaded.iter().filter_map(|(id, entry)| { + (entry.ancestor_timeline_id == Some(timeline_id)).then_some(*id) + })); + } + children.extend(timelines.iter().filter_map(|(id, entry)| { + (entry.get_ancestor_timeline_id() == Some(timeline_id)).then_some(*id) + })); + + if !children.is_empty() { + return Err(DeleteTimelineError::HasChildren(children)); + } + + // Note that using try_lock here is important to avoid a deadlock. + // Here we take lock on timelines and then the deletion guard. + // At the end of the operation we're holding the guard and need to lock timelines map + // to remove the timeline from it. + // Always if you have two locks that are taken in different order this can result in a deadlock. + + let delete_progress = Arc::clone(timeline.delete_progress()); + let delete_lock_guard = match delete_progress.try_lock_owned() { + Ok(guard) => DeletionGuard(guard), + Err(_) => { + // Unfortunately if lock fails arc is consumed. 
+ return Err(DeleteTimelineError::AlreadyInProgress(Arc::clone( + timeline.delete_progress(), + ))); + } + }; + + if guard_kind == TimelineDeleteGuardKind::Delete { + if let TimelineOrOffloaded::Timeline(timeline) = &timeline { + timeline.set_state(TimelineState::Stopping); + } + } + + Ok((timeline, delete_lock_guard)) +} + pub(super) struct DeletionGuard(OwnedMutexGuard); impl Deref for DeletionGuard { diff --git a/pageserver/src/tenant/timeline/offload.rs b/pageserver/src/tenant/timeline/offload.rs index 6c6b19e8b1..3b5bf8290c 100644 --- a/pageserver/src/tenant/timeline/offload.rs +++ b/pageserver/src/tenant/timeline/offload.rs @@ -2,10 +2,11 @@ use std::sync::Arc; use pageserver_api::models::{TenantState, TimelineState}; -use super::delete::{delete_local_timeline_directory, DeleteTimelineFlow, DeletionGuard}; +use super::delete::{delete_local_timeline_directory, DeletionGuard}; use super::Timeline; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::remote_timeline_client::ShutdownIfArchivedError; +use crate::tenant::timeline::delete::{make_timeline_delete_guard, TimelineDeleteGuardKind}; use crate::tenant::{OffloadedTimeline, Tenant, TenantManifestError, TimelineOrOffloaded}; #[derive(thiserror::Error, Debug)] @@ -36,13 +37,10 @@ pub(crate) async fn offload_timeline( debug_assert_current_span_has_tenant_and_timeline_id(); tracing::info!("offloading archived timeline"); - let allow_offloaded_children = true; - let set_stopping = false; - let (timeline, guard) = DeleteTimelineFlow::prepare( + let (timeline, guard) = make_timeline_delete_guard( tenant, timeline.timeline_id, - allow_offloaded_children, - set_stopping, + TimelineDeleteGuardKind::Offload, ) .map_err(|e| OffloadError::Other(anyhow::anyhow!(e)))?; @@ -106,7 +104,7 @@ pub(crate) async fn offload_timeline( } /// It is important that this gets called when DeletionGuard is being held. -/// For more context see comments in [`DeleteTimelineFlow::prepare`] +/// For more context see comments in [`make_timeline_delete_guard`] /// /// Returns the strong count of the timeline `Arc` fn remove_timeline_from_tenant( From 02fc58b878d4342c05c084cd7db7a01940a70c3f Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 20 Jan 2025 15:37:24 +0100 Subject: [PATCH 223/583] impr(timeline handles): add more tests covering reference cyle (#10446) The other test focus on the external interface usage while the tests added in this PR add some testing around HandleInner's lifecycle, ensuring we don't leak it once either connection gets dropped or per-timeline-state is shut down explicitly. --- pageserver/src/tenant/timeline/handle.rs | 97 ++++++++++++++++++++++++ 1 file changed, 97 insertions(+) diff --git a/pageserver/src/tenant/timeline/handle.rs b/pageserver/src/tenant/timeline/handle.rs index 35d8c75ce1..4c7bea25be 100644 --- a/pageserver/src/tenant/timeline/handle.rs +++ b/pageserver/src/tenant/timeline/handle.rs @@ -1132,4 +1132,101 @@ mod tests { // There should be no strong references to the timeline object except the one on "stack". 
assert_eq!(Arc::strong_count(&shard0), refcount_start); } + + #[tokio::test(start_paused = true)] + async fn test_reference_cycle_broken_when_cache_is_dropped() { + crate::tenant::harness::setup_logging(); + let timeline_id = TimelineId::generate(); + let shard0 = Arc::new_cyclic(|myself| StubTimeline { + gate: Default::default(), + id: timeline_id, + shard: ShardIdentity::unsharded(), + per_timeline_state: PerTimelineState::default(), + myself: myself.clone(), + }); + let mgr = StubManager { + shards: vec![shard0.clone()], + }; + let key = DBDIR_KEY; + + let mut cache = Cache::::default(); + + // helper to check if a handle is referenced by per_timeline_state + let per_timeline_state_refs_handle = |handle_weak: &Weak>>| { + let per_timeline_state = shard0.per_timeline_state.handles.lock().unwrap(); + let per_timeline_state = per_timeline_state.as_ref().unwrap(); + per_timeline_state + .values() + .any(|v| Weak::ptr_eq(&Arc::downgrade(v), handle_weak)) + }; + + // Fill the cache. + let handle = cache + .get(timeline_id, ShardSelector::Page(key), &mgr) + .await + .expect("we have the timeline"); + assert!(Weak::ptr_eq(&handle.myself, &shard0.myself)); + let handle_inner_weak = Arc::downgrade(&handle.inner); + assert!( + per_timeline_state_refs_handle(&handle_inner_weak), + "we still hold `handle` _and_ haven't dropped `cache` yet" + ); + + // Drop the cache. + drop(cache); + + assert!( + !(per_timeline_state_refs_handle(&handle_inner_weak)), + "nothing should reference the handle allocation anymore" + ); + assert!( + Weak::upgrade(&handle_inner_weak).is_some(), + "the local `handle` still keeps the allocation alive" + ); + // but obviously the cache is gone so no new allocations can be handed out. + + // Drop handle. + drop(handle); + assert!( + Weak::upgrade(&handle_inner_weak).is_none(), + "the local `handle` is dropped, so the allocation should be dropped by now" + ); + } + + #[tokio::test(start_paused = true)] + async fn test_reference_cycle_broken_when_per_timeline_state_shutdown() { + crate::tenant::harness::setup_logging(); + let timeline_id = TimelineId::generate(); + let shard0 = Arc::new_cyclic(|myself| StubTimeline { + gate: Default::default(), + id: timeline_id, + shard: ShardIdentity::unsharded(), + per_timeline_state: PerTimelineState::default(), + myself: myself.clone(), + }); + let mgr = StubManager { + shards: vec![shard0.clone()], + }; + let key = DBDIR_KEY; + + let mut cache = Cache::::default(); + let handle = cache + .get(timeline_id, ShardSelector::Page(key), &mgr) + .await + .expect("we have the timeline"); + // grab a weak reference to the inner so can later try to Weak::upgrade it and assert that fails + let handle_inner_weak = Arc::downgrade(&handle.inner); + + // drop the handle, obviously the lifetime of `inner` is at least as long as each strong reference to it + drop(handle); + assert!(Weak::upgrade(&handle_inner_weak).is_some(), "can still"); + + // Shutdown the per_timeline_state. + shard0.per_timeline_state.shutdown(); + assert!(Weak::upgrade(&handle_inner_weak).is_none(), "can no longer"); + + // cache only contains Weak's, so, it can outlive the per_timeline_state without + // Drop explicitly solely to make this point. 
+ drop(cache); + } } From 2657b7ec7540df3d9060ff2ed15442ed14d7843c Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 20 Jan 2025 17:33:07 +0000 Subject: [PATCH 224/583] rfcs: add sharded ingest RFC (#8754) ## Summary Whereas currently we send all WAL to all pageserver shards, and each shard filters out the data that it needs, in this RFC we add a mechanism to filter the WAL on the safekeeper, so that each shard receives only the data it needs. This will place some extra CPU load on the safekeepers, in exchange for reducing the network bandwidth for ingesting WAL back to scaling as O(1) with shard count, rather than O(N_shards). Touches #9329. ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Vlad Lazar Co-authored-by: Vlad Lazar --- docs/rfcs/041-sharded-ingest.md | 255 ++++++++++++++++++++++++++++++++ 1 file changed, 255 insertions(+) create mode 100644 docs/rfcs/041-sharded-ingest.md diff --git a/docs/rfcs/041-sharded-ingest.md b/docs/rfcs/041-sharded-ingest.md new file mode 100644 index 0000000000..47b314891c --- /dev/null +++ b/docs/rfcs/041-sharded-ingest.md @@ -0,0 +1,255 @@ +# +Created on Aug 2024 +Implemented on Jan 2025 + +## Summary + +Data in large tenants is split up between multiple pageservers according to key hashes, as +introduced in the [sharding RFC](031-sharding-static.md) and [shard splitting RFC](032-shard-splitting.md). + +Whereas currently we send all WAL to all pageserver shards, and each shard filters out the data that it needs, +in this RFC we add a mechanism to filter the WAL on the safekeeper, so that each shard receives +only the data it needs. + +This will place some extra CPU load on the safekeepers, in exchange for reducing the network bandwidth +for ingesting WAL back to scaling as O(1) with shard count, rather than O(N_shards). + +## Motivation + +1. Large databases require higher shard counts. Whereas currently we run with up to 8 shards for tenants +with a few TB of storage, the next order of magnitude capacity increase will require tens of shards, such +that sending all WAL to all shards is impractical in terms of bandwidth. +2. For contemporary database sizes (~2TB), the pageserver is the bottleneck for ingest: since each + shard has to decode and process the whole WAL, sharding doesn't fully relieve this bottleneck. To achieve significantly higher ingest speeds, we need to filter the WAL earlier so that each pageserver + only has to process relevant parts. + +## Non Goals (if relevant) + +We do not seek to introduce multiple WALs per timeline, or to share the work of handling a timeline's +WAL across safekeepers (beyond simple 3x replication). This RFC may be thought of as an incremental +move of the ingestion bottleneck up the stack: instead of high write rates bottlenecking on the +pageserver, they will bottleneck on the safekeeper. + +## Impacted components (e.g. pageserver, safekeeper, console, etc) + +Safekeeper, pageserver. 
+ +There will be no control plane or storage controller coordination needed, as pageservers will directly +indicate their sharding parameters to the safekeeper when subscribing for WAL. + +## Proposed implementation + +Terminology: +- "Data pages" refers to postgres relation blocks, and SLRU blocks. +- "Metadata pages" refers to everything else the pageserver stores, such as relation sizes and + directories of relations. + +### Phase 1: Refactor ingest + +Currently, pageserver ingest code is structured approximately as follows: +1. `handle_walreceiver_connection` reads a stream of binary WAL records off a network + socket +2. `WalIngest::ingest_record` to translate the record into a series of page-level modifications +3. `DatadirModification` accumulates page updates from several `ingest_record` calls, and when + its `commit()` method is called, flushes these into a Timeline's open `InMemoryLayer`. + +This process currently assumes access to a pageserver `Timeline` throughout `ingest_record` and +from `DatadirModification`, which is used to do read-modify-write cycles on metadata pages +such as relation sizes and the master DBDIR page. It also assumes that records are ingested +strictly one after the other: they cannot be ingested in parallel because each record assumes +that earlier records' changes have already been applied to `Timeline`. + +This code will be refactored to disentangle the simple, fast decode of relation page writes +from the more complex logic for updating internal metadata. An intermediate representation +called `InterpretedWalRecords` will be introduced. This is similar to the internal state of +a `DatadirModification`, but does not require access to a Timeline. Instead of storing +metadata updates as materialized writes to pages, it will accumulate these as abstract operations, +for example rather than including a write to a relation size key, this structure will include +an operation that indicates "Update relation _foo_'s size to the max of its current value and +_bar_", such that these may be applied later to a real Timeline. + +The `DatadirModification` will be aware of the `EphemeralFile` format, so that as it accumulates +simple page writes of relation blocks, it can write them directly into a buffer in the serialized +format. This will avoid the need to later deserialize/reserialize this data when passing the +structure between safekeeper and pageserver. + +The new pipeline will be: +1. `handle_walreceiver_connection` reads a stream of binary WAL records off a network +2. A `InterpretedWalRecords` is generated from the incoming WAL records. This does not + require a reference to a Timeline. +3. The logic that is current spread between `WalIngest` and `DatadirModification` for updating + metadata will be refactored to consume the metadata operations from the `InterpretedWalRecords` + and turn them into literal writes to metadata pages. This part must be done sequentially. +4. The resulting buffer of metadata page writes is combined with the buffer of relation block + writes, and written into the `InMemoryLayer`. + +Implemented in: +1. https://github.com/neondatabase/neon/pull/9472 +2. https://github.com/neondatabase/neon/pull/9504 +3. 
https://github.com/neondatabase/neon/pull/9524 + +### Phase 2: Decode & filter on safekeeper + +In the previous phase, the ingest code was modified to be able to do most of its work without access to +a Timeline: this first stage of ingest simply converts a series of binary wal records into +a buffer of relation/SLRU page writes, and a buffer of abstract metadata writes. + +The modified ingest code may be transplanted from pageserver to safekeeper (probably via a +shared crate). The safekeeper->pageserver network protocol is modified to: + - in subscription requests, send the `ShardIdentity` from the pageserver to the safekeeper + - in responses, transmit a `InterpretedWalRecords` instead of a raw `WalRecord`. + - use the `ShardIdentity` to filter the `ProcessedWalIngest` to relevant content for + the subscribing shard before transmitting it. + +The overall behavior of the pageserver->safekeeper interaction remains the same, in terms of +consistent LSN feedback, and connection management. Only the payload of the subscriptions +changes, to express an LSN range of WAL as a filtered `ProcessedWalIngest` instead of the +raw data. + +The ingest code on the pageserver can now skip the part where it does the first phase of +processing, as it will receive pre-processed, compressed data off the wire. + +Note that `InterpretedWalRecord` batches multiple `InterpretedWalRecord(s)` in the same network +message. Safekeeper reads WAL in chunks of 16 blocks and then decodes as many Postgres WAL records +as possible. Each Postgres WAL record maps to one `InterpretedWalRecord` for potentially multiple shards. +Hence, the size of the batch is given by the number of Postgres WAL records that fit in 16 blocks. + +The protocol needs to support evolution. Protobuf was chosen here with the view that, in the future, +we may migrate it to GRPC altogether + +Implemented in: +1. https://github.com/neondatabase/neon/pull/9746 +2. https://github.com/neondatabase/neon/pull/9821 + +### Phase 3: Fan out interpreted WAL + +In the previous phase, the initial processing of WAL was moved to the safekeeper, but it is still +done once for each shard: this will generate O(N_shards) CPU work on the safekeeper (especially +when considering converting to Protobuf format and compression). + +To avoid this, we fan-out WAL from one (tenant, timeline, shard) to all other shards subscribed on +the same safekeeper. Under normal operation, the WAL will be read from disk, decoded and interpreted +_only_ once per (safekeeper, timeline). + +When the first shard of a sharded timeline subscribes to a given safekeeper a task is spawned +for the WAL reader (`InterpretedWalReader`). This task reads WAL, decodes, interprets it and sends +it to the sender (`InterpretedWalSender`). The sender is a future that is polled from the connection +task. When further shards subscribe on the safekeeper they will attach themselves to the existing WAL reader. +There's two cases to consider: +1. The shard's requested `start_lsn` is ahead of the current position of the WAL reader. In this case, the shard +will start receiving data when the reader reaches that LSN. The intuition here is that there's little to gain +by letting shards "front-run" since compute backpressure is based on the laggard LSN. +2. The shard's requested `start_lsn` is below the current position of the WAL reader. In this case, the WAL reader +gets reset to this requested position (same intuition). 
Special care is taken such that advanced shards do not receive +interpreted WAL records below their current position. + +The approach above implies that there is at most one WAL reader per (tenant, timeline) on a given safekeeper at any point in time. +If this turns out to be operationally problematic, there's a trick we can deploy: `--max-delta-for-fanout` is an optional safekeeper +argument that controls the max absolute delta between a new shard and the current WAL position of the WAL reader. If the absolute +delta is above that value, a new reader is spawned. Note that there's currently no concurrency control on the number of WAL readers, +so it's recommended to use large values to avoid pushing CPU utilisation too high. + +Unsharded tenants do not spawn a separate task for the interpreted WAL reader since there's no benefit to it. Instead they poll +the reader and sender concurrently from the connection task. + +Shard splits are interesting here because it is the only case when the same shard might have two subscriptions at the same time. +This is handled by giving readers a unique identifier. Both shards will receive the same data while respecting their requested start +position. + +Implemented in: +1. https://github.com/neondatabase/neon/pull/10190 + +## Deployment + +Each phase shall be deployed independently. Special care should be taken around protocol changes. + +## Observability Tips + +* The safekeeper logs the protocol requested by the pageserver +along with the pageserver ID, tenant, timeline and shard: `starting streaming from`. +* There's metrics for the number of wal readers: + * `safekeeper_wal_readers{kind="task", target=~"pageserver.*"}` gives the number of wal reader tasks for each SK + * `safekeeper_wal_readers{kind="future", target=~"pageserver.*"}` gives the numer of wal readers polled inline by each SK + * `safekeeper_interpreted_wal_reader_tasks` gives the number of wal reader tasks per tenant, timeline +* Interesting log lines for the fan-out reader: + * `Spawning interpreted`: first shard creates the interpreted wal reader + * `Fanning out`: a subsequent shard attaches itself to an interpreted wal reader + * `Aborting interpreted`: all senders have finished and the reader task is being aborted + +## Future Optimizations + +This sections describes some improvement areas which may be revisited in the future. + +### Buffering of Interpreted WAL + +The interpreted WAL reader may buffer interpreted WAL records in user space to help with serving +subscriptions that are lagging behind the current position of the reader. + +Counterpoints: +* Safekeepers serve many thousands of timelines and allocating a buffer for each might be wasteful, +especially given that it would go unused on the happy path. +* WAL is buffered in the kernel page cache. Usually we'd only pay the CPU cost of decoding and interpreting. + +### Tweaking the Pagserver Safekeeper Selection Algorithm + +We could make the pageserver aware of which safekeeper's already host shards for the timeline along +with their current WAL positions. The pageserver should then prefer safkeepers that are in the same +AZ _and_ already have a shard with a position close to the desired start position. + +We currently run one safekeeper per AZ, so the point is mute until that changes. + +### Pipelining first ingest phase + +The first ingest phase is a stateless transformation of a binary WAL record into a pre-processed +output per shard. 
To put multiple CPUs to work, we may pipeline this processing up to some defined buffer +depth. + +## Alternatives considered + +### Give safekeepers enough state to fully decode WAL + +In this RFC, we only do the first phase of ingest on the safekeeper, because this is +the phase that is stateless. Subsequent changes then happen on the pageserver, with +access to the `Timeline` state. + +We could do more work on the safekeeper if we transmitted metadata state to the safekeeper +when subscribing to the WAL: for example, by telling the safekeeper all the relation sizes, +so that it could then generate all the metadata writes for relation sizes. + +We avoid doing this for several reasons: +1. Complexity: it's a more invasive protocol change +2. Decoupling: having the safekeeper understand the `ProcessedWalIngest` already somewhat + infects it with knowledge of the pageserver, but this is mainly an abstract structure + that describes postgres writes. However, if we taught the safekeeper about the exact + way that pageserver deals with metadata keys, this would be a much tighter coupling. +3. Load: once the WAL has been processed to the point that it can be split between shards, + it is preferable to share out work on the remaining shards rather than adding extra CPU + load to the safekeeper. + +### Do pre-processing on the compute instead of the safekeeper + +Since our first stage of ingest is stateless, it could be done at any stage in the pipeline, +all the way up to the compute. + +We choose not to do this, because it is useful for the safekeeper to store the raw WAL rather +than just the preprocessed WAL: +- The safekeeper still needs to be able to serve raw WAL back to postgres for e.g. physical replication +- It simplifies our paxos implementation to have the offset in the write log be literally + the same as the LSN +- Raw WAL must have a stable protocol since we might have to re-ingest it at arbitrary points in the future. + Storing raw WAL give us more flexibility to evolve the pageserver, safekeeper protocol. + +### Do wal pre-processing on shard 0 or a separate service, send it to other shards from there + +If we wanted to keep the safekeepers as entirely pure stores of raw WAL bytes, then +we could do the initial decode and shard-splitting in some other location: +- Shard 0 could subscribe to the full WAL and then send writes to other shards +- A new intermediate service between the safekeeper and pageserver could do the splitting. + +So why not? +- Extra network hop from shard 0 to the final destination shard +- Clearly there is more infrastructure involved here compared with doing it inline on the safekeeper. +- Safekeepers already have very light CPU load: typical cloud instances shapes with appropriate + disks for the safekeepers effectively have "free" CPU resources. +- Doing extra work on shard 0 would complicate scheduling of shards on pageservers, because + shard 0 would have significantly higher CPU load under write workloads than other shards. From 72130d7d6c975df81249b4c3862d16d4fff40cf6 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 20 Jan 2025 18:51:30 +0100 Subject: [PATCH 225/583] fix(page_service / handle): panic when parallel client disconnect & Timeline shutdown (#10445) ## Refs - fixes https://github.com/neondatabase/neon/issues/10444 ## Problem We're seeing a panic `handles are only shut down once in their lifetime` in our performance testbed. ## Hypothesis Annotated code in https://github.com/neondatabase/neon/issues/10444#issuecomment-2602286415. 
``` T1: drop Cache, executes up to (1) => HandleInner is now in state ShutDown T2: Timeline::shutdown => PerTimelineState::shutdown executes shutdown() again => panics ``` Likely this snuck in the final touches of #10386 where I narrowed down the locking rules. ## Summary of changes Make duplicate shutdowns a no-op. --- pageserver/src/tenant/timeline/handle.rs | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/pageserver/src/tenant/timeline/handle.rs b/pageserver/src/tenant/timeline/handle.rs index 4c7bea25be..5b39daaaf8 100644 --- a/pageserver/src/tenant/timeline/handle.rs +++ b/pageserver/src/tenant/timeline/handle.rs @@ -588,32 +588,40 @@ impl Drop for Cache { let Some(handle_inner_arc) = handle_inner_weak.upgrade() else { continue; }; - let handle_timeline = handle_inner_arc + let Some(handle_timeline) = handle_inner_arc // locking rules: drop lock before acquiring other lock below .lock() .expect("poisoned") - .shutdown(); + .shutdown() + else { + // Concurrent PerTimelineState::shutdown. + continue; + }; + // Clean up per_timeline_state so the HandleInner allocation can be dropped. let per_timeline_state = handle_timeline.per_timeline_state(); let mut handles_lock_guard = per_timeline_state.handles.lock().expect("mutex poisoned"); let Some(handles) = &mut *handles_lock_guard else { continue; }; let Some(removed_handle_inner_arc) = handles.remove(&self.id) else { - // There could have been a shutdown inbetween us upgrading the weak and locking the mutex. + // Concurrent PerTimelineState::shutdown. continue; }; - drop(handles_lock_guard); // locking rules: remember them when! - assert!(Arc::ptr_eq(&removed_handle_inner_arc, &handle_inner_arc,)); + drop(handles_lock_guard); // locking rules! + assert!(Arc::ptr_eq(&removed_handle_inner_arc, &handle_inner_arc)); } } } impl HandleInner { - fn shutdown(&mut self) -> Arc { + fn shutdown(&mut self) -> Option> { match std::mem::replace(self, HandleInner::ShutDown) { - HandleInner::KeepingTimelineGateOpen { timeline, .. } => timeline, + HandleInner::KeepingTimelineGateOpen { timeline, .. } => Some(timeline), HandleInner::ShutDown => { - unreachable!("handles are only shut down once in their lifetime"); + // Duplicate shutdowns are possible because both Cache::drop and PerTimelineState::shutdown + // may do it concurrently, but locking rules disallow holding per-timeline-state lock and + // the handle lock at the same time. + None } } } From e781cf6dd82a150133621ad0165e1c6b03c844ad Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Mon, 20 Jan 2025 19:29:21 +0100 Subject: [PATCH 226/583] Compute/LFC: Apply limits consistently (#10449) Otherwise we might hit ERRORs in otherwise safe situations (such as user queries), which isn't a great user experience. ## Problem https://github.com/neondatabase/neon/pull/10376 ## Summary of changes Instead of accepting internal errors as acceptable, we ensure we don't exceed our allocated usage. 
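The decision order the patch enforces is easiest to see stripped of the C plumbing. The sketch below is a Rust-flavoured illustration only (the real change is in `pgxn/neon/file_cache.c`); it models the `holes` and LRU lists as simple queues and shows the four outcomes, in order of preference, for a chunk that is not already resident in the cache.

```rust
use std::collections::VecDeque;

/// Where a not-yet-resident chunk ends up, in order of preference.
enum ChunkPlacement {
    ReuseHole(u32), // 1. take an offset left behind by an earlier LFC shrink
    GrowFile(u32),  // 2. append a new chunk at the end of the cache file
    EvictLru(u32),  // 3. at the limit: steal the least recently used chunk's offset
    Skip,           // 4. everything is pinned: drop the write rather than exceed the limit
}

fn place_chunk(
    used: &mut u32,
    limit: u32,
    size: &mut u32,
    holes: &mut VecDeque<u32>,
    lru: &mut VecDeque<u32>,
) -> ChunkPlacement {
    if *used < limit {
        // Options 1 and 2 both consume one unit of the configured budget.
        *used += 1;
        if let Some(offset) = holes.pop_front() {
            return ChunkPlacement::ReuseHole(offset);
        }
        let offset = *size;
        *size += 1;
        return ChunkPlacement::GrowFile(offset);
    }
    if let Some(victim) = lru.pop_front() {
        // The victim's offset is reused, so `used` stays at the limit.
        return ChunkPlacement::EvictLru(victim);
    }
    // Before this patch, this path grew the file past the limit, which could
    // later surface as an ERROR in hash_search(HASH_ENTER); now the write is
    // simply skipped.
    ChunkPlacement::Skip
}
```

Option 4 is the new behaviour: the pre-patch code treated the configured size as a recommendation and allocated past it when the LRU was empty, which is the situation the added `test_local_file_cache_all_pinned` regression test provokes by keeping every chunk pinned under concurrent updates.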
--- pgxn/neon/file_cache.c | 110 ++++++++++++------- test_runner/regress/test_local_file_cache.py | 71 +++++++++++- 2 files changed, 139 insertions(+), 42 deletions(-) diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index ad5667cbab..64b236061d 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -911,57 +911,85 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, if (entry->access_count++ == 0) dlist_delete(&entry->list_node); } - else + /*----------- + * If the chunk wasn't already in the LFC then we have these + * options, in order of preference: + * + * Unless there is no space available, we can: + * 1. Use an entry from the `holes` list, and + * 2. Create a new entry. + * We can always, regardless of space in the LFC: + * 3. evict an entry from LRU, and + * 4. ignore the write operation (the least favorite option) + */ + else if (lfc_ctl->used < lfc_ctl->limit) { - /* - * We have two choices if all cache pages are pinned (i.e. used in IO - * operations): - * - * 1) Wait until some of this operation is completed and pages is - * unpinned. - * - * 2) Allocate one more chunk, so that specified cache size is more - * recommendation than hard limit. - * - * As far as probability of such event (that all pages are pinned) is - * considered to be very very small: there are should be very large - * number of concurrent IO operations and them are limited by - * max_connections, we prefer not to complicate code and use second - * approach. - */ - if (lfc_ctl->used >= lfc_ctl->limit && !dlist_is_empty(&lfc_ctl->lru)) - { - /* Cache overflow: evict least recently used chunk */ - FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->lru)); - - for (int i = 0; i < BLOCKS_PER_CHUNK; i++) - { - lfc_ctl->used_pages -= (victim->bitmap[i >> 5] >> (i & 31)) & 1; - } - CriticalAssert(victim->access_count == 0); - entry->offset = victim->offset; /* grab victim's chunk */ - hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL); - neon_log(DEBUG2, "Swap file cache page"); - } - else if (!dlist_is_empty(&lfc_ctl->holes)) + if (!dlist_is_empty(&lfc_ctl->holes)) { /* We can reuse a hole that was left behind when the LFC was shrunk previously */ - FileCacheEntry *hole = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->holes)); - uint32 offset = hole->offset; - bool hole_found; - - hash_search_with_hash_value(lfc_hash, &hole->key, hole->hash, HASH_REMOVE, &hole_found); + FileCacheEntry *hole = dlist_container(FileCacheEntry, list_node, + dlist_pop_head_node(&lfc_ctl->holes)); + uint32 offset = hole->offset; + bool hole_found; + + hash_search_with_hash_value(lfc_hash, &hole->key, + hole->hash, HASH_REMOVE, &hole_found); CriticalAssert(hole_found); - + lfc_ctl->used += 1; - entry->offset = offset; /* reuse the hole */ + entry->offset = offset; /* reuse the hole */ } else { lfc_ctl->used += 1; - entry->offset = lfc_ctl->size++; /* allocate new chunk at end - * of file */ + entry->offset = lfc_ctl->size++;/* allocate new chunk at end + * of file */ } + } + /* + * We've already used up all allocated LFC entries. + * + * If we can clear an entry from the LRU, do that. + * If we can't (e.g. because all other slots are being accessed) + * then we will remove this entry from the hash and continue + * on to the next chunk, as we may not exceed the limit. 
+ */ + else if (!dlist_is_empty(&lfc_ctl->lru)) + { + /* Cache overflow: evict least recently used chunk */ + FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, + dlist_pop_head_node(&lfc_ctl->lru)); + + for (int i = 0; i < BLOCKS_PER_CHUNK; i++) + { + lfc_ctl->used_pages -= (victim->bitmap[i >> 5] >> (i & 31)) & 1; + } + + CriticalAssert(victim->access_count == 0); + entry->offset = victim->offset; /* grab victim's chunk */ + hash_search_with_hash_value(lfc_hash, &victim->key, + victim->hash, HASH_REMOVE, NULL); + neon_log(DEBUG2, "Swap file cache page"); + } + else + { + /* Can't add this chunk - we don't have the space for it */ + hash_search_with_hash_value(lfc_hash, &entry->key, hash, + HASH_REMOVE, NULL); + + /* + * We can't process this chunk due to lack of space in LFC, + * so skip to the next one + */ + LWLockRelease(lfc_lock); + blkno += blocks_in_chunk; + buf_offset += blocks_in_chunk; + nblocks -= blocks_in_chunk; + continue; + } + + if (!found) + { entry->access_count = 1; entry->hash = hash; memset(entry->bitmap, 0, sizeof entry->bitmap); diff --git a/test_runner/regress/test_local_file_cache.py b/test_runner/regress/test_local_file_cache.py index 21c9e97a42..52ee2f32a2 100644 --- a/test_runner/regress/test_local_file_cache.py +++ b/test_runner/regress/test_local_file_cache.py @@ -7,9 +7,78 @@ import threading import time import pytest -from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder from fixtures.utils import USE_LFC, query_scalar +""" +Test whether LFC doesn't error out when the LRU is empty, but the LFC is +already at its maximum size. + +If we don't handle this safely, we might allocate more hash entries than +otherwise considered safe, thus causing ERRORs in hash_search(HASH_ENTER) once +we hit lfc->used >= lfc->limit. +""" + + +@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping") +def test_local_file_cache_all_pinned(neon_simple_env: NeonEnv): + env = neon_simple_env + endpoint = env.endpoints.create_start( + "main", + config_lines=[ + "neon.max_file_cache_size='1MB'", + "neon.file_cache_size_limit='1MB'", + ], + ) + top_cur = endpoint.connect().cursor() + + stop = threading.Event() + n_rows = 10000 + n_threads = 5 + n_updates_per_connection = 1000 + + top_cur.execute("CREATE TABLE lfctest (id int4 PRIMARY KEY, n int) WITH (fillfactor=10)") + top_cur.execute(f"INSERT INTO lfctest SELECT g, 1 FROM generate_series(1, {n_rows}) g") + + # Start threads that will perform random UPDATEs. Each UPDATE + # increments the counter on the row, so that we can check at the + # end that the sum of all the counters match the number of updates + # performed (plus the initial 1 on each row). + # + # Furthermore, each thread will reconnect between every 1000 updates. 
+ def run_updates(n_updates_performed_q: queue.Queue[int]): + n_updates_performed = 0 + conn = endpoint.connect() + cur = conn.cursor() + while not stop.is_set(): + id = random.randint(1, n_rows) + cur.execute(f"UPDATE lfctest SET n = n + 1 WHERE id = {id}") + n_updates_performed += 1 + if n_updates_performed % n_updates_per_connection == 0: + cur.close() + conn.close() + conn = endpoint.connect() + cur = conn.cursor() + n_updates_performed_q.put(n_updates_performed) + + n_updates_performed_q: queue.Queue[int] = queue.Queue() + threads: list[threading.Thread] = [] + for _i in range(n_threads): + thread = threading.Thread(target=run_updates, args=(n_updates_performed_q,), daemon=True) + thread.start() + threads.append(thread) + + time.sleep(15) + + stop.set() + + n_updates_performed = 0 + for thread in threads: + thread.join() + n_updates_performed += n_updates_performed_q.get() + + assert query_scalar(top_cur, "SELECT SUM(n) FROM lfctest") == n_rows + n_updates_performed + @pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping") def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder): From 2de2b26c62016cce48cbc5449d44e3259f237b56 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Mon, 20 Jan 2025 15:44:12 -0500 Subject: [PATCH 227/583] feat(pageserver): add reldir migration configs (#10439) ## Problem Part of #9516 per RFC at https://github.com/neondatabase/neon/pull/10412 ## Summary of changes Adding the necessary config items and index_part items for the large relation count work. --------- Signed-off-by: Alex Chi Z --- control_plane/src/pageserver.rs | 5 + libs/pageserver_api/src/config.rs | 5 + libs/pageserver_api/src/models.rs | 6 + pageserver/src/tenant.rs | 1 + pageserver/src/tenant/config.rs | 8 ++ .../tenant/remote_timeline_client/index.rs | 115 +++++++++++++++++- .../regress/test_attach_tenant_config.py | 1 + 7 files changed, 138 insertions(+), 3 deletions(-) diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index ef5b3d6593..df81b44f2d 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -418,6 +418,11 @@ impl PageServerNode { .map(serde_json::from_str) .transpose() .context("parse `wal_receiver_protocol_override` from json")?, + rel_size_v2_enabled: settings + .remove("rel_size_v2_enabled") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'rel_size_v2_enabled' as bool")?, }; if !settings.is_empty() { bail!("Unrecognized tenant settings: {settings:?}") diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 09cfbc55fd..7fb7a9d54e 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -301,6 +301,10 @@ pub struct TenantConfigToml { pub timeline_offloading: bool, pub wal_receiver_protocol_override: Option, + + /// Enable rel_size_v2 for this tenant. Once enabled, the tenant will persist this information into + /// `index_part.json`, and it cannot be reversed. 
+ pub rel_size_v2_enabled: Option, } pub mod defaults { @@ -538,6 +542,7 @@ impl Default for TenantConfigToml { lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS, timeline_offloading: false, wal_receiver_protocol_override: None, + rel_size_v2_enabled: None, } } } diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index c38af9cb80..1538134c96 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -497,6 +497,8 @@ pub struct TenantConfigPatch { pub timeline_offloading: FieldPatch, #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub wal_receiver_protocol_override: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub rel_size_v2_enabled: FieldPatch, } /// An alternative representation of `pageserver::tenant::TenantConf` with @@ -528,6 +530,7 @@ pub struct TenantConfig { pub lsn_lease_length_for_ts: Option, pub timeline_offloading: Option, pub wal_receiver_protocol_override: Option, + pub rel_size_v2_enabled: Option, } impl TenantConfig { @@ -557,6 +560,7 @@ impl TenantConfig { mut lsn_lease_length_for_ts, mut timeline_offloading, mut wal_receiver_protocol_override, + mut rel_size_v2_enabled, } = self; patch.checkpoint_distance.apply(&mut checkpoint_distance); @@ -601,6 +605,7 @@ impl TenantConfig { patch .wal_receiver_protocol_override .apply(&mut wal_receiver_protocol_override); + patch.rel_size_v2_enabled.apply(&mut rel_size_v2_enabled); Self { checkpoint_distance, @@ -627,6 +632,7 @@ impl TenantConfig { lsn_lease_length_for_ts, timeline_offloading, wal_receiver_protocol_override, + rel_size_v2_enabled, } } } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index bb1b36aed6..05a311391c 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -5475,6 +5475,7 @@ pub(crate) mod harness { lsn_lease_length_for_ts: Some(tenant_conf.lsn_lease_length_for_ts), timeline_offloading: Some(tenant_conf.timeline_offloading), wal_receiver_protocol_override: tenant_conf.wal_receiver_protocol_override, + rel_size_v2_enabled: tenant_conf.rel_size_v2_enabled, } } } diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index edf2e6a3aa..14d8e9ccd4 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -357,6 +357,9 @@ pub struct TenantConfOpt { #[serde(skip_serializing_if = "Option::is_none")] pub wal_receiver_protocol_override: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + pub rel_size_v2_enabled: Option, } impl TenantConfOpt { @@ -425,6 +428,7 @@ impl TenantConfOpt { wal_receiver_protocol_override: self .wal_receiver_protocol_override .or(global_conf.wal_receiver_protocol_override), + rel_size_v2_enabled: self.rel_size_v2_enabled.or(global_conf.rel_size_v2_enabled), } } @@ -454,6 +458,7 @@ impl TenantConfOpt { mut lsn_lease_length_for_ts, mut timeline_offloading, mut wal_receiver_protocol_override, + mut rel_size_v2_enabled, } = self; patch.checkpoint_distance.apply(&mut checkpoint_distance); @@ -522,6 +527,7 @@ impl TenantConfOpt { patch .wal_receiver_protocol_override .apply(&mut wal_receiver_protocol_override); + patch.rel_size_v2_enabled.apply(&mut rel_size_v2_enabled); Ok(Self { checkpoint_distance, @@ -548,6 +554,7 @@ impl TenantConfOpt { lsn_lease_length_for_ts, timeline_offloading, wal_receiver_protocol_override, + rel_size_v2_enabled, }) } } @@ -603,6 +610,7 @@ impl From for models::TenantConfig { lsn_lease_length_for_ts: value.lsn_lease_length_for_ts.map(humantime), timeline_offloading: 
value.timeline_offloading, wal_receiver_protocol_override: value.wal_receiver_protocol_override, + rel_size_v2_enabled: value.rel_size_v2_enabled, } } } diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 08e94ae197..30b6b07ca3 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -79,6 +79,24 @@ pub struct IndexPart { /// when this flag is introduced. #[serde(skip_serializing_if = "Option::is_none", default)] pub(crate) last_aux_file_policy: Option, + + #[serde(skip_serializing_if = "Option::is_none", default)] + pub(crate) rel_size_migration: Option, +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub enum RelSizeMigration { + /// The tenant is using the old rel_size format. + /// Note that this enum is persisted as `Option` in the index part, so + /// `None` is the same as `Some(RelSizeMigration::Legacy)`. + Legacy, + /// The tenant is migrating to the new rel_size format. Both old and new rel_size format are + /// persisted in the index part. The read path will read both formats and merge them. + Migrating, + /// The tenant has migrated to the new rel_size format. Only the new rel_size format is persisted + /// in the index part, and the read path will not read the old format. + Migrated, } impl IndexPart { @@ -97,10 +115,11 @@ impl IndexPart { /// - 8: added `archived_at` /// - 9: +gc_blocking /// - 10: +import_pgdata - const LATEST_VERSION: usize = 10; + /// - 11: +rel_size_migration + const LATEST_VERSION: usize = 11; // Versions we may see when reading from a bucket. - pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]; + pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]; pub const FILE_NAME: &'static str = "index_part.json"; @@ -116,6 +135,7 @@ impl IndexPart { gc_blocking: None, last_aux_file_policy: None, import_pgdata: None, + rel_size_migration: None, } } @@ -416,6 +436,7 @@ mod tests { gc_blocking: None, last_aux_file_policy: None, import_pgdata: None, + rel_size_migration: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -461,6 +482,7 @@ mod tests { gc_blocking: None, last_aux_file_policy: None, import_pgdata: None, + rel_size_migration: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -507,6 +529,7 @@ mod tests { gc_blocking: None, last_aux_file_policy: None, import_pgdata: None, + rel_size_migration: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -556,6 +579,7 @@ mod tests { gc_blocking: None, last_aux_file_policy: None, import_pgdata: None, + rel_size_migration: None, }; let empty_layers_parsed = IndexPart::from_json_bytes(empty_layers_json.as_bytes()).unwrap(); @@ -600,6 +624,7 @@ mod tests { gc_blocking: None, last_aux_file_policy: None, import_pgdata: None, + rel_size_migration: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -647,6 +672,7 @@ mod tests { gc_blocking: None, last_aux_file_policy: None, import_pgdata: None, + rel_size_migration: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -699,6 +725,7 @@ mod tests { gc_blocking: None, last_aux_file_policy: Some(AuxFilePolicy::V2), import_pgdata: None, + rel_size_migration: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -756,6 +783,7 @@ mod tests { gc_blocking: 
None, last_aux_file_policy: Default::default(), import_pgdata: None, + rel_size_migration: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -814,6 +842,7 @@ mod tests { gc_blocking: None, last_aux_file_policy: Default::default(), import_pgdata: None, + rel_size_migration: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -877,6 +906,7 @@ mod tests { last_aux_file_policy: Default::default(), archived_at: None, import_pgdata: None, + rel_size_migration: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -952,7 +982,86 @@ mod tests { started_at: parse_naive_datetime("2024-11-13T09:23:42.123000000"), finished_at: parse_naive_datetime("2024-11-13T09:42:23.123000000"), idempotency_key: import_pgdata::index_part_format::IdempotencyKey::new("specified-by-client-218a5213-5044-4562-a28d-d024c5f057f5".to_string()), - }))) + }))), + rel_size_migration: None, + }; + + let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); + assert_eq!(part, expected); + } + + #[test] + fn v11_rel_size_migration_is_parsed() { + let example = r#"{ + "version": 11, + "layer_metadata":{ + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 }, + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 } + }, + "disk_consistent_lsn":"0/16960E8", + "metadata": { + "disk_consistent_lsn": "0/16960E8", + "prev_record_lsn": "0/1696070", + "ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e", + "ancestor_lsn": "0/0", + "latest_gc_cutoff_lsn": "0/1696070", + "initdb_lsn": "0/1696070", + "pg_version": 14 + }, + "gc_blocking": { + "started_at": "2024-07-19T09:00:00.123", + "reasons": ["DetachAncestor"] + }, + "import_pgdata": { + "V1": { + "Done": { + "idempotency_key": "specified-by-client-218a5213-5044-4562-a28d-d024c5f057f5", + "started_at": "2024-11-13T09:23:42.123", + "finished_at": "2024-11-13T09:42:23.123" + } + } + }, + "rel_size_migration": "legacy" + }"#; + + let expected = IndexPart { + version: 11, + layer_metadata: HashMap::from([ + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata { + file_size: 25600000, + generation: Generation::none(), + shard: ShardIndex::unsharded() + }), + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata { + file_size: 9007199254741001, + generation: Generation::none(), + shard: ShardIndex::unsharded() + }) + ]), + disk_consistent_lsn: "0/16960E8".parse::().unwrap(), + metadata: TimelineMetadata::new( + Lsn::from_str("0/16960E8").unwrap(), + Some(Lsn::from_str("0/1696070").unwrap()), + Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()), + Lsn::INVALID, + Lsn::from_str("0/1696070").unwrap(), + Lsn::from_str("0/1696070").unwrap(), + 14, + ).with_recalculated_checksum().unwrap(), + deleted_at: None, + lineage: Default::default(), + gc_blocking: Some(GcBlocking { + started_at: parse_naive_datetime("2024-07-19T09:00:00.123000000"), + reasons: enumset::EnumSet::from_iter([GcBlockingReason::DetachAncestor]), + }), + last_aux_file_policy: Default::default(), + archived_at: None, + import_pgdata: 
Some(import_pgdata::index_part_format::Root::V1(import_pgdata::index_part_format::V1::Done(import_pgdata::index_part_format::Done{ + started_at: parse_naive_datetime("2024-11-13T09:23:42.123000000"), + finished_at: parse_naive_datetime("2024-11-13T09:42:23.123000000"), + idempotency_key: import_pgdata::index_part_format::IdempotencyKey::new("specified-by-client-218a5213-5044-4562-a28d-d024c5f057f5".to_string()), + }))), + rel_size_migration: Some(RelSizeMigration::Legacy), }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 45112fd67e..b34dbddc80 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -176,6 +176,7 @@ def test_fully_custom_config(positive_env: NeonEnv): "type": "interpreted", "args": {"format": "bincode", "compression": {"zstd": {"level": 1}}}, }, + "rel_size_v2_enabled": True, } vps_http = env.storage_controller.pageserver_api() From 2ab9f6982590cd8570f505df23c134cc71a1a576 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 20 Jan 2025 22:57:15 +0100 Subject: [PATCH 228/583] Simplify pageserver_physical_gc function (#10104) This simplifies the code in `pageserver_physical_gc` a little bit after the feedback in #10007 that the code is too complicated. Most importantly, we don't pass around `GcSummary` any more in a complicated fashion, and we save on async stream-combinator-inception in one place in favour of `try_stream!{}`. Follow-up of #10007 --- .../src/pageserver_physical_gc.rs | 86 +++++++++---------- 1 file changed, 40 insertions(+), 46 deletions(-) diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs index a997373375..063c6bcfb9 100644 --- a/storage_scrubber/src/pageserver_physical_gc.rs +++ b/storage_scrubber/src/pageserver_physical_gc.rs @@ -8,6 +8,8 @@ use crate::checks::{ }; use crate::metadata_stream::{stream_tenant_timelines, stream_tenants}; use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId, MAX_RETRIES}; +use async_stream::try_stream; +use futures::future::Either; use futures_util::{StreamExt, TryStreamExt}; use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; use pageserver::tenant::remote_timeline_client::manifest::OffloadedTimelineManifest; @@ -578,7 +580,7 @@ async fn gc_timeline( target: &RootTarget, mode: GcMode, ttid: TenantShardTimelineId, - accumulator: &Arc>, + accumulator: &std::sync::Mutex, tenant_manifest_info: Arc>, ) -> anyhow::Result { let mut summary = GcSummary::default(); @@ -721,9 +723,9 @@ pub async fn pageserver_physical_gc( let remote_client = Arc::new(remote_client); let tenants = if tenant_shard_ids.is_empty() { - futures::future::Either::Left(stream_tenants(&remote_client, &target)) + Either::Left(stream_tenants(&remote_client, &target)) } else { - futures::future::Either::Right(futures::stream::iter(tenant_shard_ids.into_iter().map(Ok))) + Either::Right(futures::stream::iter(tenant_shard_ids.into_iter().map(Ok))) }; // How many tenants to process in parallel. 
We need to be mindful of pageservers @@ -731,16 +733,16 @@ pub async fn pageserver_physical_gc( const CONCURRENCY: usize = 32; // Accumulate information about each tenant for cross-shard GC step we'll do at the end - let accumulator = Arc::new(std::sync::Mutex::new(TenantRefAccumulator::default())); + let accumulator = std::sync::Mutex::new(TenantRefAccumulator::default()); + + // Accumulate information about how many manifests we have GCd + let manifest_gc_summary = std::sync::Mutex::new(GcSummary::default()); // Generate a stream of TenantTimelineId - enum GcSummaryOrContent { - Content(T), - GcSummary(GcSummary), - } let timelines = tenants.map_ok(|tenant_shard_id| { let target_ref = ⌖ let remote_client_ref = &remote_client; + let manifest_gc_summary_ref = &manifest_gc_summary; async move { let gc_manifest_result = gc_tenant_manifests( remote_client_ref, @@ -757,55 +759,48 @@ pub async fn pageserver_physical_gc( (GcSummary::default(), None) } }; + manifest_gc_summary_ref + .lock() + .unwrap() + .merge(summary_from_manifest); let tenant_manifest_arc = Arc::new(tenant_manifest_opt); - let summary_from_manifest = Ok(GcSummaryOrContent::<(_, _)>::GcSummary( - summary_from_manifest, - )); - stream_tenant_timelines(remote_client_ref, target_ref, tenant_shard_id) - .await - .map(|stream| { - stream - .zip(futures::stream::iter(std::iter::repeat( - tenant_manifest_arc, - ))) - .map(|(ttid_res, tenant_manifest_arc)| { - ttid_res.map(move |ttid| { - GcSummaryOrContent::Content((ttid, tenant_manifest_arc)) - }) - }) - .chain(futures::stream::iter([summary_from_manifest].into_iter())) - }) + let mut timelines = Box::pin( + stream_tenant_timelines(remote_client_ref, target_ref, tenant_shard_id).await?, + ); + Ok(try_stream! { + while let Some(ttid_res) = timelines.next().await { + let ttid = ttid_res?; + yield (ttid, tenant_manifest_arc.clone()); + } + }) } }); - let timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY)); - let timelines = timelines.try_flatten(); let mut summary = GcSummary::default(); - - // Drain futures for per-shard GC, populating accumulator as a side effect { - let timelines = timelines.map_ok(|summary_or_ttid| match summary_or_ttid { - GcSummaryOrContent::Content((ttid, tenant_manifest_arc)) => { - futures::future::Either::Left(gc_timeline( - &remote_client, - &min_age, - &target, - mode, - ttid, - &accumulator, - tenant_manifest_arc, - )) - } - GcSummaryOrContent::GcSummary(gc_summary) => { - futures::future::Either::Right(futures::future::ok(gc_summary)) - } + let timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY)); + let timelines = timelines.try_flatten(); + + let timelines = timelines.map_ok(|(ttid, tenant_manifest_arc)| { + gc_timeline( + &remote_client, + &min_age, + &target, + mode, + ttid, + &accumulator, + tenant_manifest_arc, + ) }); let mut timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY)); + // Drain futures for per-shard GC, populating accumulator as a side effect while let Some(i) = timelines.next().await { summary.merge(i?); } } + // Streams are lazily evaluated, so only now do we have access to the inner object + summary.merge(manifest_gc_summary.into_inner().unwrap()); // Execute cross-shard GC, using the accumulator's full view of all the shards built in the per-shard GC let Some(client) = controller_client else { @@ -813,8 +808,7 @@ pub async fn pageserver_physical_gc( return Ok(summary); }; - let (ancestor_shards, ancestor_refs) = Arc::into_inner(accumulator) - .unwrap() + let (ancestor_shards, ancestor_refs) = 
accumulator .into_inner() .unwrap() .into_gc_ancestors(client, &mut summary) From 624a5075444a92a199378e80182fad4eecf8e509 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?JC=20Gr=C3=BCnhage?= Date: Tue, 21 Jan 2025 13:45:21 +0100 Subject: [PATCH 229/583] Create Github releases with empty body for now (#10448) ## Problem When releasing `release-7574`, the Github Release creation failed with "body is too long" (see https://github.com/neondatabase/neon/actions/runs/12834025431/job/35792346745#step:5:77). There's lots of room for improvement of the release notes, but for now we'll disable them instead. ## Summary of changes - Disable automatic generation of release notes for Github releases - Enable creation of Github releases for proxy/compute --- .github/workflows/build_and_test.yml | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index b0e07535b3..4fc81dccaa 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -1078,12 +1078,6 @@ jobs: console.log(`Tag ${tag} created successfully.`); } - // TODO: check how GitHub releases looks for proxy/compute releases and enable them if they're ok - if (context.ref !== 'refs/heads/release') { - console.log(`GitHub release skipped for ${context.ref}.`); - return; - } - try { const existingRelease = await github.rest.repos.getReleaseByTag({ owner: context.repo.owner, @@ -1102,7 +1096,8 @@ jobs: owner: context.repo.owner, repo: context.repo.repo, tag_name: tag, - generate_release_notes: true, + // TODO: Automate release notes properly + generate_release_notes: false, }); console.log(`Release for tag ${tag} created successfully.`); } From 7e4a39ea539abedf78e885797e23923f1d5e2873 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 21 Jan 2025 16:40:04 +0100 Subject: [PATCH 230/583] Fix two flakiness sources in test_scrubber_physical_gc_ancestors (#10457) We currently have some flakiness in `test_scrubber_physical_gc_ancestors`, see #10391. The first flakiness kind is about the reconciler not actually becoming idle within the timeout of 30 seconds. We see continuous forward progress so this is likely not a hang. We also see this happen in parallel to a test failure, so is likely due to runners being overloaded. Therefore, we increase the timeout. The second flakiness kind is an assertion failure. This one is a little bit more tricky, but we saw in the successful run that there was some advance of the lsn between the compaction ran (which created layer files) and the gc run. Apparently gc rejects reductions to the single image layer setting if the cutoff lsn is the same as the lsn of the image layer: it will claim that that layer is newer than the space cutoff and therefore skip it, while thinking the old layer (that we want to delete) is the latest one (so it's not deleted). We address the second flakiness kind by inserting a tiny amount of WAL between the compaction and gc. This should hopefully fix things. Related issue: #10391 (not closing it with the merger of the PR as we'll need to validate that these changes had the intended effect). Thanks to Chi for going over this together with me in a call. 
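For readers skimming the message, the essence of the second fix is a pure ordering change in the test. The sketch below condenses the diff that follows; `ps`, `shard`, `timeline_id` and `workload` are the objects the test already has in scope:

```python
# Condensed from the test change below: force image layer creation, then write
# a tiny amount of WAL so the gc cutoff advances past the new image layer, then gc.
ps.http_client().timeline_compact(
    shard, timeline_id, force_image_layer_creation=True, wait_until_uploaded=True
)
workload.churn_rows(1)  # avoid gc-ing exactly at the latest remote consistent lsn
ps.http_client().timeline_gc(shard, timeline_id, 0)
```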
--- test_runner/regress/test_storage_scrubber.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index 220c428531..a782e85567 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -227,7 +227,9 @@ def test_scrubber_physical_gc_ancestors(neon_env_builder: NeonEnvBuilder, shard_ new_shard_count = 4 assert shard_count is None or new_shard_count > shard_count shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count) - env.storage_controller.reconcile_until_idle() # Move shards to their final locations immediately + env.storage_controller.reconcile_until_idle( + timeout_secs=120 + ) # Move shards to their final locations immediately # Create a timeline after split, to ensure scrubber can handle timelines that exist in child shards but not ancestors env.storage_controller.pageserver_api().timeline_create( @@ -269,6 +271,8 @@ def test_scrubber_physical_gc_ancestors(neon_env_builder: NeonEnvBuilder, shard_ ps.http_client().timeline_compact( shard, timeline_id, force_image_layer_creation=True, wait_until_uploaded=True ) + # Add some WAL so that we don't gc at the latest remote consistent lsn + workload.churn_rows(1) ps.http_client().timeline_gc(shard, timeline_id, 0) # We will use a min_age_secs=1 threshold for deletion, let it pass From 19bf7b78a0c38508753d3f54e8faea695f686d19 Mon Sep 17 00:00:00 2001 From: Gleb Novikov Date: Tue, 21 Jan 2025 16:50:44 +0000 Subject: [PATCH 231/583] fast import: basic python test (#10271) We did not have any tests on fast_import binary yet. In this PR I have introduced: - `FastImport` class and tools for testing in python - basic test that runs fast import against vanilla postgres and checks that data is there Should be merged after https://github.com/neondatabase/neon/pull/10251 --- compute_tools/src/bin/fast_import.rs | 21 ++++- test_runner/conftest.py | 1 + test_runner/fixtures/fast_import.py | 104 ++++++++++++++++++++++ test_runner/regress/test_import_pgdata.py | 42 ++++++++- 4 files changed, 165 insertions(+), 3 deletions(-) create mode 100644 test_runner/fixtures/fast_import.py diff --git a/compute_tools/src/bin/fast_import.rs b/compute_tools/src/bin/fast_import.rs index 5b008f8182..c8440afb64 100644 --- a/compute_tools/src/bin/fast_import.rs +++ b/compute_tools/src/bin/fast_import.rs @@ -58,6 +58,8 @@ struct Args { pg_bin_dir: Utf8PathBuf, #[clap(long)] pg_lib_dir: Utf8PathBuf, + #[clap(long)] + pg_port: Option, // port to run postgres on, 5432 is default } #[serde_with::serde_as] @@ -74,6 +76,13 @@ enum EncryptionSecret { KMS { key_id: String }, } +// copied from pageserver_api::config::defaults::DEFAULT_LOCALE to avoid dependency just for a constant +const DEFAULT_LOCALE: &str = if cfg!(target_os = "macos") { + "C" +} else { + "C.UTF-8" +}; + #[tokio::main] pub(crate) async fn main() -> anyhow::Result<()> { utils::logging::init( @@ -97,6 +106,10 @@ pub(crate) async fn main() -> anyhow::Result<()> { let working_directory = args.working_directory; let pg_bin_dir = args.pg_bin_dir; let pg_lib_dir = args.pg_lib_dir; + let pg_port = args.pg_port.unwrap_or_else(|| { + info!("pg_port not specified, using default 5432"); + 5432 + }); // Initialize AWS clients only if s3_prefix is specified let (aws_config, kms_client) = if args.s3_prefix.is_some() { @@ -180,7 +193,7 @@ pub(crate) async fn main() -> anyhow::Result<()> { let superuser = "cloud_admin"; // XXX: this 
shouldn't be hard-coded postgres_initdb::do_run_initdb(postgres_initdb::RunInitdbArgs { superuser, - locale: "en_US.UTF-8", // XXX: this shouldn't be hard-coded, + locale: DEFAULT_LOCALE, // XXX: this shouldn't be hard-coded, pg_version, initdb_bin: pg_bin_dir.join("initdb").as_ref(), library_search_path: &pg_lib_dir, // TODO: is this right? Prob works in compute image, not sure about neon_local. @@ -197,6 +210,7 @@ pub(crate) async fn main() -> anyhow::Result<()> { let mut postgres_proc = tokio::process::Command::new(pgbin) .arg("-D") .arg(&pgdata_dir) + .args(["-p", &format!("{pg_port}")]) .args(["-c", "wal_level=minimal"]) .args(["-c", "shared_buffers=10GB"]) .args(["-c", "max_wal_senders=0"]) @@ -216,6 +230,7 @@ pub(crate) async fn main() -> anyhow::Result<()> { ), ]) .env_clear() + .env("LD_LIBRARY_PATH", &pg_lib_dir) .stdout(std::process::Stdio::piped()) .stderr(std::process::Stdio::piped()) .spawn() @@ -232,7 +247,7 @@ pub(crate) async fn main() -> anyhow::Result<()> { // Create neondb database in the running postgres let restore_pg_connstring = - format!("host=localhost port=5432 user={superuser} dbname=postgres"); + format!("host=localhost port={pg_port} user={superuser} dbname=postgres"); let start_time = std::time::Instant::now(); @@ -314,6 +329,7 @@ pub(crate) async fn main() -> anyhow::Result<()> { .arg(&source_connection_string) // how we run it .env_clear() + .env("LD_LIBRARY_PATH", &pg_lib_dir) .kill_on_drop(true) .stdout(std::process::Stdio::piped()) .stderr(std::process::Stdio::piped()) @@ -347,6 +363,7 @@ pub(crate) async fn main() -> anyhow::Result<()> { .arg(&dumpdir) // how we run it .env_clear() + .env("LD_LIBRARY_PATH", &pg_lib_dir) .kill_on_drop(true) .stdout(std::process::Stdio::piped()) .stderr(std::process::Stdio::piped()) diff --git a/test_runner/conftest.py b/test_runner/conftest.py index 9e32469d69..4b591d3316 100644 --- a/test_runner/conftest.py +++ b/test_runner/conftest.py @@ -15,4 +15,5 @@ pytest_plugins = ( "fixtures.compare_fixtures", "fixtures.slow", "fixtures.reruns", + "fixtures.fast_import", ) diff --git a/test_runner/fixtures/fast_import.py b/test_runner/fixtures/fast_import.py new file mode 100644 index 0000000000..33248132ab --- /dev/null +++ b/test_runner/fixtures/fast_import.py @@ -0,0 +1,104 @@ +import os +import shutil +import subprocess +import tempfile +from collections.abc import Iterator +from pathlib import Path + +import pytest + +from fixtures.log_helper import log +from fixtures.neon_cli import AbstractNeonCli +from fixtures.pg_version import PgVersion + + +class FastImport(AbstractNeonCli): + COMMAND = "fast_import" + cmd: subprocess.CompletedProcess[str] | None = None + + def __init__( + self, + extra_env: dict[str, str] | None, + binpath: Path, + pg_distrib_dir: Path, + pg_version: PgVersion, + workdir: Path, + ): + if extra_env is None: + env_vars = {} + else: + env_vars = extra_env.copy() + + if not (binpath / self.COMMAND).exists(): + raise Exception(f"{self.COMMAND} binary not found at '{binpath}'") + super().__init__(env_vars, binpath) + + pg_dir = pg_distrib_dir / pg_version.v_prefixed + self.pg_distrib_dir = pg_distrib_dir + self.pg_version = pg_version + self.pg_bin = pg_dir / "bin" + if not (self.pg_bin / "postgres").exists(): + raise Exception(f"postgres binary was not found at '{self.pg_bin}'") + self.pg_lib = pg_dir / "lib" + if env_vars.get("LD_LIBRARY_PATH") is not None: + self.pg_lib = Path(env_vars["LD_LIBRARY_PATH"]) + elif os.getenv("LD_LIBRARY_PATH") is not None: + self.pg_lib = 
Path(str(os.getenv("LD_LIBRARY_PATH"))) + if not workdir.exists(): + raise Exception(f"Working directory '{workdir}' does not exist") + self.workdir = workdir + + def run( + self, + pg_port: int, + source_connection_string: str | None = None, + s3prefix: str | None = None, + interactive: bool = False, + ) -> subprocess.CompletedProcess[str]: + if self.cmd is not None: + raise Exception("Command already executed") + args = [ + f"--pg-bin-dir={self.pg_bin}", + f"--pg-lib-dir={self.pg_lib}", + f"--pg-port={pg_port}", + f"--working-directory={self.workdir}", + ] + if source_connection_string is not None: + args.append(f"--source-connection-string={source_connection_string}") + if s3prefix is not None: + args.append(f"--s3-prefix={s3prefix}") + if interactive: + args.append("--interactive") + + self.cmd = self.raw_cli(args) + return self.cmd + + def __enter__(self): + return self + + def __exit__(self, *args): + if self.workdir.exists(): + shutil.rmtree(self.workdir) + + +@pytest.fixture(scope="function") +def fast_import( + pg_version: PgVersion, + test_output_dir: Path, + neon_binpath: Path, + pg_distrib_dir: Path, +) -> Iterator[FastImport]: + workdir = Path(tempfile.mkdtemp()) + with FastImport(None, neon_binpath, pg_distrib_dir, pg_version, workdir) as fi: + yield fi + + if fi.cmd is None: + return + + # dump stdout & stderr into test log dir + with open(test_output_dir / "fast_import.stdout", "w") as f: + f.write(fi.cmd.stdout) + with open(test_output_dir / "fast_import.stderr", "w") as f: + f.write(fi.cmd.stderr) + + log.info("Written logs to %s", test_output_dir) diff --git a/test_runner/regress/test_import_pgdata.py b/test_runner/regress/test_import_pgdata.py index 6ea2393a9d..d02a9d19db 100644 --- a/test_runner/regress/test_import_pgdata.py +++ b/test_runner/regress/test_import_pgdata.py @@ -7,13 +7,15 @@ import psycopg2 import psycopg2.errors import pytest from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId +from fixtures.fast_import import FastImport from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder, VanillaPostgres +from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, PgProtocol, VanillaPostgres from fixtures.pageserver.http import ( ImportPgdataIdemptencyKey, PageserverApiException, ) from fixtures.pg_version import PgVersion +from fixtures.port_distributor import PortDistributor from fixtures.remote_storage import RemoteStorageKind from fixtures.utils import run_only_on_postgres from pytest_httpserver import HTTPServer @@ -313,3 +315,41 @@ def test_pgdata_import_smoke( validate_vanilla_equivalence(br_initdb_endpoint) with pytest.raises(psycopg2.errors.UndefinedTable): br_initdb_endpoint.safe_psql("select * from othertable") + + +@run_only_on_postgres( + [PgVersion.V14, PgVersion.V15, PgVersion.V16], + "newer control file catalog version and struct format isn't supported", +) +def test_fast_import_binary( + test_output_dir, + vanilla_pg: VanillaPostgres, + port_distributor: PortDistributor, + fast_import: FastImport, +): + vanilla_pg.start() + vanilla_pg.safe_psql("CREATE TABLE foo (a int); INSERT INTO foo SELECT generate_series(1, 10);") + + pg_port = port_distributor.get_port() + fast_import.run(pg_port, vanilla_pg.connstr()) + vanilla_pg.stop() + + pgbin = PgBin(test_output_dir, fast_import.pg_distrib_dir, fast_import.pg_version) + with VanillaPostgres( + fast_import.workdir / "pgdata", pgbin, pg_port, False + ) as new_pgdata_vanilla_pg: + new_pgdata_vanilla_pg.start() + + # database name and user are hardcoded in 
fast_import binary, and they are different from normal vanilla postgres + conn = PgProtocol(dsn=f"postgresql://cloud_admin@localhost:{pg_port}/neondb") + res = conn.safe_psql("SELECT count(*) FROM foo;") + log.info(f"Result: {res}") + assert res[0][0] == 10 + + +# TODO: Maybe test with pageserver? +# 1. run whole neon env +# 2. create timeline with some s3 path??? +# 3. run fast_import with s3 prefix +# 4. ??? mock http where pageserver will report progress +# 5. run compute on this timeline and check if data is there From 737888e5c99474b2c411418a97f2eb16d825aa07 Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Tue, 21 Jan 2025 20:17:14 +0100 Subject: [PATCH 232/583] Remove the tests for `pg_anon` (#10382) ## Problem We are removing the `pg_anon` v1 extension from Neon. So we don't need to test it anymore and can remove the code for simplicity. ## Summary of changes The code required for testing `pg_anon` is removed. --- compute/compute-node.Dockerfile | 6 ----- docker-compose/compute_wrapper/Dockerfile | 3 --- docker-compose/docker_compose_test.sh | 30 +---------------------- 3 files changed, 1 insertion(+), 38 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 8c7200c5cb..dbe7de046b 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1347,9 +1347,6 @@ COPY --from=pg-roaringbitmap-pg-build /pg_roaringbitmap.tar.gz /ext-src COPY --from=pg-semver-pg-build /pg_semver.tar.gz /ext-src #COPY --from=pg-embedding-pg-build /home/nonroot/pg_embedding-src/ /ext-src #COPY --from=wal2json-pg-build /wal2json_2_5.tar.gz /ext-src -#pg_anon is not supported yet for pg v17 so, don't fail if nothing found -COPY --from=pg-anon-pg-build /pg_anon.tar.g? /ext-src -COPY compute/patches/pg_anon.patch /ext-src COPY --from=pg-ivm-build /pg_ivm.tar.gz /ext-src COPY --from=pg-partman-build /pg_partman.tar.gz /ext-src RUN cd /ext-src/ && for f in *.tar.gz; \ @@ -1360,9 +1357,6 @@ RUN cd /ext-src/rum-src && patch -p1 <../rum.patch RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch RUN cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan_${PG_VERSION}.patch COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh -RUN case "${PG_VERSION}" in "v17") \ - echo "postgresql_anonymizer does not yet support PG17" && exit 0;; \ - esac && patch -p1 "${SPEC_PATH}/spec.json" - fi PG_VERSION=$pg_version PG_TEST_VERSION=$PG_TEST_VERSION docker compose --profile test-extensions -f $COMPOSE_FILE up --build -d echo "wait until the compute is ready. timeout after 60s. " @@ -62,27 +53,12 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do done if [ $pg_version -ge 16 ]; then - echo Enabling trust connection - docker exec $COMPUTE_CONTAINER_NAME bash -c "sed -i '\$d' /var/db/postgres/compute/pg_hba.conf && echo -e 'host\t all\t all\t all\t trust' >> /var/db/postgres/compute/pg_hba.conf && psql $PSQL_OPTION -c 'select pg_reload_conf()' " - echo Adding postgres role - docker exec $COMPUTE_CONTAINER_NAME psql $PSQL_OPTION -c "CREATE ROLE postgres SUPERUSER LOGIN" # This is required for the pg_hint_plan test, to prevent flaky log message causing the test to fail # It cannot be moved to Dockerfile now because the database directory is created after the start of the container echo Adding dummy config docker exec $COMPUTE_CONTAINER_NAME touch /var/db/postgres/compute/compute_ctl_temp_override.conf - # This block is required for the pg_anon extension test. 
- # The test assumes that it is running on the same host with the postgres engine. - # In our case it's not true, that's why we are copying files to the compute node + # The following block copies the files for the pg_hintplan test to the compute node for the extension test in an isolated docker-compose environment TMPDIR=$(mktemp -d) - # Add support for pg_anon for pg_v16 - if [ $pg_version -ne 17 ]; then - docker cp $TEST_CONTAINER_NAME:/ext-src/pg_anon-src/data $TMPDIR/data - echo -e '1\t too \t many \t tabs' > $TMPDIR/data/bad.csv - docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/tmp/tmp_anon_alternate_data - rm -rf $TMPDIR - fi - TMPDIR=$(mktemp -d) - # The following block does the same for the pg_hintplan test docker cp $TEST_CONTAINER_NAME:/ext-src/pg_hint_plan-src/data $TMPDIR/data docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/ext-src/pg_hint_plan-src/ rm -rf $TMPDIR @@ -106,8 +82,4 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do fi fi cleanup - # Restore the original spec.json - if [ "$pg_version" -ne 17 ]; then - mv "$SPEC_PATH/spec.bak" "$SPEC_PATH/spec.json" - fi done From 7d4bfcdc4795654100b87a02d398e41373027e44 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Tue, 21 Jan 2025 14:29:38 -0500 Subject: [PATCH 233/583] feat(pageserver): add config items for gc-compaction auto trigger (#10455) ## Problem part of https://github.com/neondatabase/neon/issues/9114 The automatic trigger is already implemented at https://github.com/neondatabase/neon/pull/10221 but I need to write some tests and finish my experiments in staging before I can merge it with confidence. Given that I have some other patches that will modify the config items, I'd like to get the config items merged first to reduce conflicts. 
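For reviewers, the knobs introduced below end up as ordinary per-tenant config fields; the updated `test_fully_custom_config` regression test at the end of this patch sets them as follows (comments paraphrase the doc comments added in `libs/pageserver_api/src/config.rs`):

```python
# Per-tenant overrides exercised by the regression test in this patch.
gc_compaction_conf = {
    # turn on the automatic gc-compaction trigger for this tenant
    "gc_compaction_enabled": True,
    # arm the trigger once the total size of layers below the gc-horizon exceeds this many KB
    "gc_compaction_initial_threshold_kb": 1024000,
    # trigger once (size between l2_lsn and gc-horizon) / (size below l2_lsn) exceeds this percentage
    "gc_compaction_ratio_percent": 200,
}
```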
## Summary of changes * add `l2_lsn` to index_part.json -- below that LSN, data have been processed by gc-compaction * add a set of gc-compaction auto trigger control items into the config --------- Signed-off-by: Alex Chi Z --- control_plane/src/pageserver.rs | 15 +++ libs/pageserver_api/src/config.rs | 16 +++ libs/pageserver_api/src/models.rs | 24 ++++ pageserver/src/tenant.rs | 5 + pageserver/src/tenant/config.rs | 36 ++++++ .../tenant/remote_timeline_client/index.rs | 104 +++++++++++++++++- .../regress/test_attach_tenant_config.py | 3 + 7 files changed, 201 insertions(+), 2 deletions(-) diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index df81b44f2d..b33b2877b3 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -423,6 +423,21 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'rel_size_v2_enabled' as bool")?, + gc_compaction_enabled: settings + .remove("gc_compaction_enabled") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'gc_compaction_enabled' as bool")?, + gc_compaction_initial_threshold_kb: settings + .remove("gc_compaction_initial_threshold_kb") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'gc_compaction_initial_threshold_kb' as integer")?, + gc_compaction_ratio_percent: settings + .remove("gc_compaction_ratio_percent") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'gc_compaction_ratio_percent' as integer")?, }; if !settings.is_empty() { bail!("Unrecognized tenant settings: {settings:?}") diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 7fb7a9d54e..f0aeb00736 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -305,6 +305,16 @@ pub struct TenantConfigToml { /// Enable rel_size_v2 for this tenant. Once enabled, the tenant will persist this information into /// `index_part.json`, and it cannot be reversed. pub rel_size_v2_enabled: Option, + + // gc-compaction related configs + /// Enable automatic gc-compaction trigger on this tenant. + pub gc_compaction_enabled: bool, + /// The initial threshold for gc-compaction in KB. Once the total size of layers below the gc-horizon is above this threshold, + /// gc-compaction will be triggered. + pub gc_compaction_initial_threshold_kb: u64, + /// The ratio that triggers the auto gc-compaction. If (the total size of layers between L2 LSN and gc-horizon) / (size below the L2 LSN) + /// is above this ratio, gc-compaction will be triggered. + pub gc_compaction_ratio_percent: u64, } pub mod defaults { @@ -498,6 +508,9 @@ pub mod tenant_conf_defaults { // By default ingest enough WAL for two new L0 layers before checking if new image // image layers should be created. 
pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2; + pub const DEFAULT_GC_COMPACTION_ENABLED: bool = false; + pub const DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB: u64 = 10240000; + pub const DEFAULT_GC_COMPACTION_RATIO_PERCENT: u64 = 100; } impl Default for TenantConfigToml { @@ -543,6 +556,9 @@ impl Default for TenantConfigToml { timeline_offloading: false, wal_receiver_protocol_override: None, rel_size_v2_enabled: None, + gc_compaction_enabled: DEFAULT_GC_COMPACTION_ENABLED, + gc_compaction_initial_threshold_kb: DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB, + gc_compaction_ratio_percent: DEFAULT_GC_COMPACTION_RATIO_PERCENT, } } } diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 1538134c96..fd4879087f 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -499,6 +499,12 @@ pub struct TenantConfigPatch { pub wal_receiver_protocol_override: FieldPatch, #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub rel_size_v2_enabled: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub gc_compaction_enabled: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub gc_compaction_initial_threshold_kb: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub gc_compaction_ratio_percent: FieldPatch, } /// An alternative representation of `pageserver::tenant::TenantConf` with @@ -531,6 +537,9 @@ pub struct TenantConfig { pub timeline_offloading: Option, pub wal_receiver_protocol_override: Option, pub rel_size_v2_enabled: Option, + pub gc_compaction_enabled: Option, + pub gc_compaction_initial_threshold_kb: Option, + pub gc_compaction_ratio_percent: Option, } impl TenantConfig { @@ -561,6 +570,9 @@ impl TenantConfig { mut timeline_offloading, mut wal_receiver_protocol_override, mut rel_size_v2_enabled, + mut gc_compaction_enabled, + mut gc_compaction_initial_threshold_kb, + mut gc_compaction_ratio_percent, } = self; patch.checkpoint_distance.apply(&mut checkpoint_distance); @@ -606,6 +618,15 @@ impl TenantConfig { .wal_receiver_protocol_override .apply(&mut wal_receiver_protocol_override); patch.rel_size_v2_enabled.apply(&mut rel_size_v2_enabled); + patch + .gc_compaction_enabled + .apply(&mut gc_compaction_enabled); + patch + .gc_compaction_initial_threshold_kb + .apply(&mut gc_compaction_initial_threshold_kb); + patch + .gc_compaction_ratio_percent + .apply(&mut gc_compaction_ratio_percent); Self { checkpoint_distance, @@ -633,6 +654,9 @@ impl TenantConfig { timeline_offloading, wal_receiver_protocol_override, rel_size_v2_enabled, + gc_compaction_enabled, + gc_compaction_initial_threshold_kb, + gc_compaction_ratio_percent, } } } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 05a311391c..e45ba2ca3b 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -5476,6 +5476,11 @@ pub(crate) mod harness { timeline_offloading: Some(tenant_conf.timeline_offloading), wal_receiver_protocol_override: tenant_conf.wal_receiver_protocol_override, rel_size_v2_enabled: tenant_conf.rel_size_v2_enabled, + gc_compaction_enabled: Some(tenant_conf.gc_compaction_enabled), + gc_compaction_initial_threshold_kb: Some( + tenant_conf.gc_compaction_initial_threshold_kb, + ), + gc_compaction_ratio_percent: Some(tenant_conf.gc_compaction_ratio_percent), } } } diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 14d8e9ccd4..3db1445f6e 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ 
-360,6 +360,15 @@ pub struct TenantConfOpt { #[serde(skip_serializing_if = "Option::is_none")] pub rel_size_v2_enabled: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + pub gc_compaction_enabled: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + pub gc_compaction_initial_threshold_kb: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + pub gc_compaction_ratio_percent: Option, } impl TenantConfOpt { @@ -429,6 +438,15 @@ impl TenantConfOpt { .wal_receiver_protocol_override .or(global_conf.wal_receiver_protocol_override), rel_size_v2_enabled: self.rel_size_v2_enabled.or(global_conf.rel_size_v2_enabled), + gc_compaction_enabled: self + .gc_compaction_enabled + .unwrap_or(global_conf.gc_compaction_enabled), + gc_compaction_initial_threshold_kb: self + .gc_compaction_initial_threshold_kb + .unwrap_or(global_conf.gc_compaction_initial_threshold_kb), + gc_compaction_ratio_percent: self + .gc_compaction_ratio_percent + .unwrap_or(global_conf.gc_compaction_ratio_percent), } } @@ -459,6 +477,9 @@ impl TenantConfOpt { mut timeline_offloading, mut wal_receiver_protocol_override, mut rel_size_v2_enabled, + mut gc_compaction_enabled, + mut gc_compaction_initial_threshold_kb, + mut gc_compaction_ratio_percent, } = self; patch.checkpoint_distance.apply(&mut checkpoint_distance); @@ -528,6 +549,15 @@ impl TenantConfOpt { .wal_receiver_protocol_override .apply(&mut wal_receiver_protocol_override); patch.rel_size_v2_enabled.apply(&mut rel_size_v2_enabled); + patch + .gc_compaction_enabled + .apply(&mut gc_compaction_enabled); + patch + .gc_compaction_initial_threshold_kb + .apply(&mut gc_compaction_initial_threshold_kb); + patch + .gc_compaction_ratio_percent + .apply(&mut gc_compaction_ratio_percent); Ok(Self { checkpoint_distance, @@ -555,6 +585,9 @@ impl TenantConfOpt { timeline_offloading, wal_receiver_protocol_override, rel_size_v2_enabled, + gc_compaction_enabled, + gc_compaction_initial_threshold_kb, + gc_compaction_ratio_percent, }) } } @@ -611,6 +644,9 @@ impl From for models::TenantConfig { timeline_offloading: value.timeline_offloading, wal_receiver_protocol_override: value.wal_receiver_protocol_override, rel_size_v2_enabled: value.rel_size_v2_enabled, + gc_compaction_enabled: value.gc_compaction_enabled, + gc_compaction_initial_threshold_kb: value.gc_compaction_initial_threshold_kb, + gc_compaction_ratio_percent: value.gc_compaction_ratio_percent, } } } diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 30b6b07ca3..3824bc8f11 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -77,11 +77,17 @@ pub struct IndexPart { /// /// None means no aux files have been written to the storage before the point /// when this flag is introduced. + /// + /// This flag is not used any more as all tenants have been transitioned to the new aux file policy. #[serde(skip_serializing_if = "Option::is_none", default)] pub(crate) last_aux_file_policy: Option, #[serde(skip_serializing_if = "Option::is_none", default)] pub(crate) rel_size_migration: Option, + + /// The LSN of gc-compaction horizon. Once gc-compaction is finished for all layer files below an LSN, this LSN will be updated. 
+ #[serde(skip_serializing_if = "Option::is_none", default)] + pub(crate) l2_lsn: Option, } #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] @@ -116,10 +122,11 @@ impl IndexPart { /// - 9: +gc_blocking /// - 10: +import_pgdata /// - 11: +rel_size_migration - const LATEST_VERSION: usize = 11; + /// - 12: +l2_lsn + const LATEST_VERSION: usize = 12; // Versions we may see when reading from a bucket. - pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]; + pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]; pub const FILE_NAME: &'static str = "index_part.json"; @@ -136,6 +143,7 @@ impl IndexPart { last_aux_file_policy: None, import_pgdata: None, rel_size_migration: None, + l2_lsn: None, } } @@ -437,6 +445,7 @@ mod tests { last_aux_file_policy: None, import_pgdata: None, rel_size_migration: None, + l2_lsn: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -483,6 +492,7 @@ mod tests { last_aux_file_policy: None, import_pgdata: None, rel_size_migration: None, + l2_lsn: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -530,6 +540,7 @@ mod tests { last_aux_file_policy: None, import_pgdata: None, rel_size_migration: None, + l2_lsn: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -580,6 +591,7 @@ mod tests { last_aux_file_policy: None, import_pgdata: None, rel_size_migration: None, + l2_lsn: None, }; let empty_layers_parsed = IndexPart::from_json_bytes(empty_layers_json.as_bytes()).unwrap(); @@ -625,6 +637,7 @@ mod tests { last_aux_file_policy: None, import_pgdata: None, rel_size_migration: None, + l2_lsn: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -673,6 +686,7 @@ mod tests { last_aux_file_policy: None, import_pgdata: None, rel_size_migration: None, + l2_lsn: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -726,6 +740,7 @@ mod tests { last_aux_file_policy: Some(AuxFilePolicy::V2), import_pgdata: None, rel_size_migration: None, + l2_lsn: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -784,6 +799,7 @@ mod tests { last_aux_file_policy: Default::default(), import_pgdata: None, rel_size_migration: None, + l2_lsn: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -843,6 +859,7 @@ mod tests { last_aux_file_policy: Default::default(), import_pgdata: None, rel_size_migration: None, + l2_lsn: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -907,6 +924,7 @@ mod tests { archived_at: None, import_pgdata: None, rel_size_migration: None, + l2_lsn: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -984,6 +1002,7 @@ mod tests { idempotency_key: import_pgdata::index_part_format::IdempotencyKey::new("specified-by-client-218a5213-5044-4562-a28d-d024c5f057f5".to_string()), }))), rel_size_migration: None, + l2_lsn: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -1062,6 +1081,87 @@ mod tests { idempotency_key: import_pgdata::index_part_format::IdempotencyKey::new("specified-by-client-218a5213-5044-4562-a28d-d024c5f057f5".to_string()), }))), rel_size_migration: Some(RelSizeMigration::Legacy), + l2_lsn: None, + }; + + let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); + assert_eq!(part, expected); + } + + #[test] + fn v12_l2_lsn_is_parsed() { + let example = r#"{ + "version": 12, + "layer_metadata":{ + 
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 }, + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 } + }, + "disk_consistent_lsn":"0/16960E8", + "metadata": { + "disk_consistent_lsn": "0/16960E8", + "prev_record_lsn": "0/1696070", + "ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e", + "ancestor_lsn": "0/0", + "latest_gc_cutoff_lsn": "0/1696070", + "initdb_lsn": "0/1696070", + "pg_version": 14 + }, + "gc_blocking": { + "started_at": "2024-07-19T09:00:00.123", + "reasons": ["DetachAncestor"] + }, + "import_pgdata": { + "V1": { + "Done": { + "idempotency_key": "specified-by-client-218a5213-5044-4562-a28d-d024c5f057f5", + "started_at": "2024-11-13T09:23:42.123", + "finished_at": "2024-11-13T09:42:23.123" + } + } + }, + "rel_size_migration": "legacy", + "l2_lsn": "0/16960E8" + }"#; + + let expected = IndexPart { + version: 12, + layer_metadata: HashMap::from([ + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata { + file_size: 25600000, + generation: Generation::none(), + shard: ShardIndex::unsharded() + }), + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata { + file_size: 9007199254741001, + generation: Generation::none(), + shard: ShardIndex::unsharded() + }) + ]), + disk_consistent_lsn: "0/16960E8".parse::().unwrap(), + metadata: TimelineMetadata::new( + Lsn::from_str("0/16960E8").unwrap(), + Some(Lsn::from_str("0/1696070").unwrap()), + Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()), + Lsn::INVALID, + Lsn::from_str("0/1696070").unwrap(), + Lsn::from_str("0/1696070").unwrap(), + 14, + ).with_recalculated_checksum().unwrap(), + deleted_at: None, + lineage: Default::default(), + gc_blocking: Some(GcBlocking { + started_at: parse_naive_datetime("2024-07-19T09:00:00.123000000"), + reasons: enumset::EnumSet::from_iter([GcBlockingReason::DetachAncestor]), + }), + last_aux_file_policy: Default::default(), + archived_at: None, + import_pgdata: Some(import_pgdata::index_part_format::Root::V1(import_pgdata::index_part_format::V1::Done(import_pgdata::index_part_format::Done{ + started_at: parse_naive_datetime("2024-11-13T09:23:42.123000000"), + finished_at: parse_naive_datetime("2024-11-13T09:42:23.123000000"), + idempotency_key: import_pgdata::index_part_format::IdempotencyKey::new("specified-by-client-218a5213-5044-4562-a28d-d024c5f057f5".to_string()), + }))), + rel_size_migration: Some(RelSizeMigration::Legacy), + l2_lsn: Some("0/16960E8".parse::().unwrap()), }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index b34dbddc80..b8d47346a3 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -177,6 +177,9 @@ def test_fully_custom_config(positive_env: NeonEnv): "args": {"format": "bincode", "compression": {"zstd": {"level": 1}}}, }, "rel_size_v2_enabled": True, + "gc_compaction_enabled": True, + "gc_compaction_initial_threshold_kb": 1024000, + "gc_compaction_ratio_percent": 200, } vps_http = env.storage_controller.pageserver_api() From a75e11cc002aae79efd3d8b46fe9c7be96eca326 Mon Sep 17 00:00:00 2001 
From: Erik Grinaker Date: Tue, 21 Jan 2025 21:56:34 +0100 Subject: [PATCH 234/583] pageserver: return duration from `StorageTimeMetricsTimer` (#10468) ## Problem It's sometimes useful to obtain the elapsed duration from a `StorageTimeMetricsTimer` for purposes beyond just recording it in metrics (e.g. to log it). Extracted from #10405. ## Summary of changes Add `StorageTimeMetricsTimer.elapsed()` and return the duration from `stop_and_record()`. --- pageserver/src/metrics.rs | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 4758aaf230..252e566f70 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -2550,12 +2550,19 @@ impl StorageTimeMetricsTimer { } } - /// Record the time from creation to now. - pub fn stop_and_record(self) { - let duration = self.start.elapsed().as_secs_f64(); - self.metrics.timeline_sum.inc_by(duration); + /// Returns the elapsed duration of the timer. + pub fn elapsed(&self) -> Duration { + self.start.elapsed() + } + + /// Record the time from creation to now and return it. + pub fn stop_and_record(self) -> Duration { + let duration = self.elapsed(); + let seconds = duration.as_secs_f64(); + self.metrics.timeline_sum.inc_by(seconds); self.metrics.timeline_count.inc(); - self.metrics.global_histogram.observe(duration); + self.metrics.global_histogram.observe(seconds); + duration } /// Turns this timer into a timer, which will always record -- usually this means recording @@ -2575,6 +2582,14 @@ impl Drop for AlwaysRecordingStorageTimeMetricsTimer { } } +impl AlwaysRecordingStorageTimeMetricsTimer { + /// Returns the elapsed duration of the timer. + #[allow(unused)] + pub fn elapsed(&self) -> Duration { + self.0.as_ref().expect("not dropped yet").elapsed() + } +} + /// Timing facilities for an globally histogrammed metric, which is supported by per tenant and /// timeline total sum and count. #[derive(Clone, Debug)] From 8a8c656c0646895fbfc7a7bc5df4a93917071032 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 21 Jan 2025 22:18:09 +0100 Subject: [PATCH 235/583] pageserver: add `LayerMap::watch_layer0_deltas()` (#10470) ## Problem For compaction backpressure, we need a mechanism to signal when compaction has reduced the L0 delta layer count below the backpressure threshold. Extracted from #10405. ## Summary of changes Add `LayerMap::watch_level0_deltas()` which returns a `tokio::sync::watch::Receiver` signalling the current L0 delta layer count. --- pageserver/src/tenant/layer_map.rs | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 1b6924425c..a69cce932e 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -57,6 +57,7 @@ use std::collections::{HashMap, VecDeque}; use std::iter::Peekable; use std::ops::Range; use std::sync::Arc; +use tokio::sync::watch; use utils::lsn::Lsn; use historic_layer_coverage::BufferedHistoricLayerCoverage; @@ -67,7 +68,6 @@ use super::storage_layer::{LayerVisibilityHint, PersistentLayerDesc}; /// /// LayerMap tracks what layers exist on a timeline. /// -#[derive(Default)] pub struct LayerMap { // // 'open_layer' holds the current InMemoryLayer that is accepting new @@ -93,7 +93,25 @@ pub struct LayerMap { /// L0 layers have key range Key::MIN..Key::MAX, and locating them using R-Tree search is very inefficient. 
/// So L0 layers are held in l0_delta_layers vector, in addition to the R-tree. + /// + /// NB: make sure to notify `watch_l0_deltas` on changes. l0_delta_layers: Vec>, + + /// Notifies about L0 delta layer changes, sending the current number of L0 layers. + watch_l0_deltas: watch::Sender, +} + +impl Default for LayerMap { + fn default() -> Self { + Self { + open_layer: Default::default(), + next_open_layer_at: Default::default(), + frozen_layers: Default::default(), + historic: Default::default(), + l0_delta_layers: Default::default(), + watch_l0_deltas: watch::channel(0).0, + } + } } /// The primary update API for the layer map. @@ -466,6 +484,8 @@ impl LayerMap { if Self::is_l0(&layer_desc.key_range, layer_desc.is_delta) { self.l0_delta_layers.push(layer_desc.clone().into()); + self.watch_l0_deltas + .send_replace(self.l0_delta_layers.len()); } self.historic.insert( @@ -488,6 +508,8 @@ impl LayerMap { let mut l0_delta_layers = std::mem::take(&mut self.l0_delta_layers); l0_delta_layers.retain(|other| other.key() != layer_key); self.l0_delta_layers = l0_delta_layers; + self.watch_l0_deltas + .send_replace(self.l0_delta_layers.len()); // this assertion is related to use of Arc::ptr_eq in Self::compare_arced_layers, // there's a chance that the comparison fails at runtime due to it comparing (pointer, // vtable) pairs. @@ -850,6 +872,11 @@ impl LayerMap { &self.l0_delta_layers } + /// Subscribes to L0 delta layer changes, sending the current number of L0 delta layers. + pub fn watch_level0_deltas(&self) -> watch::Receiver { + self.watch_l0_deltas.subscribe() + } + /// debugging function to print out the contents of the layer map #[allow(unused)] pub async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> { From 14e1f89053b87a38f0dd697f5c32721380d19a9a Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 21 Jan 2025 23:01:27 +0100 Subject: [PATCH 236/583] pageserver: eagerly notify flush waiters (#10469) ## Problem Currently, the layer flush loop will continue flushing layers as long as any are pending, and only notify waiters once there are no further layers to flush. This can cause waiters to wait longer than necessary, and potentially starve them if pending layers keep arriving faster than they can be flushed. The impact of this will increase when we add compaction backpressure and propagate it up into the WAL receiver. Extracted from #10405. ## Summary of changes Break out of the layer flush loop once we've flushed up to the requested LSN. If further flush requests have arrived in the meanwhile, flushing will resume immediately after. --- pageserver/src/tenant/timeline.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 5f4272fb2b..3245f23a28 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3617,6 +3617,12 @@ impl Timeline { return; } + // Break to notify potential waiters as soon as we've flushed the requested LSN. If + // more requests have arrived in the meanwhile, we'll resume flushing afterwards. 
+ if flushed_to_lsn >= frozen_to_lsn { + break Ok(()); + } + let timer = self.metrics.flush_time_histo.start_timer(); let num_frozen_layers; From 2b49d6ee050f41cb67d34e4f117196b3d01a2394 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 22 Jan 2025 09:15:52 +0000 Subject: [PATCH 237/583] feat: adjust the tonic features to remove axum dependency (#10348) To help facilitate an upgrade to axum 0.8 (https://github.com/neondatabase/neon/pull/10332#pullrequestreview-2541989619) this massages the tonic dependency features so that tonic does not depend on axum. --- Cargo.lock | 8 ------- Cargo.toml | 2 +- libs/wal_decoder/Cargo.toml | 1 - libs/wal_decoder/src/models.rs | 2 +- storage_broker/src/bin/storage_broker.rs | 29 ++++++++++++++---------- workspace_hack/Cargo.toml | 11 ++++----- 6 files changed, 23 insertions(+), 30 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 02b02a09c1..2020c417f0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7004,12 +7004,9 @@ version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "877c5b330756d856ffcc4553ab34a5684481ade925ecc54bcd1bf02b1d0d4d52" dependencies = [ - "async-stream", "async-trait", - "axum", "base64 0.22.1", "bytes", - "h2 0.4.4", "http 1.1.0", "http-body 1.0.0", "http-body-util", @@ -7021,7 +7018,6 @@ dependencies = [ "prost", "rustls-native-certs 0.8.0", "rustls-pemfile 2.1.1", - "socket2", "tokio", "tokio-rustls 0.26.0", "tokio-stream", @@ -7582,7 +7578,6 @@ dependencies = [ "tikv-jemallocator", "tokio", "tokio-util", - "tonic", "tonic-build", "tracing", "utils", @@ -7991,8 +7986,6 @@ version = "0.1.0" dependencies = [ "ahash", "anyhow", - "axum", - "axum-core", "base64 0.13.1", "base64 0.21.1", "base64ct", @@ -8073,7 +8066,6 @@ dependencies = [ "toml_edit", "tonic", "tower 0.4.13", - "tower 0.5.2", "tracing", "tracing-core", "url", diff --git a/Cargo.toml b/Cargo.toml index a4e601bb58..6e1e288895 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -187,7 +187,7 @@ tokio-tar = "0.3" tokio-util = { version = "0.7.10", features = ["io", "rt"] } toml = "0.8" toml_edit = "0.22" -tonic = {version = "0.12.3", features = ["tls", "tls-roots"]} +tonic = {version = "0.12.3", default-features = false, features = ["channel", "tls", "tls-roots"]} tower = { version = "0.5.2", default-features = false } tower-http = { version = "0.6.2", features = ["request-id", "trace"] } tower-service = "0.3.3" diff --git a/libs/wal_decoder/Cargo.toml b/libs/wal_decoder/Cargo.toml index 09c4afb18a..cb0ef4b00d 100644 --- a/libs/wal_decoder/Cargo.toml +++ b/libs/wal_decoder/Cargo.toml @@ -17,7 +17,6 @@ postgres_ffi.workspace = true serde.workspace = true thiserror.workspace = true tokio = { workspace = true, features = ["io-util"] } -tonic.workspace = true tracing.workspace = true utils.workspace = true workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/libs/wal_decoder/src/models.rs b/libs/wal_decoder/src/models.rs index c2f9125b21..51bf7e44ab 100644 --- a/libs/wal_decoder/src/models.rs +++ b/libs/wal_decoder/src/models.rs @@ -45,7 +45,7 @@ pub mod proto { #![allow(clippy::derive_partial_eq_without_eq)] // The generated ValueMeta has a `len` method generate for its `len` field. 
#![allow(clippy::len_without_is_empty)] - tonic::include_proto!("interpreted_wal"); + include!(concat!(env!("OUT_DIR"), concat!("/interpreted_wal.rs"))); } #[derive(Copy, Clone, Serialize, Deserialize)] diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index 1fbb651656..9d4c22484c 100644 --- a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -32,7 +32,6 @@ use tokio::sync::broadcast::error::RecvError; use tokio::time; use tonic::body::{self, empty_body, BoxBody}; use tonic::codegen::Service; -use tonic::transport::server::Connected; use tonic::Code; use tonic::{Request, Response, Status}; use tracing::*; @@ -459,9 +458,10 @@ impl BrokerService for Broker { &self, request: Request>, ) -> Result, Status> { - let remote_addr = request - .remote_addr() - .expect("TCPConnectInfo inserted by handler"); + let &RemoteAddr(remote_addr) = request + .extensions() + .get() + .expect("RemoteAddr inserted by handler"); let mut publisher = self.registry.register_publisher(remote_addr); let mut stream = request.into_inner(); @@ -484,9 +484,10 @@ impl BrokerService for Broker { &self, request: Request, ) -> Result, Status> { - let remote_addr = request - .remote_addr() - .expect("TCPConnectInfo inserted by handler"); + let &RemoteAddr(remote_addr) = request + .extensions() + .get() + .expect("RemoteAddr inserted by handler"); let proto_key = request .into_inner() .subscription_key @@ -537,9 +538,10 @@ impl BrokerService for Broker { &self, request: Request, ) -> std::result::Result, Status> { - let remote_addr = request - .remote_addr() - .expect("TCPConnectInfo inserted by handler"); + let &RemoteAddr(remote_addr) = request + .extensions() + .get() + .expect("RemoteAddr inserted by handler"); let proto_filter = request.into_inner(); let ttid_filter = proto_filter.tenant_timeline_id.as_ref(); @@ -628,6 +630,9 @@ async fn http1_handler( Ok(resp) } +#[derive(Clone, Copy)] +struct RemoteAddr(SocketAddr); + #[tokio::main] async fn main() -> Result<(), Box> { let args = Args::parse(); @@ -687,13 +692,13 @@ async fn main() -> Result<(), Box> { .max_concurrent_streams(None); let storage_broker_server_cloned = storage_broker_server.clone(); - let connect_info = stream.connect_info(); + let remote_addr = RemoteAddr(addr); let service_fn_ = async move { service_fn(move |mut req| { // That's what tonic's MakeSvc.call does to pass conninfo to // the request handler (and where its request.remote_addr() // expects it to find). - req.extensions_mut().insert(connect_info.clone()); + req.extensions_mut().insert(remote_addr); // Technically this second clone is not needed, but consume // by async block is apparently unavoidable. 
BTW, error diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 0ffeeead18..a3dffa8f19 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -17,8 +17,6 @@ license.workspace = true [dependencies] ahash = { version = "0.8" } anyhow = { version = "1", features = ["backtrace"] } -axum = { version = "0.7", features = ["ws"] } -axum-core = { version = "0.4", default-features = false, features = ["tracing"] } base64-594e8ee84c453af0 = { package = "base64", version = "0.13", features = ["alloc"] } base64-647d43efb71741da = { package = "base64", version = "0.21", features = ["alloc"] } base64ct = { version = "1", default-features = false, features = ["std"] } @@ -46,7 +44,7 @@ hex = { version = "0.4", features = ["serde"] } hmac = { version = "0.12", default-features = false, features = ["reset"] } hyper-582f2526e08bb6a0 = { package = "hyper", version = "0.14", features = ["full"] } hyper-dff4ba8e3ae991db = { package = "hyper", version = "1", features = ["full"] } -hyper-util = { version = "0.1", features = ["client-legacy", "server-auto", "service"] } +hyper-util = { version = "0.1", features = ["client-legacy", "http1", "http2", "server", "service"] } indexmap-dff4ba8e3ae991db = { package = "indexmap", version = "1", default-features = false, features = ["std"] } indexmap-f595c2ba2a3f28df = { package = "indexmap", version = "2", features = ["serde"] } itertools = { version = "0.12" } @@ -87,12 +85,11 @@ tikv-jemalloc-sys = { version = "0.6", features = ["profiling", "stats", "unpref time = { version = "0.3", features = ["macros", "serde-well-known"] } tokio = { version = "1", features = ["full", "test-util"] } tokio-rustls = { version = "0.26", default-features = false, features = ["logging", "ring", "tls12"] } -tokio-stream = { version = "0.1", features = ["net"] } +tokio-stream = { version = "0.1" } tokio-util = { version = "0.7", features = ["codec", "compat", "io", "rt"] } toml_edit = { version = "0.22", features = ["serde"] } -tonic = { version = "0.12", features = ["tls-roots"] } -tower-9fbad63c4bcf4a8f = { package = "tower", version = "0.4", default-features = false, features = ["balance", "buffer", "limit", "util"] } -tower-d8f496e17d97b5cb = { package = "tower", version = "0.5", default-features = false, features = ["log", "make", "util"] } +tonic = { version = "0.12", default-features = false, features = ["codegen", "prost", "tls-roots"] } +tower = { version = "0.4", default-features = false, features = ["balance", "buffer", "limit", "util"] } tracing = { version = "0.1", features = ["log"] } tracing-core = { version = "0.1" } url = { version = "2", features = ["serde"] } From b4d87b9dfedc3d08d00091e2407d0996a9ea2026 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 22 Jan 2025 11:10:43 +0100 Subject: [PATCH 238/583] fix(tests): actually enable pipelinig by default in the test suite (#10472) ## Problem PR #9993 was supposed to enable `page_service_pipelining` by default for all `NeonEnv`s, but this was ineffective in our CI environment. Thus, CI Python-based tests and benchmarks, unless explicitly configuring pipelining, were still using serial protocol handling. ## Analysis The root cause was that in our CI environment, `config.compatibility_neon_binpath` is always Truthy. It's not in local environments, which is why this slipped through in local testing. Lesson: always add a log line ot pageserver startup and spot-check tests to ensure the intended default is picked up. ## Summary of changes Fix it. 
Since enough time has passed, the compatiblity snapshot contains a recent enough software version so we don't need to worry about `compatibility_neon_binpath` anymore. ## Future Work The question how to add a new default except for compatibliity tests, which is what the broken code was supposed to do, is still unsolved. Slack discussion: https://neondb.slack.com/archives/C059ZC138NR/p1737490501941309 --- test_runner/fixtures/neon_fixtures.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index a01cb47984..d79c2a5ea8 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1115,13 +1115,11 @@ class NeonEnv: # Batching (https://github.com/neondatabase/neon/issues/9377): # enable batching by default in tests and benchmarks. - # Compat tests are exempt because old versions fail to parse the new config. - if not config.compatibility_neon_binpath: - ps_cfg["page_service_pipelining"] = { - "mode": "pipelined", - "execution": "concurrent-futures", - "max_batch_size": 32, - } + ps_cfg["page_service_pipelining"] = { + "mode": "pipelined", + "execution": "concurrent-futures", + "max_batch_size": 32, + } if self.pageserver_virtual_file_io_engine is not None: ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine From b31ce14083d3d4130e1190daadbf085082ac7280 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 22 Jan 2025 13:28:26 +0100 Subject: [PATCH 239/583] initial logical size calculation: always poll to completion (#10471) # Refs - extracted from https://github.com/neondatabase/neon/pull/9353 # Problem Before this PR, when task_mgr shutdown is signalled, e.g. during pageserver shutdown or Tenant shutdown, initial logical size calculation stops polling and drops the future that represents the calculation. This is against the current policy that we poll all futures to completion. This became apparent during development of concurrent IO which warns if we drop a `Timeline::get_vectored` future that still has in-flight IOs. We may revise the policy in the future, but, right now initial logical size calculation is the only part of the codebase that doesn't adhere to the policy, so let's fix it. ## Code Changes - make sensitive exclusively to `Timeline::cancel` - This should be sufficient for all cases of shutdowns; the sensitivity to task_mgr shutdown is unnecessary. - this broke the various cancel tests in `test_timeline_size.py`, e.g., `test_timeline_initial_logical_size_calculation_cancellation` - the tests would time out because the await point was not sensitive to cancellation - to fix this, refactor `pausable_failpoint` so that it accepts a cancellation token - side note: we _really_ should write our own failpoint library; maybe after we get heap-allocated RequestContext, we can plumb failpoints through there. --- libs/utils/src/failpoint_support.rs | 54 ++++++++++++++++------- pageserver/src/tenant/tasks.rs | 7 ++- pageserver/src/tenant/timeline.rs | 67 +++++++++++------------------ 3 files changed, 66 insertions(+), 62 deletions(-) diff --git a/libs/utils/src/failpoint_support.rs b/libs/utils/src/failpoint_support.rs index 701ba2d42c..272c6ebb26 100644 --- a/libs/utils/src/failpoint_support.rs +++ b/libs/utils/src/failpoint_support.rs @@ -11,31 +11,55 @@ use tracing::*; /// Declare a failpoint that can use to `pause` failpoint action. /// We don't want to block the executor thread, hence, spawn_blocking + await. 
+/// +/// Optionally pass a cancellation token, and this failpoint will drop out of +/// its pause when the cancellation token fires. This is useful for testing +/// cases where we would like to block something, but test its clean shutdown behavior. +/// The macro evaluates to a Result in that case, where Ok(()) is the case +/// where the failpoint was not paused, and Err() is the case where cancellation +/// token fired while evaluating the failpoint. +/// +/// Remember to unpause the failpoint in the test; until that happens, one of the +/// limited number of spawn_blocking thread pool threads is leaked. #[macro_export] macro_rules! pausable_failpoint { - ($name:literal) => { + ($name:literal) => {{ if cfg!(feature = "testing") { - tokio::task::spawn_blocking({ - let current = tracing::Span::current(); + let cancel = ::tokio_util::sync::CancellationToken::new(); + let _ = $crate::pausable_failpoint!($name, &cancel); + } + }}; + ($name:literal, $cancel:expr) => {{ + if cfg!(feature = "testing") { + let failpoint_fut = ::tokio::task::spawn_blocking({ + let current = ::tracing::Span::current(); move || { let _entered = current.entered(); - tracing::info!("at failpoint {}", $name); - fail::fail_point!($name); + ::tracing::info!("at failpoint {}", $name); + ::fail::fail_point!($name); + } + }); + let cancel_fut = async move { + $cancel.cancelled().await; + }; + ::tokio::select! { + res = failpoint_fut => { + res.expect("spawn_blocking"); + // continue with execution + Ok(()) + }, + _ = cancel_fut => { + Err(()) } - }) - .await - .expect("spawn_blocking"); - } - }; - ($name:literal, $cond:expr) => { - if cfg!(feature = "testing") { - if $cond { - pausable_failpoint!($name) } + } else { + Ok(()) } - }; + }}; } +pub use pausable_failpoint; + /// use with fail::cfg("$name", "return(2000)") /// /// The effect is similar to a "sleep(2000)" action, i.e. we sleep for the diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 0118a5ce5f..3725e2f7fc 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -67,10 +67,9 @@ pub(crate) async fn concurrent_background_tasks_rate_limit_permit( ) -> tokio::sync::SemaphorePermit<'static> { let _guard = crate::metrics::BACKGROUND_LOOP_SEMAPHORE.measure_acquisition(loop_kind); - pausable_failpoint!( - "initial-size-calculation-permit-pause", - loop_kind == BackgroundLoopKind::InitialLogicalSizeCalculation - ); + if loop_kind == BackgroundLoopKind::InitialLogicalSizeCalculation { + pausable_failpoint!("initial-size-calculation-permit-pause"); + } // TODO: assert that we run on BACKGROUND_RUNTIME; requires tokio_unstable Handle::id(); match CONCURRENT_BACKGROUND_TASKS.acquire().await { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 3245f23a28..e83b516d79 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -60,6 +60,7 @@ use utils::{ use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta}; use std::sync::atomic::Ordering as AtomicOrdering; +use std::sync::OnceLock; use std::sync::{Arc, Mutex, RwLock, Weak}; use std::time::{Duration, Instant, SystemTime}; use std::{ @@ -72,7 +73,6 @@ use std::{ collections::btree_map::Entry, ops::{Deref, Range}, }; -use std::{pin::pin, sync::OnceLock}; use crate::{ aux_file::AuxFileSizeEstimator, @@ -2804,12 +2804,10 @@ impl Timeline { "initial size calculation", // NB: don't log errors here, task_mgr will do that. 
async move { - let cancel = task_mgr::shutdown_token(); self_clone .initial_logical_size_calculation_task( initial_part_end, cancel_wait_for_background_loop_concurrency_limit_semaphore, - cancel, background_ctx, ) .await; @@ -2819,11 +2817,21 @@ impl Timeline { ); } + /// # Cancellation + /// + /// This method is sensitive to `Timeline::cancel`. + /// + /// It is _not_ sensitive to task_mgr::shutdown_token(). + /// + /// # Cancel-Safety + /// + /// It does Timeline IO, hence this should be polled to completion because + /// we could be leaving in-flight IOs behind, which is safe, but annoying + /// to reason about. async fn initial_logical_size_calculation_task( self: Arc, initial_part_end: Lsn, skip_concurrency_limiter: CancellationToken, - cancel: CancellationToken, background_ctx: RequestContext, ) { scopeguard::defer! { @@ -2836,7 +2844,6 @@ impl Timeline { let self_ref = &self; let skip_concurrency_limiter = &skip_concurrency_limiter; async move { - let cancel = task_mgr::shutdown_token(); let wait_for_permit = super::tasks::concurrent_background_tasks_rate_limit_permit( BackgroundLoopKind::InitialLogicalSizeCalculation, background_ctx, @@ -2850,9 +2857,6 @@ impl Timeline { _ = self_ref.cancel.cancelled() => { return Err(CalculateLogicalSizeError::Cancelled); } - _ = cancel.cancelled() => { - return Err(CalculateLogicalSizeError::Cancelled); - }, () = skip_concurrency_limiter.cancelled() => { // Some action that is part of a end user interaction requested logical size // => break out of the rate limit @@ -2911,22 +2915,18 @@ impl Timeline { ) .expect("10min < 1hour"), ); - tokio::time::sleep(sleep_duration).await; + tokio::select! { + _ = tokio::time::sleep(sleep_duration) => {} + _ = self.cancel.cancelled() => return ControlFlow::Break(()), + } } } } }; - let (calculated_size, metrics_guard) = tokio::select! { - res = retrying => { - match res { - ControlFlow::Continue(calculated_size) => calculated_size, - ControlFlow::Break(()) => return, - } - } - _ = cancel.cancelled() => { - return; - } + let (calculated_size, metrics_guard) = match retrying.await { + ControlFlow::Continue(calculated_size) => calculated_size, + ControlFlow::Break(()) => return, }; // we cannot query current_logical_size.current_size() to know the current @@ -2982,9 +2982,6 @@ impl Timeline { receiver } - /// # Cancel-Safety - /// - /// This method is cancellation-safe. #[instrument(skip_all)] async fn logical_size_calculation_task( self: &Arc, @@ -3002,32 +2999,13 @@ impl Timeline { .enter() .map_err(|_| CalculateLogicalSizeError::Cancelled)?; - let self_calculation = Arc::clone(self); - - let mut calculation = pin!(async { - let ctx = ctx.attached_child(); - self_calculation - .calculate_logical_size(lsn, cause, &guard, &ctx) - .await - }); - - tokio::select! { - res = &mut calculation => { res } - _ = self.cancel.cancelled() => { - debug!("cancelling logical size calculation for timeline shutdown"); - calculation.await - } - } + self.calculate_logical_size(lsn, cause, &guard, ctx).await } /// Calculate the logical size of the database at the latest LSN. /// /// NOTE: counted incrementally, includes ancestors. This can be a slow operation, /// especially if we need to download remote layers. - /// - /// # Cancel-Safety - /// - /// This method is cancellation-safe. 
async fn calculate_logical_size( &self, up_to_lsn: Lsn, @@ -3040,7 +3018,10 @@ impl Timeline { self.timeline_id, up_to_lsn ); - pausable_failpoint!("timeline-calculate-logical-size-pause"); + if let Err(()) = pausable_failpoint!("timeline-calculate-logical-size-pause", &self.cancel) + { + return Err(CalculateLogicalSizeError::Cancelled); + } // See if we've already done the work for initial size calculation. // This is a short-cut for timelines that are mostly unused. From 881e351f693ae1d177d57b171569d327d0abcfe8 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Wed, 22 Jan 2025 13:38:23 +0100 Subject: [PATCH 240/583] feat(compute): Allow installing both 0.8.0 and 0.7.4 pgvector (#10345) ## Problem Both these versions are binary compatible, but the way pgvector structures the SQL files forbids installing 0.7.4 if you have a 0.8.0 distribution. Yet, some users may need a previous version for backward compatibility, e.g., restoring the dump. See this thread for discussion https://neondb.slack.com/archives/C04DGM6SMTM/p1735911490242919?thread_ts=1731343604.259169&cid=C04DGM6SMTM ## Summary of changes Put `vector--0.7.4.sql` file into compute image to allow installing this version as well. Tested on staging and it seems to be working as expected: ```sql select * from pg_available_extensions where name = 'vector'; name | default_version | installed_version | comment --------+-----------------+-------------------+------------------------------------------------------ vector | 0.8.0 | (null) | vector data type and ivfflat and hnsw access methods create extension vector version '0.7.4'; select * from pg_available_extensions where name = 'vector'; name | default_version | installed_version | comment --------+-----------------+-------------------+------------------------------------------------------ vector | 0.8.0 | 0.7.4 | vector data type and ivfflat and hnsw access methods alter extension vector update; select * from pg_available_extensions where name = 'vector'; name | default_version | installed_version | comment --------+-----------------+-------------------+------------------------------------------------------ vector | 0.8.0 | 0.8.0 | vector data type and ivfflat and hnsw access methods drop extension vector; create extension vector; select * from pg_available_extensions where name = 'vector'; name | default_version | installed_version | comment --------+-----------------+-------------------+------------------------------------------------------ vector | 0.8.0 | 0.8.0 | vector data type and ivfflat and hnsw access methods ``` If we find out it's a good approach, we can adopt the same for other extensions with a stable ABI -- support both `current` and `current - 1` releases. --- compute/compute-node.Dockerfile | 2 ++ compute/patches/pgvector.patch | 22 +++++++++++++++++++--- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index dbe7de046b..706c947008 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -360,6 +360,8 @@ COPY compute/patches/pgvector.patch /pgvector.patch RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.8.0.tar.gz -O pgvector.tar.gz && \ echo "867a2c328d4928a5a9d6f052cd3bc78c7d60228a9b914ad32aa3db88e9de27b0 pgvector.tar.gz" | sha256sum --check && \ mkdir pgvector-src && cd pgvector-src && tar xzf ../pgvector.tar.gz --strip-components=1 -C . 
&& \ + wget https://github.com/pgvector/pgvector/raw/refs/tags/v0.7.4/sql/vector.sql -O ./sql/vector--0.7.4.sql && \ + echo "10218d05dc02299562252a9484775178b14a1d8edb92a2d1672ef488530f7778 ./sql/vector--0.7.4.sql" | sha256sum --check && \ patch -p1 < /pgvector.patch && \ make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" && \ make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" install && \ diff --git a/compute/patches/pgvector.patch b/compute/patches/pgvector.patch index 3e1ffcaaaf..da41c86140 100644 --- a/compute/patches/pgvector.patch +++ b/compute/patches/pgvector.patch @@ -1,8 +1,24 @@ +diff --git a/Makefile b/Makefile +index 7a4b88c..56678af 100644 +--- a/Makefile ++++ b/Makefile +@@ -3,7 +3,10 @@ EXTVERSION = 0.8.0 + + MODULE_big = vector + DATA = $(wildcard sql/*--*--*.sql) +-DATA_built = sql/$(EXTENSION)--$(EXTVERSION).sql ++# This change is needed to install different per-version SQL files ++# like pgvector--0.8.0.sql and pgvector--0.7.4.sql ++# The corresponding file is downloaded during the Docker image build process ++DATA_built = sql/$(EXTENSION)--$(EXTVERSION).sql sql/vector--0.7.4.sql + OBJS = src/bitutils.o src/bitvec.o src/halfutils.o src/halfvec.o src/hnsw.o src/hnswbuild.o src/hnswinsert.o src/hnswscan.o src/hnswutils.o src/hnswvacuum.o src/ivfbuild.o src/ivfflat.o src/ivfinsert.o src/ivfkmeans.o src/ivfscan.o src/ivfutils.o src/ivfvacuum.o src/sparsevec.o src/vector.o + HEADERS = src/halfvec.h src/sparsevec.h src/vector.h + diff --git a/src/hnswbuild.c b/src/hnswbuild.c -index dcfb2bd..d5189ee 100644 +index b667478..fc1897c 100644 --- a/src/hnswbuild.c +++ b/src/hnswbuild.c -@@ -860,9 +860,17 @@ HnswParallelBuildMain(dsm_segment *seg, shm_toc *toc) +@@ -843,9 +843,17 @@ HnswParallelBuildMain(dsm_segment *seg, shm_toc *toc) hnswarea = shm_toc_lookup(toc, PARALLEL_KEY_HNSW_AREA, false); @@ -20,7 +36,7 @@ index dcfb2bd..d5189ee 100644 /* Close relations within worker */ index_close(indexRel, indexLockmode); table_close(heapRel, heapLockmode); -@@ -1117,12 +1125,38 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo, +@@ -1100,12 +1108,38 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo, SeedRandom(42); #endif From 414ed82c1fec460a9e2e460a32ef1122aac8a32c Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 22 Jan 2025 15:30:23 +0000 Subject: [PATCH 241/583] pageserver: issue concurrent IO on the read path (#9353) ## Refs - Epic: https://github.com/neondatabase/neon/issues/9378 Co-authored-by: Vlad Lazar Co-authored-by: Christian Schwarz ## Problem The read path does its IOs sequentially. This means that if N values need to be read to reconstruct a page, we will do N IOs and getpage latency is `O(N*IoLatency)`. ## Solution With this PR we gain the ability to issue IO concurrently within one layer visit **and** to move on to the next layer without waiting for IOs from the previous visit to complete. This is an evolved version of the work done at the Lisbon hackathon, cf https://github.com/neondatabase/neon/pull/9002. ## Design ### `will_init` now sourced from disk btree index keys On the algorithmic level, the only change is that the `get_values_reconstruct_data` now sources `will_init` from the disk btree index key (which is PS-page_cache'd), instead of from the `Value`, which is only available after the IO completes. 
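As a toy illustration of why this matters (hypothetical types only; the real change threads `will_init` through `BlobMeta` and the layer index readers), the per-key planning step can now decide where traversal stops purely from index data, so all reads for a key can be handed to the IO machinery before any of them completes:

```rust
// Hedged sketch, not the pageserver's actual data structures.
#[allow(dead_code)]
#[derive(Clone, Copy)]
struct IndexEntry {
    lsn: u64,
    blob_offset: u64,
    will_init: bool, // read from the (page-cached) index, not from the blob
}

/// Plan the value reads for one key, newest entry first. The caller can
/// dispatch all returned offsets concurrently and reconstruct later.
fn plan_reads(entries_newest_first: &[IndexEntry]) -> Vec<u64> {
    let mut to_read = Vec::new();
    for entry in entries_newest_first {
        to_read.push(entry.blob_offset);
        if entry.will_init {
            // Page image or init record: older entries are irrelevant,
            // and we learned that without waiting for any value IO.
            break;
        }
    }
    to_read
}

fn main() {
    let entries = [
        IndexEntry { lsn: 40, blob_offset: 4096, will_init: false }, // delta
        IndexEntry { lsn: 30, blob_offset: 2048, will_init: true },  // image
        IndexEntry { lsn: 20, blob_offset: 1024, will_init: false }, // never read
    ];
    assert_eq!(plan_reads(&entries), vec![4096, 2048]);
}
```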
### Concurrent IOs, Submission & Completion To separate IO submission from waiting for its completion, while simultaneously feature-gating the change, we introduce the notion of an `IoConcurrency` struct through which IO futures are "spawned". An IO is an opaque future, and waiting for completions is handled through `tokio::sync::oneshot` channels. The oneshot Receiver's take the place of the `img` and `records` fields inside `VectoredValueReconstructState`. When we're done visiting all the layers and submitting all the IOs along the way we concurrently `collect_pending_ios` for each value, which means for each value there is a future that awaits all the oneshot receivers and then calls into walredo to reconstruct the page image. Walredo is now invoked concurrently for each value instead of sequentially. Walredo itself remains unchanged. The spawned IO futures are driven to completion by a sidecar tokio task that is separate from the task that performs all the layer visiting and spawning of IOs. That tasks receives the IO futures via an unbounded mpsc channel and drives them to completion inside a `FuturedUnordered`. (The behavior from before this PR is available through `IoConcurrency::Sequential`, which awaits the IO futures in place, without "spawning" or "submitting" them anywhere.) #### Alternatives Explored A few words on the rationale behind having a sidecar *task* and what alternatives were considered. One option is to queue up all IO futures in a FuturesUnordered that is polled the first time when we `collect_pending_ios`. Firstly, the IO futures are opaque, compiler-generated futures that need to be polled at least once to submit their IO. "At least once" because tokio-epoll-uring may not be able to submit the IO to the kernel on first poll right away. Second, there are deadlocks if we don't drive the IO futures to completion independently of the spawning task. The reason is that both the IO futures and the spawning task may hold some _and_ try to acquire _more_ shared limited resources. For example, both spawning task and IO future may try to acquire * a VirtualFile file descriptor cache slot async mutex (observed during impl) * a tokio-epoll-uring submission slot (observed during impl) * a PageCache slot (currently this is not the case but we may move more code into the IO futures in the future) Another option is to spawn a short-lived `tokio::task` for each IO future. We implemented and benchmarked it during development, but found little throughput improvement and moderate mean & tail latency degradation. Concerns about pressure on the tokio scheduler made us discard this variant. The sidecar task could be obsoleted if the IOs were not arbitrary code but a well-defined struct. However, 1. the opaque futures approach taken in this PR allows leaving the existing code unchanged, which 2. allows us to implement the `IoConcurrency::Sequential` mode for feature-gating the change. Once the new mode sidecar task implementation is rolled out everywhere, and `::Sequential` removed, we can think about a descriptive submission & completion interface. The problems around deadlocks pointed out earlier will need to be solved then. For example, we could eliminate VirtualFile file descriptor cache and tokio-epoll-uring slots. The latter has been drafted in https://github.com/neondatabase/tokio-epoll-uring/pull/63. See the lengthy doc comment on `spawn_io()` for more details. 
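To make that shape concrete, here is a heavily distilled, self-contained sketch of the sidecar pattern (hypothetical names throughout; the real `IoConcurrency` machinery adds gating, shutdown handling and warnings about dangling IOs): opaque IO futures travel over an unbounded mpsc to a long-lived task that drives them in a `FuturesUnordered`, while results come back through per-value oneshot channels.

```rust
// Hedged sketch only, assuming tokio + futures as dependencies.
use futures::stream::{FuturesUnordered, StreamExt};
use std::{future::Future, pin::Pin};
use tokio::sync::{mpsc, oneshot};

type IoFuture = Pin<Box<dyn Future<Output = ()> + Send>>;

/// Cheap, clonable handle through which the layer-visiting task "spawns" IOs.
#[derive(Clone)]
struct SidecarHandle {
    tx: mpsc::UnboundedSender<IoFuture>,
}

impl SidecarHandle {
    fn spawn_io<F>(&self, fut: F)
    where
        F: Future<Output = ()> + Send + 'static,
    {
        // The real code is more careful about shutdown; here we simply drop
        // the IO if the sidecar is gone.
        let _ = self.tx.send(Box::pin(fut));
    }
}

/// Long-lived sidecar task: drives all submitted IO futures to completion,
/// independently of whoever submitted them.
fn spawn_sidecar() -> SidecarHandle {
    let (tx, mut rx) = mpsc::unbounded_channel::<IoFuture>();
    tokio::spawn(async move {
        let mut inflight = FuturesUnordered::new();
        loop {
            tokio::select! {
                maybe_fut = rx.recv() => match maybe_fut {
                    Some(fut) => inflight.push(fut),
                    None => break, // every handle dropped
                },
                // When empty, next() yields None, which just disables this
                // branch until new IOs are pushed.
                Some(()) = inflight.next() => {}
            }
        }
        // Drain whatever is still in flight so nothing is left dangling.
        while inflight.next().await.is_some() {}
    });
    SidecarHandle { tx }
}

#[tokio::main]
async fn main() {
    let io = spawn_sidecar();
    // Traversal side: submit an IO now, pick up its result later through a
    // oneshot, mirroring how values are buffered per key.
    let (done_tx, done_rx) = oneshot::channel::<Vec<u8>>();
    io.spawn_io(async move {
        // stand-in for an actual on-disk value read
        let _ = done_tx.send(b"page image bytes".to_vec());
    });
    // ... keep visiting layers and submitting more IOs here ...
    let img = done_rx.await.expect("sidecar drove the IO to completion");
    assert_eq!(img, b"page image bytes".to_vec());
}
```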
### Error handling There are two error classes during reconstruct data retrieval: * traversal errors: index lookup, move to next layer, and the like * value read IO errors A traversal error fails the entire get_vectored request, as before this PR. A value read error only fails that value. In any case, we preserve the existing behavior that once `get_vectored` returns, all IOs are done. Panics and failing to poll `get_vectored` to completion will leave the IOs dangling, which is safe but shouldn't happen, and so, a rate-limited log statement will be emitted at warning level. There is a doc comment on `collect_pending_ios` giving more code-level details and rationale. ### Feature Gating The new behavior is opt-in via pageserver config. The `Sequential` mode is the default. The only significant change in `Sequential` mode compared to before this PR is the buffering of results in the `oneshot`s. ## Code-Level Changes Prep work: * Make `GateGuard` clonable. Core Feature: * Traversal code: track `will_init` in `BlobMeta` and source it from the Delta/Image/InMemory layer index, instead of determining `will_init` after we've read the value. This avoids having to read the value to determine whether traversal can stop. * Introduce `IoConcurrency` & its sidecar task. * `IoConcurrency` is the clonable handle. * It connects to the sidecar task via an `mpsc`. * Plumb through `IoConcurrency` from high level code to the individual layer implementations' `get_values_reconstruct_data`. We piggy-back on the `ValuesReconstructState` for this. * The sidecar task should be long-lived, so, `IoConcurrency` needs to be rooted up "high" in the call stack. * Roots as of this PR: * `page_service`: outside of pagestream loop * `create_image_layers`: when it is called * `basebackup`(only auxfiles + replorigin + SLRU segments) * Code with no roots that uses `IoConcurrency::sequential` * any `Timeline::get` call * `collect_keyspace` is a good example * follow-up: https://github.com/neondatabase/neon/issues/10460 * `TimelineAdaptor` code used by the compaction simulator, unused in practive * `ingest_xlog_dbase_create` * Transform Delta/Image/InMemoryLayer to * do their values IO in a distinct `async {}` block * extend the residence of the Delta/Image layer until the IO is done * buffer their results in a `oneshot` channel instead of straight in `ValuesReconstructState` * the `oneshot` channel is wrapped in `OnDiskValueIo` / `OnDiskValueIoWaiter` types that aid in expressiveness and are used to keep track of in-flight IOs so we can print warnings if we leave them dangling. * Change `ValuesReconstructState` to hold the receiving end of the `oneshot` channel aka `OnDiskValueIoWaiter`. * Change `get_vectored_impl` to `collect_pending_ios` and issue walredo concurrently, in a `FuturesUnordered`. Testing / Benchmarking: * Support queue-depth in pagebench for manual benchmarkinng. * Add test suite support for setting concurrency mode ps config field via a) an env var and b) via NeonEnvBuilder. * Hacky helper to have sidecar-based IoConcurrency in tests. This will be cleaned up later. More benchmarking will happen post-merge in nightly benchmarks, plus in staging/pre-prod. Some intermediate helpers for manual benchmarking have been preserved in https://github.com/neondatabase/neon/pull/10466 and will be landed in later PRs. (L0 layer stack generator!) 
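Since the feature gate is opt-in via pageserver config, a hedged illustration of what opting in looks like, inferred only from the serde attributes on `GetVectoredConcurrentIo` in the diff further below (tag field `mode`, kebab-case variant names), may help when reading the test-suite plumbing:

```rust
// Sketch: mirrors the enum added to libs/pageserver_api/src/config.rs in
// this patch; serde_json is used here purely as a convenient demo format.
use serde::Deserialize;

#[derive(Debug, PartialEq, Deserialize)]
#[serde(tag = "mode", rename_all = "kebab-case")]
enum GetVectoredConcurrentIo {
    Sequential,
    SidecarTask,
}

fn main() {
    let parsed: GetVectoredConcurrentIo =
        serde_json::from_str(r#"{ "mode": "sidecar-task" }"#).unwrap();
    assert_eq!(parsed, GetVectoredConcurrentIo::SidecarTask);
    // In pageserver.toml the same shape should correspond, roughly, to:
    //   [get_vectored_concurrent_io]
    //   mode = "sidecar-task"
}
```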
Drive-By: * test suite actually didn't enable batching by default because `config.compatibility_neon_binpath` is always Truthy in our CI environment => https://neondb.slack.com/archives/C059ZC138NR/p1737490501941309 * initial logical size calculation wasn't always polled to completion, which was surfaced through the added WARN logs emitted when dropping a `ValuesReconstructState` that still has inflight IOs. * remove the timing histograms `pageserver_getpage_get_reconstruct_data_seconds` and `pageserver_getpage_reconstruct_seconds` because with planning, value read IO, and walredo happening concurrently, one can no longer attribute latency to any one of them; we'll revisit this when Vlad's work on tracing/sampling through RequestContext lands. * remove code related to `get_cached_lsn()`. The logic around this has been dead at runtime for a long time, ever since the removal of the materialized page cache in #8105. ## Testing Unit tests use the sidecar task by default and run both modes in CI. Python regression tests and benchmarks also use the sidecar task by default. We'll test more in staging and possibly preprod. # Future Work Please refer to the parent epic for the full plan. The next step will be to fold the plumbing of IoConcurrency into RequestContext so that the function signatures get cleaned up. Once `Sequential` isn't used anymore, we can take the next big leap which is replacing the opaque IOs with structs that have well-defined semantics. --------- Co-authored-by: Christian Schwarz --- .github/workflows/_build-and-test-locally.yml | 10 +- Cargo.lock | 4 +- libs/pageserver_api/src/config.rs | 25 + libs/utils/src/env.rs | 26 +- libs/utils/src/sync/gate.rs | 53 +- .../pagebench/src/cmd/getpage_latest_lsn.rs | 61 +- pageserver/src/basebackup.rs | 23 +- pageserver/src/bin/pageserver.rs | 1 + pageserver/src/config.rs | 4 + pageserver/src/http/routes.rs | 10 +- pageserver/src/metrics.rs | 68 -- pageserver/src/page_service.rs | 63 +- pageserver/src/pgdatadir_mapping.rs | 45 +- pageserver/src/tenant.rs | 68 +- pageserver/src/tenant/storage_layer.rs | 713 +++++++++++++++--- .../src/tenant/storage_layer/delta_layer.rs | 179 ++--- .../src/tenant/storage_layer/image_layer.rs | 112 +-- .../tenant/storage_layer/inmemory_layer.rs | 106 ++- pageserver/src/tenant/storage_layer/layer.rs | 18 +- .../src/tenant/storage_layer/layer/tests.rs | 19 +- pageserver/src/tenant/timeline.rs | 133 ++-- pageserver/src/tenant/timeline/compaction.rs | 3 +- pageserver/src/tenant/vectored_blob_io.rs | 103 ++- pageserver/src/walingest.rs | 155 +++- test_runner/fixtures/common_types.py | 4 + test_runner/fixtures/metrics.py | 4 - test_runner/fixtures/neon_fixtures.py | 29 + test_runner/fixtures/parametrize.py | 5 + test_runner/regress/test_compatibility.py | 2 + 29 files changed, 1490 insertions(+), 556 deletions(-) diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 4263bacce8..2daed90386 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -229,8 +229,13 @@ jobs: ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E '!package(pageserver)' # run pageserver tests with different settings - for io_engine in std-fs tokio-epoll-uring ; do - NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(pageserver)' + for get_vectored_concurrent_io in sequential sidecar-task; do + for io_engine in std-fs tokio-epoll-uring ; 
do + NEON_PAGESERVER_UNIT_TEST_GET_VECTORED_CONCURRENT_IO=$get_vectored_concurrent_io \ + NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine \ + ${cov_prefix} \ + cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(pageserver)' + done done # Run separate tests for real S3 @@ -314,6 +319,7 @@ jobs: CHECK_ONDISK_DATA_COMPATIBILITY: nonempty BUILD_TAG: ${{ inputs.build-tag }} PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring + PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task USE_LFC: ${{ matrix.lfc_state == 'with-lfc' && 'true' || 'false' }} # Temporary disable this step until we figure out why it's so flaky diff --git a/Cargo.lock b/Cargo.lock index 2020c417f0..1f090a27e4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6774,7 +6774,7 @@ dependencies = [ [[package]] name = "tokio-epoll-uring" version = "0.1.0" -source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#33e00106a268644d02ba0461bbd64476073b0ee1" +source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#781989bb540a1408b0b93daa1e9d1fa452195497" dependencies = [ "futures", "nix 0.26.4", @@ -7369,7 +7369,7 @@ dependencies = [ [[package]] name = "uring-common" version = "0.1.0" -source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#33e00106a268644d02ba0461bbd64476073b0ee1" +source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#781989bb540a1408b0b93daa1e9d1fa452195497" dependencies = [ "bytes", "io-uring", diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index f0aeb00736..4982c6233d 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -120,6 +120,7 @@ pub struct ConfigToml { pub no_sync: Option, pub wal_receiver_protocol: PostgresClientProtocol, pub page_service_pipelining: PageServicePipeliningConfig, + pub get_vectored_concurrent_io: GetVectoredConcurrentIo, } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -158,6 +159,25 @@ pub enum PageServiceProtocolPipelinedExecutionStrategy { Tasks, } +#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[serde(tag = "mode", rename_all = "kebab-case")] +#[serde(deny_unknown_fields)] +pub enum GetVectoredConcurrentIo { + /// The read path is fully sequential: layers are visited + /// one after the other and IOs are issued and waited upon + /// from the same task that traverses the layers. + Sequential, + /// The read path still traverses layers sequentially, and + /// index blocks will be read into the PS PageCache from + /// that task, with waiting. + /// But data IOs are dispatched and waited upon from a sidecar + /// task so that the traversing task can continue to traverse + /// layers while the IOs are in flight. + /// If the PS PageCache miss rate is low, this improves + /// throughput dramatically. + SidecarTask, +} + pub mod statvfs { pub mod mock { #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -464,6 +484,11 @@ impl Default for ConfigToml { execution: PageServiceProtocolPipelinedExecutionStrategy::ConcurrentFutures, }) }, + get_vectored_concurrent_io: if !cfg!(test) { + GetVectoredConcurrentIo::Sequential + } else { + GetVectoredConcurrentIo::SidecarTask + }, } } } diff --git a/libs/utils/src/env.rs b/libs/utils/src/env.rs index b3e326bfd0..a1bcec9229 100644 --- a/libs/utils/src/env.rs +++ b/libs/utils/src/env.rs @@ -2,6 +2,7 @@ use std::{fmt::Display, str::FromStr}; +/// For types `V` that implement [`FromStr`]. 
pub fn var(varname: &str) -> Option where V: FromStr, @@ -10,7 +11,9 @@ where match std::env::var(varname) { Ok(s) => Some( s.parse() - .map_err(|e| format!("failed to parse env var {varname}: {e:#}")) + .map_err(|e| { + format!("failed to parse env var {varname} using FromStr::parse: {e:#}") + }) .unwrap(), ), Err(std::env::VarError::NotPresent) => None, @@ -19,3 +22,24 @@ where } } } + +/// For types `V` that implement [`serde::de::DeserializeOwned`]. +pub fn var_serde_json_string(varname: &str) -> Option +where + V: serde::de::DeserializeOwned, +{ + match std::env::var(varname) { + Ok(s) => Some({ + let value = serde_json::Value::String(s); + serde_json::from_value(value) + .map_err(|e| { + format!("failed to parse env var {varname} as a serde_json json string: {e:#}") + }) + .unwrap() + }), + Err(std::env::VarError::NotPresent) => None, + Err(std::env::VarError::NotUnicode(_)) => { + panic!("env var {varname} is not unicode") + } + } +} diff --git a/libs/utils/src/sync/gate.rs b/libs/utils/src/sync/gate.rs index 16ec563fa7..0a1ed81621 100644 --- a/libs/utils/src/sync/gate.rs +++ b/libs/utils/src/sync/gate.rs @@ -64,6 +64,12 @@ pub struct GateGuard { gate: Arc, } +impl GateGuard { + pub fn try_clone(&self) -> Result { + Gate::enter_impl(self.gate.clone()) + } +} + impl Drop for GateGuard { fn drop(&mut self) { if self.gate.closing.load(Ordering::Relaxed) { @@ -107,11 +113,11 @@ impl Gate { /// to avoid blocking close() indefinitely: typically types that contain a Gate will /// also contain a CancellationToken. pub fn enter(&self) -> Result { - let permit = self - .inner - .sem - .try_acquire() - .map_err(|_| GateError::GateClosed)?; + Self::enter_impl(self.inner.clone()) + } + + fn enter_impl(gate: Arc) -> Result { + let permit = gate.sem.try_acquire().map_err(|_| GateError::GateClosed)?; // we now have the permit, let's disable the normal raii functionality and leave // "returning" the permit to our GateGuard::drop. 
@@ -122,7 +128,7 @@ impl Gate { Ok(GateGuard { span_at_enter: tracing::Span::current(), - gate: self.inner.clone(), + gate, }) } @@ -252,4 +258,39 @@ mod tests { // Attempting to enter() is still forbidden gate.enter().expect_err("enter should fail finishing close"); } + + #[tokio::test(start_paused = true)] + async fn clone_gate_guard() { + let gate = Gate::default(); + let forever = Duration::from_secs(24 * 7 * 365); + + let guard1 = gate.enter().expect("gate isn't closed"); + + let guard2 = guard1.try_clone().expect("gate isn't clsoed"); + + let mut close_fut = std::pin::pin!(gate.close()); + + tokio::time::timeout(forever, &mut close_fut) + .await + .unwrap_err(); + + // we polled close_fut once, that should prevent all later enters and clones + gate.enter().unwrap_err(); + guard1.try_clone().unwrap_err(); + guard2.try_clone().unwrap_err(); + + // guard2 keeps gate open even if guard1 is closed + drop(guard1); + tokio::time::timeout(forever, &mut close_fut) + .await + .unwrap_err(); + + drop(guard2); + + // now that the last guard is dropped, closing should complete + close_fut.await; + + // entering is still forbidden + gate.enter().expect_err("enter should stilll fail"); + } } diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs index 9f3984f1bd..a60efc7567 100644 --- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs +++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs @@ -13,7 +13,7 @@ use rand::prelude::*; use tokio::task::JoinSet; use tracing::info; -use std::collections::HashSet; +use std::collections::{HashSet, VecDeque}; use std::future::Future; use std::num::NonZeroUsize; use std::pin::Pin; @@ -63,6 +63,10 @@ pub(crate) struct Args { #[clap(long)] set_io_mode: Option, + /// Queue depth generated in each client. 
+ #[clap(long, default_value = "1")] + queue_depth: NonZeroUsize, + targets: Option>, } @@ -298,6 +302,7 @@ async fn main_impl( start_work_barrier.wait().await; let client_start = Instant::now(); let mut ticks_processed = 0; + let mut inflight = VecDeque::new(); while !cancel.is_cancelled() { // Detect if a request took longer than the RPS rate if let Some(period) = &rps_period { @@ -311,31 +316,37 @@ async fn main_impl( ticks_processed = periods_passed_until_now; } - let start = Instant::now(); - let req = { - let mut rng = rand::thread_rng(); - let r = &ranges[weights.sample(&mut rng)]; - let key: i128 = rng.gen_range(r.start..r.end); - let key = Key::from_i128(key); - assert!(key.is_rel_block_key()); - let (rel_tag, block_no) = key - .to_rel_block() - .expect("we filter non-rel-block keys out above"); - PagestreamGetPageRequest { - hdr: PagestreamRequest { - reqid: 0, - request_lsn: if rng.gen_bool(args.req_latest_probability) { - Lsn::MAX - } else { - r.timeline_lsn + while inflight.len() < args.queue_depth.get() { + let start = Instant::now(); + let req = { + let mut rng = rand::thread_rng(); + let r = &ranges[weights.sample(&mut rng)]; + let key: i128 = rng.gen_range(r.start..r.end); + let key = Key::from_i128(key); + assert!(key.is_rel_block_key()); + let (rel_tag, block_no) = key + .to_rel_block() + .expect("we filter non-rel-block keys out above"); + PagestreamGetPageRequest { + hdr: PagestreamRequest { + reqid: 0, + request_lsn: if rng.gen_bool(args.req_latest_probability) { + Lsn::MAX + } else { + r.timeline_lsn + }, + not_modified_since: r.timeline_lsn, }, - not_modified_since: r.timeline_lsn, - }, - rel: rel_tag, - blkno: block_no, - } - }; - client.getpage(req).await.unwrap(); + rel: rel_tag, + blkno: block_no, + } + }; + client.getpage_send(req).await.unwrap(); + inflight.push_back(start); + } + + let start = inflight.pop_front().unwrap(); + client.getpage_recv().await.unwrap(); let end = Instant::now(); live_stats.request_done(); ticks_processed += 1; diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index e1b5676f46..a6087920fd 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -25,6 +25,7 @@ use tokio_tar::{Builder, EntryType, Header}; use crate::context::RequestContext; use crate::pgdatadir_mapping::Version; +use crate::tenant::storage_layer::IoConcurrency; use crate::tenant::Timeline; use pageserver_api::reltag::{RelTag, SlruKind}; @@ -123,6 +124,13 @@ where full_backup, replica, ctx, + io_concurrency: IoConcurrency::spawn_from_conf( + timeline.conf, + timeline + .gate + .enter() + .map_err(|e| BasebackupError::Server(e.into()))?, + ), }; basebackup .send_tarball() @@ -144,6 +152,7 @@ where full_backup: bool, replica: bool, ctx: &'a RequestContext, + io_concurrency: IoConcurrency, } /// A sink that accepts SLRU blocks ordered by key and forwards @@ -303,7 +312,7 @@ where for part in slru_partitions.parts { let blocks = self .timeline - .get_vectored(part, self.lsn, self.ctx) + .get_vectored(part, self.lsn, self.io_concurrency.clone(), self.ctx) .await .map_err(|e| BasebackupError::Server(e.into()))?; @@ -358,7 +367,7 @@ where let start_time = Instant::now(); let aux_files = self .timeline - .list_aux_files(self.lsn, self.ctx) + .list_aux_files(self.lsn, self.ctx, self.io_concurrency.clone()) .await .map_err(|e| BasebackupError::Server(e.into()))?; let aux_scan_time = start_time.elapsed(); @@ -422,7 +431,7 @@ where } let repl_origins = self .timeline - .get_replorigins(self.lsn, self.ctx) + .get_replorigins(self.lsn, 
self.ctx, self.io_concurrency.clone()) .await .map_err(|e| BasebackupError::Server(e.into()))?; let n_origins = repl_origins.len(); @@ -489,7 +498,13 @@ where for blknum in startblk..endblk { let img = self .timeline - .get_rel_page_at_lsn(src, blknum, Version::Lsn(self.lsn), self.ctx) + .get_rel_page_at_lsn( + src, + blknum, + Version::Lsn(self.lsn), + self.ctx, + self.io_concurrency.clone(), + ) .await .map_err(|e| BasebackupError::Server(e.into()))?; segment_data.extend_from_slice(&img[..]); diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 921c6a5092..5764728505 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -135,6 +135,7 @@ fn main() -> anyhow::Result<()> { info!(?conf.virtual_file_io_mode, "starting with virtual_file IO mode"); info!(?conf.wal_receiver_protocol, "starting with WAL receiver protocol"); info!(?conf.page_service_pipelining, "starting with page service pipelining config"); + info!(?conf.get_vectored_concurrent_io, "starting with get_vectored IO concurrency config"); // The tenants directory contains all the pageserver local disk state. // Create if not exists and make sure all the contents are durable before proceeding. diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 1651db8500..ce480c70a0 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -191,6 +191,8 @@ pub struct PageServerConf { pub wal_receiver_protocol: PostgresClientProtocol, pub page_service_pipelining: pageserver_api::config::PageServicePipeliningConfig, + + pub get_vectored_concurrent_io: pageserver_api::config::GetVectoredConcurrentIo, } /// Token for authentication to safekeepers @@ -352,6 +354,7 @@ impl PageServerConf { no_sync, wal_receiver_protocol, page_service_pipelining, + get_vectored_concurrent_io, } = config_toml; let mut conf = PageServerConf { @@ -396,6 +399,7 @@ impl PageServerConf { import_pgdata_aws_endpoint_url, wal_receiver_protocol, page_service_pipelining, + get_vectored_concurrent_io, // ------------------------------------------------------------ // fields that require additional validation or custom handling diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 33b2d04588..5452719bcd 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -84,6 +84,7 @@ use crate::tenant::remote_timeline_client::list_remote_tenant_shards; use crate::tenant::remote_timeline_client::list_remote_timelines; use crate::tenant::secondary::SecondaryController; use crate::tenant::size::ModelInputs; +use crate::tenant::storage_layer::IoConcurrency; use crate::tenant::storage_layer::LayerAccessStatsReset; use crate::tenant::storage_layer::LayerName; use crate::tenant::timeline::import_pgdata; @@ -2938,8 +2939,15 @@ async fn list_aux_files( active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) .await?; + let io_concurrency = IoConcurrency::spawn_from_conf( + state.conf, + timeline.gate.enter().map_err(|_| ApiError::Cancelled)?, + ); + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let files = timeline.list_aux_files(body.lsn, &ctx).await?; + let files = timeline + .list_aux_files(body.lsn, &ctx, io_concurrency) + .await?; json_response(StatusCode::OK, files) } diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 252e566f70..02467cb6f7 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -126,73 +126,6 @@ pub(crate) static 
INITDB_RUN_TIME: Lazy = Lazy::new(|| { .expect("failed to define metric") }); -// Metrics collected on operations on the storage repository. -#[derive( - Clone, Copy, enum_map::Enum, strum_macros::EnumString, strum_macros::Display, IntoStaticStr, -)] -pub(crate) enum GetKind { - Singular, - Vectored, -} - -pub(crate) struct ReconstructTimeMetrics { - singular: Histogram, - vectored: Histogram, -} - -pub(crate) static RECONSTRUCT_TIME: Lazy = Lazy::new(|| { - let inner = register_histogram_vec!( - "pageserver_getpage_reconstruct_seconds", - "Time spent in reconstruct_value (reconstruct a page from deltas)", - &["get_kind"], - CRITICAL_OP_BUCKETS.into(), - ) - .expect("failed to define a metric"); - - ReconstructTimeMetrics { - singular: inner.with_label_values(&[GetKind::Singular.into()]), - vectored: inner.with_label_values(&[GetKind::Vectored.into()]), - } -}); - -impl ReconstructTimeMetrics { - pub(crate) fn for_get_kind(&self, get_kind: GetKind) -> &Histogram { - match get_kind { - GetKind::Singular => &self.singular, - GetKind::Vectored => &self.vectored, - } - } -} - -pub(crate) struct ReconstructDataTimeMetrics { - singular: Histogram, - vectored: Histogram, -} - -impl ReconstructDataTimeMetrics { - pub(crate) fn for_get_kind(&self, get_kind: GetKind) -> &Histogram { - match get_kind { - GetKind::Singular => &self.singular, - GetKind::Vectored => &self.vectored, - } - } -} - -pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy = Lazy::new(|| { - let inner = register_histogram_vec!( - "pageserver_getpage_get_reconstruct_data_seconds", - "Time spent in get_reconstruct_value_data", - &["get_kind"], - CRITICAL_OP_BUCKETS.into(), - ) - .expect("failed to define a metric"); - - ReconstructDataTimeMetrics { - singular: inner.with_label_values(&[GetKind::Singular.into()]), - vectored: inner.with_label_values(&[GetKind::Vectored.into()]), - } -}); - pub(crate) struct GetVectoredLatency { map: EnumMap>, } @@ -3934,7 +3867,6 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) { }); // Custom - Lazy::force(&RECONSTRUCT_TIME); Lazy::force(&BASEBACKUP_QUERY_TIME); Lazy::force(&COMPUTE_COMMANDS_COUNTERS); Lazy::force(&tokio_epoll_uring::THREAD_LOCAL_METRICS_STORAGE); diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index b14a44f9e3..e5063b7fc2 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -39,6 +39,7 @@ use tokio::io::{AsyncWriteExt, BufWriter}; use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use tracing::*; +use utils::sync::gate::{Gate, GateGuard}; use utils::sync::spsc_fold; use utils::{ auth::{Claims, Scope, SwappableJwtAuth}, @@ -61,6 +62,7 @@ use crate::task_mgr::{self, COMPUTE_REQUEST_RUNTIME}; use crate::tenant::mgr::ShardSelector; use crate::tenant::mgr::TenantManager; use crate::tenant::mgr::{GetActiveTenantError, GetTenantError, ShardResolveResult}; +use crate::tenant::storage_layer::IoConcurrency; use crate::tenant::timeline::{self, WaitLsnError}; use crate::tenant::GetTimelineError; use crate::tenant::PageReconstructError; @@ -90,6 +92,7 @@ pub struct Listener { pub struct Connections { cancel: CancellationToken, tasks: tokio::task::JoinSet, + gate: Gate, } pub fn spawn( @@ -110,6 +113,7 @@ pub fn spawn( let task = COMPUTE_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( "libpq listener", libpq_listener_main( + conf, tenant_manager, pg_auth, tcp_listener, @@ -134,11 +138,16 @@ impl Listener { } impl Connections { pub(crate) async fn shutdown(self) { - let Self { cancel, mut tasks } = 
self; + let Self { + cancel, + mut tasks, + gate, + } = self; cancel.cancel(); while let Some(res) = tasks.join_next().await { Self::handle_connection_completion(res); } + gate.close().await; } fn handle_connection_completion(res: Result, tokio::task::JoinError>) { @@ -158,7 +167,9 @@ impl Connections { /// Returns Ok(()) upon cancellation via `cancel`, returning the set of /// open connections. /// +#[allow(clippy::too_many_arguments)] pub async fn libpq_listener_main( + conf: &'static PageServerConf, tenant_manager: Arc, auth: Option>, listener: tokio::net::TcpListener, @@ -168,9 +179,15 @@ pub async fn libpq_listener_main( listener_cancel: CancellationToken, ) -> Connections { let connections_cancel = CancellationToken::new(); + let connections_gate = Gate::default(); let mut connection_handler_tasks = tokio::task::JoinSet::default(); loop { + let gate_guard = match connections_gate.enter() { + Ok(guard) => guard, + Err(_) => break, + }; + let accepted = tokio::select! { biased; _ = listener_cancel.cancelled() => break, @@ -190,6 +207,7 @@ pub async fn libpq_listener_main( let connection_ctx = listener_ctx .detached_child(TaskKind::PageRequestHandler, DownloadBehavior::Download); connection_handler_tasks.spawn(page_service_conn_main( + conf, tenant_manager.clone(), local_auth, socket, @@ -197,6 +215,7 @@ pub async fn libpq_listener_main( pipelining_config.clone(), connection_ctx, connections_cancel.child_token(), + gate_guard, )); } Err(err) => { @@ -211,13 +230,16 @@ pub async fn libpq_listener_main( Connections { cancel: connections_cancel, tasks: connection_handler_tasks, + gate: connections_gate, } } type ConnectionHandlerResult = anyhow::Result<()>; #[instrument(skip_all, fields(peer_addr))] +#[allow(clippy::too_many_arguments)] async fn page_service_conn_main( + conf: &'static PageServerConf, tenant_manager: Arc, auth: Option>, socket: tokio::net::TcpStream, @@ -225,6 +247,7 @@ async fn page_service_conn_main( pipelining_config: PageServicePipeliningConfig, connection_ctx: RequestContext, cancel: CancellationToken, + gate_guard: GateGuard, ) -> ConnectionHandlerResult { let _guard = LIVE_CONNECTIONS .with_label_values(&["page_service"]) @@ -274,11 +297,13 @@ async fn page_service_conn_main( // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler // and create the per-query context in process_query ourselves. 
let mut conn_handler = PageServerHandler::new( + conf, tenant_manager, auth, pipelining_config, connection_ctx, cancel.clone(), + gate_guard, ); let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?; @@ -310,6 +335,7 @@ async fn page_service_conn_main( } struct PageServerHandler { + conf: &'static PageServerConf, auth: Option>, claims: Option, @@ -325,6 +351,8 @@ struct PageServerHandler { timeline_handles: Option, pipelining_config: PageServicePipeliningConfig, + + gate_guard: GateGuard, } struct TimelineHandles { @@ -634,19 +662,23 @@ impl BatchedFeMessage { impl PageServerHandler { pub fn new( + conf: &'static PageServerConf, tenant_manager: Arc, auth: Option>, pipelining_config: PageServicePipeliningConfig, connection_ctx: RequestContext, cancel: CancellationToken, + gate_guard: GateGuard, ) -> Self { PageServerHandler { + conf, auth, claims: None, connection_ctx, timeline_handles: Some(TimelineHandles::new(tenant_manager)), cancel, pipelining_config, + gate_guard, } } @@ -1015,6 +1047,7 @@ impl PageServerHandler { &mut self, pgb_writer: &mut PostgresBackend, batch: BatchedFeMessage, + io_concurrency: IoConcurrency, cancel: &CancellationToken, protocol_version: PagestreamProtocolVersion, ctx: &RequestContext, @@ -1084,6 +1117,7 @@ impl PageServerHandler { &*shard.upgrade()?, effective_request_lsn, pages, + io_concurrency, ctx, ) .instrument(span.clone()) @@ -1288,6 +1322,17 @@ impl PageServerHandler { } } + let io_concurrency = IoConcurrency::spawn_from_conf( + self.conf, + match self.gate_guard.try_clone() { + Ok(guard) => guard, + Err(_) => { + info!("shutdown request received in page handler"); + return Err(QueryError::Shutdown); + } + }, + ); + let pgb_reader = pgb .split() .context("implementation error: split pgb into reader and writer")?; @@ -1309,6 +1354,7 @@ impl PageServerHandler { request_span, pipelining_config, protocol_version, + io_concurrency, &ctx, ) .await @@ -1322,6 +1368,7 @@ impl PageServerHandler { timeline_handles, request_span, protocol_version, + io_concurrency, &ctx, ) .await @@ -1349,6 +1396,7 @@ impl PageServerHandler { mut timeline_handles: TimelineHandles, request_span: Span, protocol_version: PagestreamProtocolVersion, + io_concurrency: IoConcurrency, ctx: &RequestContext, ) -> ( (PostgresBackendReader, TimelineHandles), @@ -1383,7 +1431,14 @@ impl PageServerHandler { }; let err = self - .pagesteam_handle_batched_message(pgb_writer, msg, &cancel, protocol_version, ctx) + .pagesteam_handle_batched_message( + pgb_writer, + msg, + io_concurrency.clone(), + &cancel, + protocol_version, + ctx, + ) .await; match err { Ok(()) => {} @@ -1407,6 +1462,7 @@ impl PageServerHandler { request_span: Span, pipelining_config: PageServicePipeliningConfigPipelined, protocol_version: PagestreamProtocolVersion, + io_concurrency: IoConcurrency, ctx: &RequestContext, ) -> ( (PostgresBackendReader, TimelineHandles), @@ -1550,6 +1606,7 @@ impl PageServerHandler { self.pagesteam_handle_batched_message( pgb_writer, batch, + io_concurrency.clone(), &cancel, protocol_version, &ctx, @@ -1806,6 +1863,7 @@ impl PageServerHandler { timeline: &Timeline, effective_lsn: Lsn, requests: smallvec::SmallVec<[BatchedGetPageRequest; 1]>, + io_concurrency: IoConcurrency, ctx: &RequestContext, ) -> Vec> { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -1832,6 +1890,7 @@ impl PageServerHandler { .get_rel_page_at_lsn_batched( requests.iter().map(|p| (&p.req.rel, &p.req.blkno)), effective_lsn, + io_concurrency, ctx, ) .await; diff --git 
a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index b65fe6cf7c..40c657524d 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -17,6 +17,7 @@ use crate::span::{ debug_assert_current_span_has_tenant_and_timeline_id, debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id, }; +use crate::tenant::storage_layer::IoConcurrency; use crate::tenant::timeline::GetVectoredError; use anyhow::{ensure, Context}; use bytes::{Buf, Bytes, BytesMut}; @@ -200,6 +201,7 @@ impl Timeline { blknum: BlockNumber, version: Version<'_>, ctx: &RequestContext, + io_concurrency: IoConcurrency, ) -> Result { match version { Version::Lsn(effective_lsn) => { @@ -208,6 +210,7 @@ impl Timeline { .get_rel_page_at_lsn_batched( pages.iter().map(|(tag, blknum)| (tag, blknum)), effective_lsn, + io_concurrency.clone(), ctx, ) .await; @@ -246,6 +249,7 @@ impl Timeline { &self, pages: impl ExactSizeIterator, effective_lsn: Lsn, + io_concurrency: IoConcurrency, ctx: &RequestContext, ) -> Vec> { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -309,7 +313,10 @@ impl Timeline { acc.to_keyspace() }; - match self.get_vectored(keyspace, effective_lsn, ctx).await { + match self + .get_vectored(keyspace, effective_lsn, io_concurrency, ctx) + .await + { Ok(results) => { for (key, res) in results { let mut key_slots = keys_slots.remove(&key).unwrap().into_iter(); @@ -889,9 +896,15 @@ impl Timeline { &self, lsn: Lsn, ctx: &RequestContext, + io_concurrency: IoConcurrency, ) -> Result, PageReconstructError> { let kv = self - .scan(KeySpace::single(Key::metadata_aux_key_range()), lsn, ctx) + .scan( + KeySpace::single(Key::metadata_aux_key_range()), + lsn, + ctx, + io_concurrency, + ) .await?; let mut result = HashMap::new(); let mut sz = 0; @@ -914,8 +927,9 @@ impl Timeline { &self, lsn: Lsn, ctx: &RequestContext, + io_concurrency: IoConcurrency, ) -> Result<(), PageReconstructError> { - self.list_aux_files_v2(lsn, ctx).await?; + self.list_aux_files_v2(lsn, ctx, io_concurrency).await?; Ok(()) } @@ -923,17 +937,24 @@ impl Timeline { &self, lsn: Lsn, ctx: &RequestContext, + io_concurrency: IoConcurrency, ) -> Result, PageReconstructError> { - self.list_aux_files_v2(lsn, ctx).await + self.list_aux_files_v2(lsn, ctx, io_concurrency).await } pub(crate) async fn get_replorigins( &self, lsn: Lsn, ctx: &RequestContext, + io_concurrency: IoConcurrency, ) -> Result, PageReconstructError> { let kv = self - .scan(KeySpace::single(repl_origin_key_range()), lsn, ctx) + .scan( + KeySpace::single(repl_origin_key_range()), + lsn, + ctx, + io_concurrency, + ) .await?; let mut result = HashMap::new(); for (k, v) in kv { @@ -2432,7 +2453,11 @@ mod tests { ("foo/bar2".to_string(), Bytes::from_static(b"content2")), ]); - let readback = tline.list_aux_files(Lsn(0x1008), &ctx).await?; + let io_concurrency = IoConcurrency::spawn_for_test(); + + let readback = tline + .list_aux_files(Lsn(0x1008), &ctx, io_concurrency.clone()) + .await?; assert_eq!(readback, expect_1008); // Second modification: update one key, remove the other @@ -2444,11 +2469,15 @@ mod tests { let expect_2008 = HashMap::from([("foo/bar1".to_string(), Bytes::from_static(b"content3"))]); - let readback = tline.list_aux_files(Lsn(0x2008), &ctx).await?; + let readback = tline + .list_aux_files(Lsn(0x2008), &ctx, io_concurrency.clone()) + .await?; assert_eq!(readback, expect_2008); // Reading back in time works - let readback = tline.list_aux_files(Lsn(0x1008), &ctx).await?; + let readback = tline + 
.list_aux_files(Lsn(0x1008), &ctx, io_concurrency.clone()) + .await?; assert_eq!(readback, expect_1008); Ok(()) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index e45ba2ca3b..a273ef5d01 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -5714,7 +5714,7 @@ mod tests { use pageserver_api::value::Value; use pageserver_compaction::helpers::overlaps_with; use rand::{thread_rng, Rng}; - use storage_layer::PersistentLayerKey; + use storage_layer::{IoConcurrency, PersistentLayerKey}; use tests::storage_layer::ValuesReconstructState; use tests::timeline::{GetVectoredError, ShutdownMode}; use timeline::{CompactOptions, DeltaLayerTestDesc}; @@ -6495,6 +6495,7 @@ mod tests { async fn test_get_vectored() -> anyhow::Result<()> { let harness = TenantHarness::create("test_get_vectored").await?; let (tenant, ctx) = harness.load().await; + let io_concurrency = IoConcurrency::spawn_for_test(); let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) .await?; @@ -6559,7 +6560,7 @@ mod tests { .get_vectored_impl( read.clone(), reads_lsn, - &mut ValuesReconstructState::new(), + &mut ValuesReconstructState::new(io_concurrency.clone()), &ctx, ) .await; @@ -6606,6 +6607,7 @@ mod tests { let harness = TenantHarness::create("test_get_vectored_aux_files").await?; let (tenant, ctx) = harness.load().await; + let io_concurrency = IoConcurrency::spawn_for_test(); let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx) .await?; @@ -6640,7 +6642,7 @@ mod tests { .get_vectored_impl( aux_keyspace.clone(), read_lsn, - &mut ValuesReconstructState::new(), + &mut ValuesReconstructState::new(io_concurrency.clone()), &ctx, ) .await; @@ -6688,6 +6690,7 @@ mod tests { ) .await?; let (tenant, ctx) = harness.load().await; + let io_concurrency = IoConcurrency::spawn_for_test(); let mut current_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); let gap_at_key = current_key.add(100); @@ -6788,7 +6791,7 @@ mod tests { .get_vectored_impl( read.clone(), current_lsn, - &mut ValuesReconstructState::new(), + &mut ValuesReconstructState::new(io_concurrency.clone()), &ctx, ) .await?; @@ -6831,6 +6834,7 @@ mod tests { async fn test_get_vectored_ancestor_descent() -> anyhow::Result<()> { let harness = TenantHarness::create("test_get_vectored_on_lsn_axis").await?; let (tenant, ctx) = harness.load().await; + let io_concurrency = IoConcurrency::spawn_for_test(); let start_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); let end_key = start_key.add(1000); @@ -6923,7 +6927,7 @@ mod tests { ranges: vec![child_gap_at_key..child_gap_at_key.next()], }, query_lsn, - &mut ValuesReconstructState::new(), + &mut ValuesReconstructState::new(io_concurrency.clone()), &ctx, ) .await; @@ -7369,6 +7373,7 @@ mod tests { async fn test_metadata_scan() -> anyhow::Result<()> { let harness = TenantHarness::create("test_metadata_scan").await?; let (tenant, ctx) = harness.load().await; + let io_concurrency = IoConcurrency::spawn_for_test(); let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; @@ -7422,7 +7427,7 @@ mod tests { .get_vectored_impl( keyspace.clone(), lsn, - &mut ValuesReconstructState::default(), + &mut ValuesReconstructState::new(io_concurrency.clone()), &ctx, ) .await? 
@@ -7537,6 +7542,7 @@ mod tests { let harness = TenantHarness::create("test_aux_file_e2e").await.unwrap(); let (tenant, ctx) = harness.load().await; + let io_concurrency = IoConcurrency::spawn_for_test(); let mut lsn = Lsn(0x08); @@ -7556,7 +7562,10 @@ mod tests { } // we can read everything from the storage - let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); + let files = tline + .list_aux_files(lsn, &ctx, io_concurrency.clone()) + .await + .unwrap(); assert_eq!( files.get("pg_logical/mappings/test1"), Some(&bytes::Bytes::from_static(b"first")) @@ -7572,7 +7581,10 @@ mod tests { modification.commit(&ctx).await.unwrap(); } - let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); + let files = tline + .list_aux_files(lsn, &ctx, io_concurrency.clone()) + .await + .unwrap(); assert_eq!( files.get("pg_logical/mappings/test2"), Some(&bytes::Bytes::from_static(b"second")) @@ -7583,7 +7595,10 @@ mod tests { .await .unwrap(); - let files = child.list_aux_files(lsn, &ctx).await.unwrap(); + let files = child + .list_aux_files(lsn, &ctx, io_concurrency.clone()) + .await + .unwrap(); assert_eq!(files.get("pg_logical/mappings/test1"), None); assert_eq!(files.get("pg_logical/mappings/test2"), None); } @@ -7592,6 +7607,7 @@ mod tests { async fn test_metadata_image_creation() -> anyhow::Result<()> { let harness = TenantHarness::create("test_metadata_image_creation").await?; let (tenant, ctx) = harness.load().await; + let io_concurrency = IoConcurrency::spawn_for_test(); let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; @@ -7611,8 +7627,9 @@ mod tests { keyspace: &KeySpace, lsn: Lsn, ctx: &RequestContext, + io_concurrency: IoConcurrency, ) -> anyhow::Result<(BTreeMap>, usize)> { - let mut reconstruct_state = ValuesReconstructState::default(); + let mut reconstruct_state = ValuesReconstructState::new(io_concurrency); let res = tline .get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx) .await?; @@ -7660,7 +7677,8 @@ mod tests { if iter % 5 == 0 { let (_, before_delta_file_accessed) = - scan_with_statistics(&tline, &keyspace, lsn, &ctx).await?; + scan_with_statistics(&tline, &keyspace, lsn, &ctx, io_concurrency.clone()) + .await?; tline .compact( &cancel, @@ -7674,7 +7692,8 @@ mod tests { ) .await?; let (_, after_delta_file_accessed) = - scan_with_statistics(&tline, &keyspace, lsn, &ctx).await?; + scan_with_statistics(&tline, &keyspace, lsn, &ctx, io_concurrency.clone()) + .await?; assert!(after_delta_file_accessed < before_delta_file_accessed, "after_delta_file_accessed={after_delta_file_accessed}, before_delta_file_accessed={before_delta_file_accessed}"); // Given that we already produced an image layer, there should be no delta layer needed for the scan, but still setting a low threshold there for unforeseen circumstances. 
assert!( @@ -7763,6 +7782,7 @@ mod tests { async fn test_vectored_missing_metadata_key_reads() -> anyhow::Result<()> { let harness = TenantHarness::create("test_vectored_missing_metadata_key_reads").await?; let (tenant, ctx) = harness.load().await; + let io_concurrency = IoConcurrency::spawn_for_test(); let base_key = Key::from_hex("620000000033333333444444445500000000").unwrap(); let base_key_child = Key::from_hex("620000000033333333444444445500000001").unwrap(); @@ -7901,7 +7921,7 @@ mod tests { ); // test vectored scan on parent timeline - let mut reconstruct_state = ValuesReconstructState::new(); + let mut reconstruct_state = ValuesReconstructState::new(io_concurrency.clone()); let res = tline .get_vectored_impl( KeySpace::single(Key::metadata_key_range()), @@ -7927,7 +7947,7 @@ mod tests { ); // test vectored scan on child timeline - let mut reconstruct_state = ValuesReconstructState::new(); + let mut reconstruct_state = ValuesReconstructState::new(io_concurrency.clone()); let res = child .get_vectored_impl( KeySpace::single(Key::metadata_key_range()), @@ -7965,7 +7985,9 @@ mod tests { lsn: Lsn, ctx: &RequestContext, ) -> Result, GetVectoredError> { - let mut reconstruct_state = ValuesReconstructState::new(); + let io_concurrency = + IoConcurrency::spawn_from_conf(tline.conf, tline.gate.enter().unwrap()); + let mut reconstruct_state = ValuesReconstructState::new(io_concurrency); let mut res = tline .get_vectored_impl( KeySpace::single(key..key.next()), @@ -8066,6 +8088,7 @@ mod tests { .await .unwrap(); let (tenant, ctx) = harness.load().await; + let io_concurrency = IoConcurrency::spawn_for_test(); let key0 = Key::from_hex("620000000033333333444444445500000000").unwrap(); let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap(); @@ -8125,7 +8148,7 @@ mod tests { // Image layers are created at last_record_lsn let images = tline - .inspect_image_layers(Lsn(0x40), &ctx) + .inspect_image_layers(Lsn(0x40), &ctx, io_concurrency.clone()) .await .unwrap() .into_iter() @@ -8140,6 +8163,7 @@ mod tests { .await .unwrap(); let (tenant, ctx) = harness.load().await; + let io_concurrency = IoConcurrency::spawn_for_test(); let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap(); let key2 = Key::from_hex("620000000033333333444444445500000002").unwrap(); @@ -8190,7 +8214,7 @@ mod tests { // Image layers are created at last_record_lsn let images = tline - .inspect_image_layers(Lsn(0x30), &ctx) + .inspect_image_layers(Lsn(0x30), &ctx, io_concurrency.clone()) .await .unwrap() .into_iter() @@ -8203,6 +8227,7 @@ mod tests { async fn test_simple_bottom_most_compaction_images() -> anyhow::Result<()> { let harness = TenantHarness::create("test_simple_bottom_most_compaction_images").await?; let (tenant, ctx) = harness.load().await; + let io_concurrency = IoConcurrency::spawn_for_test(); fn get_key(id: u32) -> Key { // using aux key here b/c they are guaranteed to be inside `collect_keyspace`. 
@@ -8344,7 +8369,7 @@ mod tests { // Check if the image layer at the GC horizon contains exactly what we want let image_at_gc_horizon = tline - .inspect_image_layers(Lsn(0x30), &ctx) + .inspect_image_layers(Lsn(0x30), &ctx, io_concurrency.clone()) .await .unwrap() .into_iter() @@ -10057,7 +10082,12 @@ mod tests { let keyspace = KeySpace::single(get_key(0)..get_key(10)); let results = tline - .get_vectored(keyspace, delta_layer_end_lsn, &ctx) + .get_vectored( + keyspace, + delta_layer_end_lsn, + IoConcurrency::sequential(), + &ctx, + ) .await .expect("No vectored errors"); for (key, res) in results { diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 3913637ca0..c24d037dde 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -10,18 +10,26 @@ mod layer_desc; mod layer_name; pub mod merge_iterator; +use crate::config::PageServerConf; use crate::context::{AccessStatsBehavior, RequestContext}; use bytes::Bytes; +use futures::stream::FuturesUnordered; +use futures::StreamExt; use pageserver_api::key::Key; use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum}; use pageserver_api::record::NeonWalRecord; use pageserver_api::value::Value; -use std::cmp::{Ordering, Reverse}; +use std::cmp::Ordering; use std::collections::hash_map::Entry; use std::collections::{BinaryHeap, HashMap}; +use std::future::Future; use std::ops::Range; +use std::pin::Pin; +use std::sync::atomic::AtomicUsize; use std::sync::Arc; use std::time::{Duration, SystemTime, UNIX_EPOCH}; +use tracing::{trace, Instrument}; +use utils::sync::gate::GateGuard; use utils::lsn::Lsn; @@ -78,30 +86,151 @@ pub(crate) enum ValueReconstructSituation { Continue, } -/// Reconstruct data accumulated for a single key during a vectored get -#[derive(Debug, Default, Clone)] -pub(crate) struct VectoredValueReconstructState { - pub(crate) records: Vec<(Lsn, NeonWalRecord)>, - pub(crate) img: Option<(Lsn, Bytes)>, - - situation: ValueReconstructSituation, +/// On disk representation of a value loaded in a buffer +#[derive(Debug)] +pub(crate) enum OnDiskValue { + /// Unencoded [`Value::Image`] + RawImage(Bytes), + /// Encoded [`Value`]. Can deserialize into an image or a WAL record + WalRecordOrImage(Bytes), } -impl VectoredValueReconstructState { - fn get_cached_lsn(&self) -> Option { - self.img.as_ref().map(|img| img.0) +/// Reconstruct data accumulated for a single key during a vectored get +#[derive(Debug, Default)] +pub(crate) struct VectoredValueReconstructState { + pub(crate) on_disk_values: Vec<(Lsn, OnDiskValueIoWaiter)>, + + pub(crate) situation: ValueReconstructSituation, +} + +#[derive(Debug)] +pub(crate) struct OnDiskValueIoWaiter { + rx: tokio::sync::oneshot::Receiver, +} + +#[derive(Debug)] +#[must_use] +pub(crate) enum OnDiskValueIo { + /// Traversal identified this IO as required to complete the vectored get. + Required { + num_active_ios: Arc, + tx: tokio::sync::oneshot::Sender, + }, + /// Sparse keyspace reads always read all the values for a given key, + /// even though only the first value is needed. + /// + /// This variant represents the unnecessary IOs for those values at lower LSNs + /// that aren't needed, but are currently still being done. + /// + /// The execution of unnecessary IOs was a pre-existing behavior before concurrent IO. 
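The types introduced above split each planned value read into a completion handle (held by whichever future performs the IO) and a waiter stored in `on_disk_values` and drained later. A stripped-down model of that handshake follows, using a plain `tokio::sync::oneshot` channel plus an atomic in-flight counter; the type names are illustrative, not the pageserver's.

```rust
use std::sync::{
    atomic::{AtomicUsize, Ordering},
    Arc,
};
use tokio::sync::oneshot;

type IoResult = Result<Vec<u8>, std::io::Error>;

/// Completion handle given to whoever performs the read.
struct IoHandle {
    num_active: Arc<AtomicUsize>,
    tx: oneshot::Sender<IoResult>,
}

impl IoHandle {
    fn complete(self, res: IoResult) {
        self.num_active.fetch_sub(1, Ordering::Release);
        let _ = self.tx.send(res); // the receiver may already be gone; that's fine
    }
}

/// Waiter kept by the reconstruct state and awaited when collecting results.
struct IoWaiter {
    rx: oneshot::Receiver<IoResult>,
}

fn register_io(num_active: &Arc<AtomicUsize>) -> (IoHandle, IoWaiter) {
    num_active.fetch_add(1, Ordering::Release);
    let (tx, rx) = oneshot::channel();
    (
        IoHandle { num_active: Arc::clone(num_active), tx },
        IoWaiter { rx },
    )
}

#[tokio::main]
async fn main() {
    let num_active = Arc::new(AtomicUsize::new(0));
    let (handle, waiter) = register_io(&num_active);

    // The IO side finishes the read and settles the handle.
    tokio::spawn(async move {
        handle.complete(Ok(b"page image bytes".to_vec()));
    });

    // The collector side waits for the value; a handle dropped without being
    // completed surfaces as a receive error, much like the IoDropped case above.
    let value = waiter.rx.await.expect("io dropped").expect("io failed");
    assert_eq!(value, b"page image bytes".to_vec());
    assert_eq!(num_active.load(Ordering::Acquire), 0);
}
```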
+ /// We added this explicit representation here so that we can drop + /// unnecessary IO results immediately, instead of buffering them in + /// `oneshot` channels inside [`VectoredValueReconstructState`] until + /// [`VectoredValueReconstructState::collect_pending_ios`] gets called. + Unnecessary, +} + +type OnDiskValueIoResult = Result; + +impl OnDiskValueIo { + pub(crate) fn complete(self, res: OnDiskValueIoResult) { + match self { + OnDiskValueIo::Required { num_active_ios, tx } => { + num_active_ios.fetch_sub(1, std::sync::atomic::Ordering::Release); + let _ = tx.send(res); + } + OnDiskValueIo::Unnecessary => { + // Nobody cared, see variant doc comment. + } + } } } -impl From for ValueReconstructState { - fn from(mut state: VectoredValueReconstructState) -> Self { - // walredo expects the records to be descending in terms of Lsn - state.records.sort_by_key(|(lsn, _)| Reverse(*lsn)); +#[derive(Debug, thiserror::Error)] +pub(crate) enum WaitCompletionError { + #[error("OnDiskValueIo was dropped without completing, likely the sidecar task panicked")] + IoDropped, +} - ValueReconstructState { - records: state.records, - img: state.img, +impl OnDiskValueIoWaiter { + pub(crate) async fn wait_completion(self) -> Result { + // NB: for Unnecessary IOs, this method never gets called because we don't add them to `on_disk_values`. + self.rx.await.map_err(|_| WaitCompletionError::IoDropped) + } +} + +impl VectoredValueReconstructState { + /// # Cancel-Safety + /// + /// Technically fine to stop polling this future, but, the IOs will still + /// be executed to completion by the sidecar task and hold on to / consume resources. + /// Better not do it to make reasonsing about the system easier. + pub(crate) async fn collect_pending_ios( + self, + ) -> Result { + use utils::bin_ser::BeSer; + + let mut res = Ok(ValueReconstructState::default()); + + // We should try hard not to bail early, so that by the time we return from this + // function, all IO for this value is done. It's not required -- we could totally + // stop polling the IO futures in the sidecar task, they need to support that, + // but just stopping to poll doesn't reduce the IO load on the disk. It's easier + // to reason about the system if we just wait for all IO to complete, even if + // we're no longer interested in the result. + // + // Revisit this when IO futures are replaced with a more sophisticated IO system + // and an IO scheduler, where we know which IOs were submitted and which ones + // just queued. Cf the comment on IoConcurrency::spawn_io. + for (lsn, waiter) in self.on_disk_values { + let value_recv_res = waiter + .wait_completion() + // we rely on the caller to poll us to completion, so this is not a bail point + .await; + // Force not bailing early by wrapping the code into a closure. + #[allow(clippy::redundant_closure_call)] + let _: () = (|| { + match (&mut res, value_recv_res) { + (Err(_), _) => { + // We've already failed, no need to process more. + } + (Ok(_), Err(wait_err)) => { + // This shouldn't happen - likely the sidecar task panicked. + res = Err(PageReconstructError::Other(wait_err.into())); + } + (Ok(_), Ok(Err(err))) => { + let err: std::io::Error = err; + // TODO: returning IO error here will fail a compute query. + // Probably not what we want, we're not doing `maybe_fatal_err` + // in the IO futures. + // But it's been like that for a long time, not changing it + // as part of concurrent IO. 
+ // => https://github.com/neondatabase/neon/issues/10454 + res = Err(PageReconstructError::Other(err.into())); + } + (Ok(ok), Ok(Ok(OnDiskValue::RawImage(img)))) => { + assert!(ok.img.is_none()); + ok.img = Some((lsn, img)); + } + (Ok(ok), Ok(Ok(OnDiskValue::WalRecordOrImage(buf)))) => { + match Value::des(&buf) { + Ok(Value::WalRecord(rec)) => { + ok.records.push((lsn, rec)); + } + Ok(Value::Image(img)) => { + assert!(ok.img.is_none()); + ok.img = Some((lsn, img)); + } + Err(err) => { + res = Err(PageReconstructError::Other(err.into())); + } + } + } + } + })(); } + + res } } @@ -109,7 +238,7 @@ impl From for ValueReconstructState { pub(crate) struct ValuesReconstructState { /// The keys will be removed after `get_vectored` completes. The caller outside `Timeline` /// should not expect to get anything from this hashmap. - pub(crate) keys: HashMap>, + pub(crate) keys: HashMap, /// The keys which are already retrieved keys_done: KeySpaceRandomAccum, @@ -119,27 +248,365 @@ pub(crate) struct ValuesReconstructState { // Statistics that are still accessible as a caller of `get_vectored_impl`. layers_visited: u32, delta_layers_visited: u32, + + pub(crate) io_concurrency: IoConcurrency, + num_active_ios: Arc, +} + +/// The level of IO concurrency to be used on the read path +/// +/// The desired end state is that we always do parallel IO. +/// This struct and the dispatching in the impl will be removed once +/// we've built enough confidence. +pub(crate) enum IoConcurrency { + Sequential, + SidecarTask { + task_id: usize, + ios_tx: tokio::sync::mpsc::UnboundedSender, + }, +} + +type IoFuture = Pin>>; + +pub(crate) enum SelectedIoConcurrency { + Sequential, + SidecarTask(GateGuard), +} + +impl std::fmt::Debug for IoConcurrency { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + IoConcurrency::Sequential => write!(f, "Sequential"), + IoConcurrency::SidecarTask { .. } => write!(f, "SidecarTask"), + } + } +} + +impl std::fmt::Debug for SelectedIoConcurrency { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + SelectedIoConcurrency::Sequential => write!(f, "Sequential"), + SelectedIoConcurrency::SidecarTask(_) => write!(f, "SidecarTask"), + } + } +} + +impl IoConcurrency { + /// Force sequential IO. This is a temporary workaround until we have + /// moved plumbing-through-the-call-stack + /// of IoConcurrency into `RequestContextq. + /// + /// DO NOT USE for new code. + /// + /// Tracking issue: . + pub(crate) fn sequential() -> Self { + Self::spawn(SelectedIoConcurrency::Sequential) + } + + pub(crate) fn spawn_from_conf( + conf: &'static PageServerConf, + gate_guard: GateGuard, + ) -> IoConcurrency { + use pageserver_api::config::GetVectoredConcurrentIo; + let selected = match conf.get_vectored_concurrent_io { + GetVectoredConcurrentIo::Sequential => SelectedIoConcurrency::Sequential, + GetVectoredConcurrentIo::SidecarTask => SelectedIoConcurrency::SidecarTask(gate_guard), + }; + Self::spawn(selected) + } + + pub(crate) fn spawn(io_concurrency: SelectedIoConcurrency) -> Self { + match io_concurrency { + SelectedIoConcurrency::Sequential => IoConcurrency::Sequential, + SelectedIoConcurrency::SidecarTask(gate_guard) => { + let (ios_tx, ios_rx) = tokio::sync::mpsc::unbounded_channel(); + static TASK_ID: AtomicUsize = AtomicUsize::new(0); + let task_id = TASK_ID.fetch_add(1, std::sync::atomic::Ordering::Relaxed); + // TODO: enrich the span with more context (tenant,shard,timeline) + (basebackup|pagestream|...) 
+ let span = + tracing::info_span!(parent: None, "IoConcurrency_sidecar", task_id = task_id); + trace!(task_id, "spawning sidecar task"); + tokio::spawn(async move { + trace!("start"); + scopeguard::defer!{ trace!("end") }; + type IosRx = tokio::sync::mpsc::UnboundedReceiver; + enum State { + Waiting { + // invariant: is_empty(), but we recycle the allocation + empty_futures: FuturesUnordered, + ios_rx: IosRx, + }, + Executing { + futures: FuturesUnordered, + ios_rx: IosRx, + }, + ShuttingDown { + futures: FuturesUnordered, + }, + } + let mut state = State::Waiting { + empty_futures: FuturesUnordered::new(), + ios_rx, + }; + loop { + match state { + State::Waiting { + empty_futures, + mut ios_rx, + } => { + assert!(empty_futures.is_empty()); + tokio::select! { + fut = ios_rx.recv() => { + if let Some(fut) = fut { + trace!("received new io future"); + empty_futures.push(fut); + state = State::Executing { futures: empty_futures, ios_rx }; + } else { + state = State::ShuttingDown { futures: empty_futures } + } + } + } + } + State::Executing { + mut futures, + mut ios_rx, + } => { + tokio::select! { + res = futures.next() => { + trace!("io future completed"); + assert!(res.is_some()); + if futures.is_empty() { + state = State::Waiting { empty_futures: futures, ios_rx}; + } else { + state = State::Executing { futures, ios_rx }; + } + } + fut = ios_rx.recv() => { + if let Some(fut) = fut { + trace!("received new io future"); + futures.push(fut); + state = State::Executing { futures, ios_rx}; + } else { + state = State::ShuttingDown { futures }; + } + } + } + } + State::ShuttingDown { + mut futures, + } => { + trace!("shutting down"); + while let Some(()) = futures.next().await { + trace!("io future completed (shutdown)"); + // drain + } + trace!("shutdown complete"); + break; + } + } + } + drop(gate_guard); // drop it right before we exit + }.instrument(span)); + IoConcurrency::SidecarTask { task_id, ios_tx } + } + } + } + + pub(crate) fn clone(&self) -> Self { + match self { + IoConcurrency::Sequential => IoConcurrency::Sequential, + IoConcurrency::SidecarTask { task_id, ios_tx } => IoConcurrency::SidecarTask { + task_id: *task_id, + ios_tx: ios_tx.clone(), + }, + } + } + + /// Submit an IO to be executed in the background. DEADLOCK RISK, read the full doc string. + /// + /// The IO is represented as an opaque future. + /// IO completion must be handled inside the future, e.g., through a oneshot channel. + /// + /// The API seems simple but there are multiple **pitfalls** involving + /// DEADLOCK RISK. + /// + /// First, there are no guarantees about the exexecution of the IO. + /// It may be `await`ed in-place before this function returns. + /// It may be polled partially by this task and handed off to another task to be finished. + /// It may be polled and then dropped before returning ready. + /// + /// This means that submitted IOs must not be interedependent. 
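At its core, the sidecar task above is an unbounded channel of boxed `'static` futures drained by a `FuturesUnordered`, while `spawn_io` either awaits the future in place (Sequential) or sends it across that channel (SidecarTask). The sketch below models that dispatch shape with only `tokio` and the `futures` crate; it collapses the Waiting/Executing/ShuttingDown states into one loop with a guarded select arm and is not the pageserver implementation.

```rust
use std::future::Future;
use std::pin::Pin;
use futures::stream::{FuturesUnordered, StreamExt};
use tokio::sync::mpsc;

type IoFuture = Pin<Box<dyn Future<Output = ()> + Send>>;

enum Concurrency {
    Sequential,
    Sidecar { tx: mpsc::UnboundedSender<IoFuture> },
}

impl Concurrency {
    /// Sequential: run the IO inline. Sidecar: hand it to the background task.
    async fn spawn_io<F>(&self, fut: F)
    where
        F: Future<Output = ()> + Send + 'static,
    {
        match self {
            Concurrency::Sequential => fut.await,
            Concurrency::Sidecar { tx } => tx.send(Box::pin(fut)).expect("sidecar exited"),
        }
    }
}

fn spawn_sidecar() -> Concurrency {
    let (tx, mut rx) = mpsc::unbounded_channel::<IoFuture>();
    tokio::spawn(async move {
        let mut in_flight = FuturesUnordered::new();
        loop {
            tokio::select! {
                // Only poll the set while it is non-empty, otherwise next() is
                // immediately ready with None and the loop would spin.
                Some(()) = in_flight.next(), if !in_flight.is_empty() => {}
                maybe_fut = rx.recv() => match maybe_fut {
                    Some(fut) => in_flight.push(fut),
                    None => break, // all senders dropped: drain and shut down
                },
            }
        }
        while in_flight.next().await.is_some() {}
    });
    Concurrency::Sidecar { tx }
}

#[tokio::main]
async fn main() {
    let io = spawn_sidecar();
    let (done_tx, done_rx) = tokio::sync::oneshot::channel();
    io.spawn_io(async move {
        // stand-in for a vectored blob read completing an IO handle
        let _ = done_tx.send(42u32);
    })
    .await;
    assert_eq!(done_rx.await.unwrap(), 42);

    // Sequential mode runs the same kind of future inline, as a baseline.
    Concurrency::Sequential.spawn_io(async {}).await;
}
```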
+ /// Interdependence may be through shared limited resources, e.g., + /// - VirtualFile file descriptor cache slot acquisition + /// - tokio-epoll-uring slot + /// + /// # Why current usage is safe from deadlocks + /// + /// Textbook condition for a deadlock is that _all_ of the following be given + /// - Mutual exclusion + /// - Hold and wait + /// - No preemption + /// - Circular wait + /// + /// The current usage is safe because: + /// - Mutual exclusion: IO futures definitely use mutexes, no way around that for now + /// - Hold and wait: IO futures currently hold two kinds of locks/resources while waiting + /// for acquisition of other resources: + /// - VirtualFile file descriptor cache slot tokio mutex + /// - tokio-epoll-uring slot (uses tokio notify => wait queue, much like mutex) + /// - No preemption: there's no taking-away of acquired locks/resources => given + /// - Circular wait: this is the part of the condition that isn't met: all IO futures + /// first acquire VirtualFile mutex, then tokio-epoll-uring slot. + /// There is no IO future that acquires slot before VirtualFile. + /// Hence there can be no circular waiting. + /// Hence there cannot be a deadlock. + /// + /// This is a very fragile situation and must be revisited whenver any code called from + /// inside the IO futures is changed. + /// + /// We will move away from opaque IO futures towards well-defined IOs at some point in + /// the future when we have shipped this first version of concurrent IO to production + /// and are ready to retire the Sequential mode which runs the futures in place. + /// Right now, while brittle, the opaque IO approach allows us to ship the feature + /// with minimal changes to the code and minimal changes to existing behavior in Sequential mode. + /// + /// Also read the comment in `collect_pending_ios`. + pub(crate) async fn spawn_io(&mut self, fut: F) + where + F: std::future::Future + Send + 'static, + { + match self { + IoConcurrency::Sequential => fut.await, + IoConcurrency::SidecarTask { ios_tx, .. } => { + let fut = Box::pin(fut); + // NB: experiments showed that doing an opportunistic poll of `fut` here was bad for throughput + // while insignificant for latency. + // It would make sense to revisit the tokio-epoll-uring API in the future such that we can try + // a submission here, but never poll the future. That way, io_uring can make proccess while + // the future sits in the ios_tx queue. + match ios_tx.send(fut) { + Ok(()) => {} + Err(_) => { + unreachable!("the io task must have exited, likely it panicked") + } + } + } + } + } + + #[cfg(test)] + pub(crate) fn spawn_for_test() -> impl std::ops::DerefMut { + use std::ops::{Deref, DerefMut}; + use tracing::info; + use utils::sync::gate::Gate; + + // Spawn needs a Gate, give it one. + struct Wrapper { + inner: IoConcurrency, + #[allow(dead_code)] + gate: Box, + } + impl Deref for Wrapper { + type Target = IoConcurrency; + + fn deref(&self) -> &Self::Target { + &self.inner + } + } + impl DerefMut for Wrapper { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.inner + } + } + let gate = Box::new(Gate::default()); + + // The default behavior when running Rust unit tests without any further + // flags is to use the new behavior. + // The CI uses the following environment variable to unit test both old + // and new behavior. + // NB: the Python regression & perf tests take the `else` branch + // below and have their own defaults management. 
+ let selected = { + // The pageserver_api::config type is unsuitable because it's internally tagged. + #[derive(serde::Deserialize)] + #[serde(rename_all = "kebab-case")] + enum TestOverride { + Sequential, + SidecarTask, + } + use once_cell::sync::Lazy; + static TEST_OVERRIDE: Lazy = Lazy::new(|| { + utils::env::var_serde_json_string( + "NEON_PAGESERVER_UNIT_TEST_GET_VECTORED_CONCURRENT_IO", + ) + .unwrap_or(TestOverride::SidecarTask) + }); + + match *TEST_OVERRIDE { + TestOverride::Sequential => SelectedIoConcurrency::Sequential, + TestOverride::SidecarTask => { + SelectedIoConcurrency::SidecarTask(gate.enter().expect("just created it")) + } + } + }; + + info!(?selected, "get_vectored_concurrent_io test"); + + Wrapper { + inner: Self::spawn(selected), + gate, + } + } +} + +/// Make noise in case the [`ValuesReconstructState`] gets dropped while +/// there are still IOs in flight. +/// Refer to `collect_pending_ios` for why we prefer not to do that. +// +/// We log from here instead of from the sidecar task because the [`ValuesReconstructState`] +/// gets dropped in a tracing span with more context. +/// We repeat the sidecar tasks's `task_id` so we can correlate what we emit here with +/// the logs / panic handler logs from the sidecar task, which also logs the `task_id`. +impl Drop for ValuesReconstructState { + fn drop(&mut self) { + let num_active_ios = self + .num_active_ios + .load(std::sync::atomic::Ordering::Acquire); + if num_active_ios == 0 { + return; + } + let sidecar_task_id = match &self.io_concurrency { + IoConcurrency::Sequential => None, + IoConcurrency::SidecarTask { task_id, .. } => Some(*task_id), + }; + tracing::warn!( + num_active_ios, + ?sidecar_task_id, + backtrace=%std::backtrace::Backtrace::force_capture(), + "dropping ValuesReconstructState while some IOs have not been completed", + ); + } } impl ValuesReconstructState { - pub(crate) fn new() -> Self { + pub(crate) fn new(io_concurrency: IoConcurrency) -> Self { Self { keys: HashMap::new(), keys_done: KeySpaceRandomAccum::new(), keys_with_image_coverage: None, layers_visited: 0, delta_layers_visited: 0, + io_concurrency, + num_active_ios: Arc::new(AtomicUsize::new(0)), } } - /// Associate a key with the error which it encountered and mark it as done - pub(crate) fn on_key_error(&mut self, key: Key, err: PageReconstructError) { - let previous = self.keys.insert(key, Err(err)); - if let Some(Ok(state)) = previous { - if state.situation == ValueReconstructSituation::Continue { - self.keys_done.add_key(key); - } - } + /// Absolutely read [`IoConcurrency::spawn_io`] to learn about assumptions & pitfalls. + pub(crate) async fn spawn_io(&mut self, fut: F) + where + F: std::future::Future + Send + 'static, + { + self.io_concurrency.spawn_io(fut).await; } pub(crate) fn on_layer_visited(&mut self, layer: &ReadableLayer) { @@ -159,29 +626,6 @@ impl ValuesReconstructState { self.layers_visited } - /// This function is called after reading a keyspace from a layer. - /// It checks if the read path has now moved past the cached Lsn for any keys. - /// - /// Implementation note: We intentionally iterate over the keys for which we've - /// already collected some reconstruct data. This avoids scaling complexity with - /// the size of the search space. 
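The `Drop` impl shown above only makes noise; it cannot cancel or complete the outstanding IOs. A tiny freestanding version of that "warn with a forced backtrace if work is still in flight" pattern, using plain `std` and `eprintln!` in place of `tracing::warn!`:

```rust
use std::backtrace::Backtrace;
use std::sync::{
    atomic::{AtomicUsize, Ordering},
    Arc,
};

/// Warns loudly if dropped while registered work is still outstanding,
/// mirroring the shape of the Drop impl on the reconstruct state above.
struct PendingIoTracker {
    num_active: Arc<AtomicUsize>,
}

impl Drop for PendingIoTracker {
    fn drop(&mut self) {
        let num_active = self.num_active.load(Ordering::Acquire);
        if num_active == 0 {
            return;
        }
        eprintln!(
            "dropping tracker with {num_active} IOs still in flight\n{}",
            // force_capture captures even without RUST_BACKTRACE=1
            Backtrace::force_capture()
        );
    }
}

fn main() {
    let num_active = Arc::new(AtomicUsize::new(0));
    let tracker = PendingIoTracker {
        num_active: Arc::clone(&num_active),
    };
    num_active.fetch_add(1, Ordering::Release); // simulate one submitted, uncompleted IO
    drop(tracker); // prints the warning plus a backtrace of this drop site
}
```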
- pub(crate) fn on_lsn_advanced(&mut self, keyspace: &KeySpace, advanced_to: Lsn) { - for (key, value) in self.keys.iter_mut() { - if !keyspace.contains(key) { - continue; - } - - if let Ok(state) = value { - if state.situation != ValueReconstructSituation::Complete - && state.get_cached_lsn() >= Some(advanced_to) - { - state.situation = ValueReconstructSituation::Complete; - self.keys_done.add_key(*key); - } - } - } - } - /// On hitting image layer, we can mark all keys in this range as done, because /// if the image layer does not contain a key, it is deleted/never added. pub(crate) fn on_image_layer_visited(&mut self, key_range: &Range) { @@ -199,70 +643,42 @@ impl ValuesReconstructState { /// /// If the key is in the sparse keyspace (i.e., aux files), we do not track them in /// `key_done`. - pub(crate) fn update_key( - &mut self, - key: &Key, - lsn: Lsn, - value: Value, - ) -> ValueReconstructSituation { - let state = self - .keys - .entry(*key) - .or_insert(Ok(VectoredValueReconstructState::default())); + // TODO: rename this method & update description. + pub(crate) fn update_key(&mut self, key: &Key, lsn: Lsn, completes: bool) -> OnDiskValueIo { + let state = self.keys.entry(*key).or_default(); + let is_sparse_key = key.is_sparse(); - if let Ok(state) = state { - let key_done = match state.situation { - ValueReconstructSituation::Complete => { - if is_sparse_key { - // Sparse keyspace might be visited multiple times because - // we don't track unmapped keyspaces. - return ValueReconstructSituation::Complete; - } else { - unreachable!() - } - } - ValueReconstructSituation::Continue => match value { - Value::Image(img) => { - state.img = Some((lsn, img)); - true - } - Value::WalRecord(rec) => { - debug_assert!( - Some(lsn) > state.get_cached_lsn(), - "Attempt to collect a record below cached LSN for walredo: {} < {}", - lsn, - state - .get_cached_lsn() - .expect("Assertion can only fire if a cached lsn is present") - ); - let will_init = rec.will_init(); - state.records.push((lsn, rec)); - will_init - } - }, - }; - - if key_done && state.situation == ValueReconstructSituation::Continue { - state.situation = ValueReconstructSituation::Complete; - if !is_sparse_key { - self.keys_done.add_key(*key); + let required_io = match state.situation { + ValueReconstructSituation::Complete => { + if is_sparse_key { + // Sparse keyspace might be visited multiple times because + // we don't track unmapped keyspaces. + return OnDiskValueIo::Unnecessary; + } else { + unreachable!() } } + ValueReconstructSituation::Continue => { + self.num_active_ios + .fetch_add(1, std::sync::atomic::Ordering::Release); + let (tx, rx) = tokio::sync::oneshot::channel(); + state.on_disk_values.push((lsn, OnDiskValueIoWaiter { rx })); + OnDiskValueIo::Required { + tx, + num_active_ios: Arc::clone(&self.num_active_ios), + } + } + }; - state.situation - } else { - ValueReconstructSituation::Complete + if completes && state.situation == ValueReconstructSituation::Continue { + state.situation = ValueReconstructSituation::Complete; + if !is_sparse_key { + self.keys_done.add_key(*key); + } } - } - /// Returns the Lsn at which this key is cached if one exists. - /// The read path should go no further than this Lsn for the given key. 
- pub(crate) fn get_cached_lsn(&self, key: &Key) -> Option { - self.keys - .get(key) - .and_then(|k| k.as_ref().ok()) - .and_then(|state| state.get_cached_lsn()) + required_io } /// Returns the key space describing the keys that have @@ -276,12 +692,6 @@ impl ValuesReconstructState { } } -impl Default for ValuesReconstructState { - fn default() -> Self { - Self::new() - } -} - /// A key that uniquely identifies a layer in a timeline #[derive(Debug, PartialEq, Eq, Clone, Hash)] pub(crate) enum LayerId { @@ -720,3 +1130,78 @@ impl std::fmt::Debug for RangeDisplayDebug<'_, T> { write!(f, "{}..{}", self.0.start, self.0.end) } } + +#[cfg(test)] +mod tests2 { + use pageserver_api::key::DBDIR_KEY; + use tracing::info; + + use super::*; + use crate::tenant::storage_layer::IoConcurrency; + + /// TODO: currently this test relies on manual visual inspection of the --no-capture output. + /// Should look like so: + /// ```text + /// RUST_LOG=trace cargo nextest run --features testing --no-capture test_io_concurrency_noise + /// running 1 test + /// 2025-01-21T17:42:01.335679Z INFO get_vectored_concurrent_io test selected=SidecarTask + /// 2025-01-21T17:42:01.335680Z TRACE spawning sidecar task task_id=0 + /// 2025-01-21T17:42:01.335937Z TRACE IoConcurrency_sidecar{task_id=0}: start + /// 2025-01-21T17:42:01.335972Z TRACE IoConcurrency_sidecar{task_id=0}: received new io future + /// 2025-01-21T17:42:01.335999Z INFO IoConcurrency_sidecar{task_id=0}: waiting for signal to complete IO + /// 2025-01-21T17:42:01.336229Z WARN dropping ValuesReconstructState while some IOs have not been completed num_active_ios=1 sidecar_task_id=Some(0) backtrace= 0: ::drop + /// at ./src/tenant/storage_layer.rs:553:24 + /// 1: core::ptr::drop_in_place + /// at /home/christian/.rustup/toolchains/1.84.0-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/core/src/ptr/mod.rs:521:1 + /// 2: core::mem::drop + /// at /home/christian/.rustup/toolchains/1.84.0-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/core/src/mem/mod.rs:942:24 + /// 3: pageserver::tenant::storage_layer::tests2::test_io_concurrency_noise::{{closure}} + /// at ./src/tenant/storage_layer.rs:1159:9 + /// ... + /// 49: + /// 2025-01-21T17:42:01.452293Z INFO IoConcurrency_sidecar{task_id=0}: completing IO + /// 2025-01-21T17:42:01.452357Z TRACE IoConcurrency_sidecar{task_id=0}: io future completed + /// 2025-01-21T17:42:01.452473Z TRACE IoConcurrency_sidecar{task_id=0}: end + /// test tenant::storage_layer::tests2::test_io_concurrency_noise ... ok + /// + /// ``` + #[tokio::test] + async fn test_io_concurrency_noise() { + crate::tenant::harness::setup_logging(); + + let io_concurrency = IoConcurrency::spawn_for_test(); + match *io_concurrency { + IoConcurrency::Sequential => { + // This test asserts behavior in sidecar mode, doesn't make sense in sequential mode. + return; + } + IoConcurrency::SidecarTask { .. 
} => {} + } + let mut reconstruct_state = ValuesReconstructState::new(io_concurrency.clone()); + + let (io_fut_is_waiting_tx, io_fut_is_waiting) = tokio::sync::oneshot::channel(); + let (do_complete_io, should_complete_io) = tokio::sync::oneshot::channel(); + let (io_fut_exiting_tx, io_fut_exiting) = tokio::sync::oneshot::channel(); + + let io = reconstruct_state.update_key(&DBDIR_KEY, Lsn(8), true); + reconstruct_state + .spawn_io(async move { + info!("waiting for signal to complete IO"); + io_fut_is_waiting_tx.send(()).unwrap(); + should_complete_io.await.unwrap(); + info!("completing IO"); + io.complete(Ok(OnDiskValue::RawImage(Bytes::new()))); + io_fut_exiting_tx.send(()).unwrap(); + }) + .await; + + io_fut_is_waiting.await.unwrap(); + + // this is what makes the noise + drop(reconstruct_state); + + do_complete_io.send(()).unwrap(); + + io_fut_exiting.await.unwrap(); + } +} diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index ade1b794c6..885c50425f 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -41,13 +41,12 @@ use crate::tenant::vectored_blob_io::{ BlobFlag, BufView, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, VectoredReadPlanner, }; -use crate::tenant::PageReconstructError; use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt}; use crate::virtual_file::IoBufferMut; use crate::virtual_file::{self, MaybeFatalIo, VirtualFile}; use crate::TEMP_FILE_SUFFIX; use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION}; -use anyhow::{anyhow, bail, ensure, Context, Result}; +use anyhow::{bail, ensure, Context, Result}; use camino::{Utf8Path, Utf8PathBuf}; use futures::StreamExt; use itertools::Itertools; @@ -60,7 +59,7 @@ use pageserver_api::shard::TenantShardId; use pageserver_api::value::Value; use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; -use std::collections::VecDeque; +use std::collections::{HashMap, VecDeque}; use std::fs::File; use std::io::SeekFrom; use std::ops::Range; @@ -77,7 +76,10 @@ use utils::{ lsn::Lsn, }; -use super::{AsLayerDesc, LayerName, PersistentLayerDesc, ValuesReconstructState}; +use super::{ + AsLayerDesc, LayerName, OnDiskValue, OnDiskValueIo, PersistentLayerDesc, ResidentLayer, + ValuesReconstructState, +}; /// /// Header stored in the beginning of the file @@ -226,7 +228,7 @@ pub struct DeltaLayerInner { index_start_blk: u32, index_root_blk: u32, - file: VirtualFile, + file: Arc, file_id: FileId, layer_key_range: Range, @@ -795,9 +797,11 @@ impl DeltaLayerInner { max_vectored_read_bytes: Option, ctx: &RequestContext, ) -> anyhow::Result { - let file = VirtualFile::open_v2(path, ctx) - .await - .context("open layer file")?; + let file = Arc::new( + VirtualFile::open_v2(path, ctx) + .await + .context("open layer file")?, + ); let file_id = page_cache::next_file_id(); @@ -842,12 +846,11 @@ impl DeltaLayerInner { // Look up the keys in the provided keyspace and update // the reconstruct state with whatever is found. // - // If the key is cached, go no further than the cached Lsn. - // // Currently, the index is visited for each range, but this // can be further optimised to visit the index only once. 
pub(super) async fn get_values_reconstruct_data( &self, + this: ResidentLayer, keyspace: KeySpace, lsn_range: Range, reconstruct_state: &mut ValuesReconstructState, @@ -875,17 +878,14 @@ impl DeltaLayerInner { data_end_offset, index_reader, planner, - reconstruct_state, ctx, ) .await .map_err(GetVectoredError::Other)?; - self.do_reads_and_update_state(reads, reconstruct_state, ctx) + self.do_reads_and_update_state(this, reads, reconstruct_state, ctx) .await; - reconstruct_state.on_lsn_advanced(&keyspace, lsn_range.start); - Ok(()) } @@ -895,7 +895,6 @@ impl DeltaLayerInner { data_end_offset: u64, index_reader: DiskBtreeReader, mut planner: VectoredReadPlanner, - reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, ) -> anyhow::Result> where @@ -922,10 +921,9 @@ impl DeltaLayerInner { assert!(key >= range.start); let outside_lsn_range = !lsn_range.contains(&lsn); - let below_cached_lsn = reconstruct_state.get_cached_lsn(&key) >= Some(lsn); let flag = { - if outside_lsn_range || below_cached_lsn { + if outside_lsn_range { BlobFlag::Ignore } else if blob_ref.will_init() { BlobFlag::ReplaceAll @@ -994,98 +992,78 @@ impl DeltaLayerInner { async fn do_reads_and_update_state( &self, + this: ResidentLayer, reads: Vec, reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, ) { - let vectored_blob_reader = VectoredBlobReader::new(&self.file); - let mut ignore_key_with_err = None; - let max_vectored_read_bytes = self .max_vectored_read_bytes .expect("Layer is loaded with max vectored bytes config") .0 .into(); let buf_size = Self::get_min_read_buffer_size(&reads, max_vectored_read_bytes); - let mut buf = Some(IoBufferMut::with_capacity(buf_size)); // Note that reads are processed in reverse order (from highest key+lsn). // This is the order that `ReconstructState` requires such that it can // track when a key is done. for read in reads.into_iter().rev() { - let res = vectored_blob_reader - .read_blobs(&read, buf.take().expect("Should have a buffer"), ctx) - .await; - - let blobs_buf = match res { - Ok(blobs_buf) => blobs_buf, - Err(err) => { - let kind = err.kind(); - for (_, blob_meta) in read.blobs_at.as_slice() { - reconstruct_state.on_key_error( - blob_meta.key, - PageReconstructError::Other(anyhow!( - "Failed to read blobs from virtual file {}: {}", - self.file.path(), - kind - )), - ); - } - - // We have "lost" the buffer since the lower level IO api - // doesn't return the buffer on error. Allocate a new one. 
- buf = Some(IoBufferMut::with_capacity(buf_size)); - - continue; - } - }; - let view = BufView::new_slice(&blobs_buf.buf); - for meta in blobs_buf.blobs.iter().rev() { - if Some(meta.meta.key) == ignore_key_with_err { - continue; - } - let blob_read = meta.read(&view).await; - let blob_read = match blob_read { - Ok(buf) => buf, - Err(e) => { - reconstruct_state.on_key_error( - meta.meta.key, - PageReconstructError::Other(anyhow!(e).context(format!( - "Failed to decompress blob from virtual file {}", - self.file.path(), - ))), - ); - - ignore_key_with_err = Some(meta.meta.key); - continue; - } - }; - - let value = Value::des(&blob_read); - - let value = match value { - Ok(v) => v, - Err(e) => { - reconstruct_state.on_key_error( - meta.meta.key, - PageReconstructError::Other(anyhow!(e).context(format!( - "Failed to deserialize blob from virtual file {}", - self.file.path(), - ))), - ); - - ignore_key_with_err = Some(meta.meta.key); - continue; - } - }; - - // Invariant: once a key reaches [`ValueReconstructSituation::Complete`] - // state, no further updates shall be made to it. The call below will - // panic if the invariant is violated. - reconstruct_state.update_key(&meta.meta.key, meta.meta.lsn, value); + let mut ios: HashMap<(Key, Lsn), OnDiskValueIo> = Default::default(); + for (_, blob_meta) in read.blobs_at.as_slice().iter().rev() { + let io = reconstruct_state.update_key( + &blob_meta.key, + blob_meta.lsn, + blob_meta.will_init, + ); + ios.insert((blob_meta.key, blob_meta.lsn), io); } - buf = Some(blobs_buf.buf); + let read_extend_residency = this.clone(); + let read_from = self.file.clone(); + let read_ctx = ctx.attached_child(); + reconstruct_state + .spawn_io(async move { + let vectored_blob_reader = VectoredBlobReader::new(&read_from); + let buf = IoBufferMut::with_capacity(buf_size); + + let res = vectored_blob_reader.read_blobs(&read, buf, &read_ctx).await; + match res { + Ok(blobs_buf) => { + let view = BufView::new_slice(&blobs_buf.buf); + for meta in blobs_buf.blobs.iter().rev() { + let io = ios.remove(&(meta.meta.key, meta.meta.lsn)).unwrap(); + + let blob_read = meta.read(&view).await; + let blob_read = match blob_read { + Ok(buf) => buf, + Err(e) => { + io.complete(Err(e)); + continue; + } + }; + + io.complete(Ok(OnDiskValue::WalRecordOrImage( + blob_read.into_bytes(), + ))); + } + + assert!(ios.is_empty()); + } + Err(err) => { + for (_, sender) in ios { + sender.complete(Err(std::io::Error::new( + err.kind(), + "vec read failed", + ))); + } + } + } + + // keep layer resident until this IO is done; this spawned IO future generally outlives the + // call to `self` / the `Arc` / the `ResidentLayer` that guarantees residency + drop(read_extend_residency); + }) + .await; } } @@ -1224,7 +1202,14 @@ impl DeltaLayerInner { let actionable = if let Some((key, lsn, start_offset)) = prev.take() { let end_offset = offset; - Some((BlobMeta { key, lsn }, start_offset..end_offset)) + Some(( + BlobMeta { + key, + lsn, + will_init: false, + }, + start_offset..end_offset, + )) } else { None }; @@ -1560,7 +1545,9 @@ impl DeltaLayerIterator<'_> { let lsn = DeltaKey::extract_lsn_from_buf(&raw_key); let blob_ref = BlobRef(value); let offset = blob_ref.pos(); - if let Some(batch_plan) = self.planner.handle(key, lsn, offset) { + if let Some(batch_plan) = + self.planner.handle(key, lsn, offset, blob_ref.will_init()) + { break batch_plan; } } else { @@ -1673,7 +1660,6 @@ pub(crate) mod test { .expect("In memory disk finish should never fail"); let reader = DiskBtreeReader::<_, 
DELTA_KEY_SIZE>::new(0, root_offset, disk); let planner = VectoredReadPlanner::new(100); - let mut reconstruct_state = ValuesReconstructState::new(); let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); let keyspace = KeySpace { @@ -1691,7 +1677,6 @@ pub(crate) mod test { disk_offset, reader, planner, - &mut reconstruct_state, &ctx, ) .await @@ -1935,7 +1920,6 @@ pub(crate) mod test { ); let planner = VectoredReadPlanner::new(constants::MAX_VECTORED_READ_BYTES); - let mut reconstruct_state = ValuesReconstructState::new(); let keyspace = pick_random_keyspace(rng, &entries_meta.key_range); let data_end_offset = inner.index_start_blk as u64 * PAGE_SZ as u64; @@ -1945,7 +1929,6 @@ pub(crate) mod test { data_end_offset, index_reader, planner, - &mut reconstruct_state, &ctx, ) .await?; diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 0d3c9d5a44..c49281dc45 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -38,12 +38,11 @@ use crate::tenant::vectored_blob_io::{ BlobFlag, BufView, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, VectoredReadPlanner, }; -use crate::tenant::PageReconstructError; use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt; use crate::virtual_file::IoBufferMut; use crate::virtual_file::{self, MaybeFatalIo, VirtualFile}; use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX}; -use anyhow::{anyhow, bail, ensure, Context, Result}; +use anyhow::{bail, ensure, Context, Result}; use bytes::Bytes; use camino::{Utf8Path, Utf8PathBuf}; use hex; @@ -56,12 +55,13 @@ use pageserver_api::shard::{ShardIdentity, TenantShardId}; use pageserver_api::value::Value; use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; -use std::collections::VecDeque; +use std::collections::{HashMap, VecDeque}; use std::fs::File; use std::io::SeekFrom; use std::ops::Range; use std::os::unix::prelude::FileExt; use std::str::FromStr; +use std::sync::Arc; use tokio::sync::OnceCell; use tokio_stream::StreamExt; use tracing::*; @@ -73,7 +73,10 @@ use utils::{ }; use super::layer_name::ImageLayerName; -use super::{AsLayerDesc, LayerName, PersistentLayerDesc, ValuesReconstructState}; +use super::{ + AsLayerDesc, LayerName, OnDiskValue, OnDiskValueIo, PersistentLayerDesc, ResidentLayer, + ValuesReconstructState, +}; /// /// Header stored in the beginning of the file @@ -164,7 +167,7 @@ pub struct ImageLayerInner { key_range: Range, lsn: Lsn, - file: VirtualFile, + file: Arc, file_id: FileId, max_vectored_read_bytes: Option, @@ -391,9 +394,11 @@ impl ImageLayerInner { max_vectored_read_bytes: Option, ctx: &RequestContext, ) -> anyhow::Result { - let file = VirtualFile::open_v2(path, ctx) - .await - .context("open layer file")?; + let file = Arc::new( + VirtualFile::open_v2(path, ctx) + .await + .context("open layer file")?, + ); let file_id = page_cache::next_file_id(); let block_reader = FileBlockReader::new(&file, file_id); let summary_blk = block_reader @@ -439,6 +444,7 @@ impl ImageLayerInner { // the reconstruct state with whatever is found. 
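Both the delta and the image layer read paths now follow the same bookkeeping shape: register one completion handle per `(key, lsn)` in a small map before issuing the vectored read, settle each handle from the batch result, and on a batch-level error fail every handle still left in the map so no waiter hangs. A standalone sketch of that pattern, with illustrative types rather than the pageserver's `OnDiskValueIo`:

```rust
use std::collections::HashMap;
use tokio::sync::oneshot;

type Key = u64;
type Lsn = u64;
type IoResult = Result<Vec<u8>, std::io::Error>;

/// Register one completion handle per (key, lsn) before the batched read is
/// issued; whoever runs the read must settle every handle exactly once.
fn register(
    batch: &[(Key, Lsn)],
) -> (
    HashMap<(Key, Lsn), oneshot::Sender<IoResult>>,
    Vec<oneshot::Receiver<IoResult>>,
) {
    let mut handles = HashMap::new();
    let mut waiters = Vec::new();
    for &(key, lsn) in batch {
        let (tx, rx) = oneshot::channel();
        handles.insert((key, lsn), tx);
        waiters.push(rx);
    }
    (handles, waiters)
}

/// Settle the handles from one batched read result. On a batch-level error,
/// every remaining handle is failed so no waiter waits forever.
fn settle(
    mut handles: HashMap<(Key, Lsn), oneshot::Sender<IoResult>>,
    batch_result: Result<Vec<((Key, Lsn), Vec<u8>)>, std::io::Error>,
) {
    match batch_result {
        Ok(blobs) => {
            for ((key, lsn), bytes) in blobs {
                let tx = handles.remove(&(key, lsn)).expect("unknown blob in batch");
                let _ = tx.send(Ok(bytes));
            }
            // Every registered IO must have been completed by the batch.
            assert!(handles.is_empty());
        }
        Err(err) => {
            for (_, tx) in handles {
                let _ = tx.send(Err(std::io::Error::new(err.kind(), "batched read failed")));
            }
        }
    }
}

#[tokio::main]
async fn main() {
    let batch = [(1, 10), (2, 10)];
    let (handles, waiters) = register(&batch);
    settle(handles, Ok(vec![((1, 10), b"a".to_vec()), ((2, 10), b"b".to_vec())]));
    for rx in waiters {
        assert!(rx.await.unwrap().is_ok());
    }
}
```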
pub(super) async fn get_values_reconstruct_data( &self, + this: ResidentLayer, keyspace: KeySpace, reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, @@ -448,7 +454,7 @@ impl ImageLayerInner { .await .map_err(GetVectoredError::Other)?; - self.do_reads_and_update_state(reads, reconstruct_state, ctx) + self.do_reads_and_update_state(this, reads, reconstruct_state, ctx) .await; reconstruct_state.on_image_layer_visited(&self.key_range); @@ -570,6 +576,7 @@ impl ImageLayerInner { async fn do_reads_and_update_state( &self, + this: ResidentLayer, reads: Vec, reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, @@ -580,8 +587,13 @@ impl ImageLayerInner { .0 .into(); - let vectored_blob_reader = VectoredBlobReader::new(&self.file); for read in reads.into_iter() { + let mut ios: HashMap<(Key, Lsn), OnDiskValueIo> = Default::default(); + for (_, blob_meta) in read.blobs_at.as_slice() { + let io = reconstruct_state.update_key(&blob_meta.key, blob_meta.lsn, true); + ios.insert((blob_meta.key, blob_meta.lsn), io); + } + let buf_size = read.size(); if buf_size > max_vectored_read_bytes { @@ -611,50 +623,51 @@ impl ImageLayerInner { } } - let buf = IoBufferMut::with_capacity(buf_size); - let res = vectored_blob_reader.read_blobs(&read, buf, ctx).await; + let read_extend_residency = this.clone(); + let read_from = self.file.clone(); + let read_ctx = ctx.attached_child(); + reconstruct_state + .spawn_io(async move { + let buf = IoBufferMut::with_capacity(buf_size); + let vectored_blob_reader = VectoredBlobReader::new(&read_from); + let res = vectored_blob_reader.read_blobs(&read, buf, &read_ctx).await; - match res { - Ok(blobs_buf) => { - let view = BufView::new_slice(&blobs_buf.buf); - for meta in blobs_buf.blobs.iter() { - let img_buf = meta.read(&view).await; + match res { + Ok(blobs_buf) => { + let view = BufView::new_slice(&blobs_buf.buf); + for meta in blobs_buf.blobs.iter() { + let io: OnDiskValueIo = + ios.remove(&(meta.meta.key, meta.meta.lsn)).unwrap(); + let img_buf = meta.read(&view).await; - let img_buf = match img_buf { - Ok(img_buf) => img_buf, - Err(e) => { - reconstruct_state.on_key_error( - meta.meta.key, - PageReconstructError::Other(anyhow!(e).context(format!( - "Failed to decompress blob from virtual file {}", - self.file.path(), - ))), - ); + let img_buf = match img_buf { + Ok(img_buf) => img_buf, + Err(e) => { + io.complete(Err(e)); + continue; + } + }; - continue; + io.complete(Ok(OnDiskValue::RawImage(img_buf.into_bytes()))); } - }; - reconstruct_state.update_key( - &meta.meta.key, - self.lsn, - Value::Image(img_buf.into_bytes()), - ); + + assert!(ios.is_empty()); + } + Err(err) => { + for (_, io) in ios { + io.complete(Err(std::io::Error::new( + err.kind(), + "vec read failed", + ))); + } + } } - } - Err(err) => { - let kind = err.kind(); - for (_, blob_meta) in read.blobs_at.as_slice() { - reconstruct_state.on_key_error( - blob_meta.key, - PageReconstructError::from(anyhow!( - "Failed to read blobs from virtual file {}: {}", - self.file.path(), - kind - )), - ); - } - } - }; + + // keep layer resident until this IO is done; this spawned IO future generally outlives the + // call to `self` / the `Arc` / the `ResidentLayer` that guarantees residency + drop(read_extend_residency); + }) + .await; } } @@ -1069,6 +1082,7 @@ impl ImageLayerIterator<'_> { Key::from_slice(&raw_key[..KEY_SIZE]), self.image_layer.lsn, offset, + true, ) { break batch_plan; } diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs 
b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 2b67f55a17..61a0fdea8c 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -8,23 +8,22 @@ use crate::assert_u64_eq_usize::{u64_to_usize, U64IsUsize, UsizeIsU64}; use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; use crate::tenant::ephemeral_file::EphemeralFile; +use crate::tenant::storage_layer::{OnDiskValue, OnDiskValueIo}; use crate::tenant::timeline::GetVectoredError; -use crate::tenant::PageReconstructError; use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt; use crate::{l0_flush, page_cache}; -use anyhow::{anyhow, Result}; +use anyhow::Result; use camino::Utf8PathBuf; use pageserver_api::key::CompactKey; use pageserver_api::key::Key; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::InMemoryLayerInfo; use pageserver_api::shard::TenantShardId; -use pageserver_api::value::Value; use std::collections::{BTreeMap, HashMap}; use std::sync::{Arc, OnceLock}; use std::time::Instant; use tracing::*; -use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap}; +use utils::{id::TimelineId, lsn::Lsn, vec_map::VecMap}; use wal_decoder::serialized_batch::{SerializedValueBatch, SerializedValueMeta, ValueMeta}; // avoid binding to Write (conflicts with std::io::Write) // while being able to use std::fmt::Write's methods @@ -36,9 +35,7 @@ use std::sync::atomic::Ordering as AtomicOrdering; use std::sync::atomic::{AtomicU64, AtomicUsize}; use tokio::sync::RwLock; -use super::{ - DeltaLayerWriter, PersistentLayerDesc, ValueReconstructSituation, ValuesReconstructState, -}; +use super::{DeltaLayerWriter, PersistentLayerDesc, ValuesReconstructState}; pub(crate) mod vectored_dio_read; @@ -415,10 +412,8 @@ impl InMemoryLayer { // Look up the keys in the provided keyspace and update // the reconstruct state with whatever is found. - // - // If the key is cached, go no further than the cached Lsn. 
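The `InMemoryLayer` hunk that follows gathers its planned reads while holding `inner.read().await`, then explicitly drops that guard before calling `spawn_io`, because in sequential mode the submitted future runs inline and immediately re-takes the same `RwLock`. Here is a minimal standalone illustration of that safe ordering with a tokio `RwLock`; with tokio's fair lock, holding the first read guard across the second acquisition could wait forever if a writer queues in between. Names are illustrative.

```rust
use std::sync::Arc;
use tokio::sync::RwLock;

struct Inner {
    values: Vec<u8>,
}

/// Plan under the lock, release it, and only then run the IO inline
/// (what the Sequential mode does), because the IO takes the same lock again.
async fn plan_then_read(layer: Arc<RwLock<Inner>>) -> usize {
    let planned = {
        let inner = layer.read().await;
        inner.values.len() // "planning": look at the index while locked
        // read guard dropped here, at the end of this block
    };

    // Sequential-mode stand-in for spawn_io: the future runs right here.
    // If the guard above were still held and a writer were queued, this
    // .read().await could block indefinitely (tokio's RwLock is fair).
    let io = {
        let layer = Arc::clone(&layer);
        async move {
            let inner = layer.read().await;
            inner.values.iter().take(planned).copied().collect::<Vec<u8>>()
        }
    };
    io.await.len()
}

#[tokio::main]
async fn main() {
    let layer = Arc::new(RwLock::new(Inner { values: vec![1, 2, 3] }));
    assert_eq!(plan_then_read(layer).await, 3);
}
```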
pub(crate) async fn get_values_reconstruct_data( - &self, + self: &Arc, keyspace: KeySpace, end_lsn: Lsn, reconstruct_state: &mut ValuesReconstructState, @@ -435,6 +430,9 @@ impl InMemoryLayer { read: vectored_dio_read::LogicalRead>, } let mut reads: HashMap> = HashMap::new(); + let mut ios: HashMap<(Key, Lsn), OnDiskValueIo> = Default::default(); + + let lsn_range = self.start_lsn..end_lsn; for range in keyspace.ranges.iter() { for (key, vec_map) in inner @@ -442,12 +440,7 @@ impl InMemoryLayer { .range(range.start.to_compact()..range.end.to_compact()) { let key = Key::from_compact(*key); - let lsn_range = match reconstruct_state.get_cached_lsn(&key) { - Some(cached_lsn) => (cached_lsn + 1)..end_lsn, - None => self.start_lsn..end_lsn, - }; - - let slice = vec_map.slice_range(lsn_range); + let slice = vec_map.slice_range(lsn_range.clone()); for (entry_lsn, index_entry) in slice.iter().rev() { let IndexEntryUnpacked { @@ -463,55 +456,59 @@ impl InMemoryLayer { Vec::with_capacity(len as usize), ), }); + + let io = reconstruct_state.update_key(&key, *entry_lsn, will_init); + ios.insert((key, *entry_lsn), io); + if will_init { break; } } } } + drop(inner); // release the lock before we spawn the IO; if it's serial-mode IO we will deadlock on the read().await below + let read_from = Arc::clone(self); + let read_ctx = ctx.attached_child(); + reconstruct_state + .spawn_io(async move { + let inner = read_from.inner.read().await; + let f = vectored_dio_read::execute( + &inner.file, + reads + .iter() + .flat_map(|(_, value_reads)| value_reads.iter().map(|v| &v.read)), + &read_ctx, + ); + send_future::SendFuture::send(f) // https://github.com/rust-lang/rust/issues/96865 + .await; - // Execute the reads. - - let f = vectored_dio_read::execute( - &inner.file, - reads - .iter() - .flat_map(|(_, value_reads)| value_reads.iter().map(|v| &v.read)), - &ctx, - ); - send_future::SendFuture::send(f) // https://github.com/rust-lang/rust/issues/96865 - .await; - - // Process results into the reconstruct state - 'next_key: for (key, value_reads) in reads { - for ValueRead { entry_lsn, read } in value_reads { - match read.into_result().expect("we run execute() above") { - Err(e) => { - reconstruct_state.on_key_error(key, PageReconstructError::from(anyhow!(e))); - continue 'next_key; - } - Ok(value_buf) => { - let value = Value::des(&value_buf); - if let Err(e) = value { - reconstruct_state - .on_key_error(key, PageReconstructError::from(anyhow!(e))); - continue 'next_key; + for (key, value_reads) in reads { + for ValueRead { entry_lsn, read } in value_reads { + let io = ios.remove(&(key, entry_lsn)).expect("sender must exist"); + match read.into_result().expect("we run execute() above") { + Err(e) => { + io.complete(Err(std::io::Error::new( + e.kind(), + "dio vec read failed", + ))); + } + Ok(value_buf) => { + io.complete(Ok(OnDiskValue::WalRecordOrImage(value_buf.into()))); + } } - - let key_situation = - reconstruct_state.update_key(&key, entry_lsn, value.unwrap()); - if key_situation == ValueReconstructSituation::Complete { - // TODO: metric to see if we fetched more values than necessary - continue 'next_key; - } - - // process the next value in the next iteration of the loop } } - } - } - reconstruct_state.on_lsn_advanced(&keyspace, self.start_lsn); + assert!(ios.is_empty()); + + // Keep layer existent until this IO is done; + // This is kinda forced for InMemoryLayer because we need to inner.read() anyway, + // but it's less obvious for DeltaLayer and ImageLayer. 
So, keep this explicit + // drop for consistency among all three layer types. + drop(inner); + drop(read_from); + }) + .await; Ok(()) } @@ -606,6 +603,7 @@ impl InMemoryLayer { // Write the batch to the file inner.file.write_raw(&raw, ctx).await?; let new_size = inner.file.len(); + let expected_new_len = base_offset .checked_add(raw.len().into_u64()) // write_raw would error if we were to overflow u64. diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 2b06c88e8b..2a86885f6b 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -308,7 +308,7 @@ impl Layer { reconstruct_data: &mut ValuesReconstructState, ctx: &RequestContext, ) -> Result<(), GetVectoredError> { - let layer = self + let downloaded = self .0 .get_or_maybe_download(true, Some(ctx)) .await @@ -318,11 +318,15 @@ impl Layer { } other => GetVectoredError::Other(anyhow::anyhow!(other)), })?; + let this = ResidentLayer { + downloaded: downloaded.clone(), + owner: self.clone(), + }; self.record_access(ctx); - layer - .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, &self.0, ctx) + downloaded + .get_values_reconstruct_data(this, keyspace, lsn_range, reconstruct_data, ctx) .instrument(tracing::debug_span!("get_values_reconstruct_data", layer=%self)) .await .map_err(|err| match err { @@ -1768,25 +1772,25 @@ impl DownloadedLayer { async fn get_values_reconstruct_data( &self, + this: ResidentLayer, keyspace: KeySpace, lsn_range: Range, reconstruct_data: &mut ValuesReconstructState, - owner: &Arc, ctx: &RequestContext, ) -> Result<(), GetVectoredError> { use LayerKind::*; match self - .get(owner, ctx) + .get(&this.owner.0, ctx) .await .map_err(GetVectoredError::Other)? 
{ Delta(d) => { - d.get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, ctx) + d.get_values_reconstruct_data(this, keyspace, lsn_range, reconstruct_data, ctx) .await } Image(i) => { - i.get_values_reconstruct_data(keyspace, reconstruct_data, ctx) + i.get_values_reconstruct_data(this, keyspace, reconstruct_data, ctx) .await } } diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs index fcb73ad20d..d93c378ffc 100644 --- a/pageserver/src/tenant/storage_layer/layer/tests.rs +++ b/pageserver/src/tenant/storage_layer/layer/tests.rs @@ -11,7 +11,10 @@ use super::failpoints::{Failpoint, FailpointKind}; use super::*; use crate::{ context::DownloadBehavior, - tenant::{harness::test_img, storage_layer::LayerVisibilityHint}, + tenant::{ + harness::test_img, + storage_layer::{IoConcurrency, LayerVisibilityHint}, + }, }; use crate::{task_mgr::TaskKind, tenant::harness::TenantHarness}; @@ -31,6 +34,7 @@ async fn smoke_test() { let span = h.span(); let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1)); let (tenant, _) = h.load().await; + let io_concurrency = IoConcurrency::spawn_for_test(); let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Download); @@ -89,7 +93,7 @@ async fn smoke_test() { }; let img_before = { - let mut data = ValuesReconstructState::default(); + let mut data = ValuesReconstructState::new(io_concurrency.clone()); layer .get_values_reconstruct_data( controlfile_keyspace.clone(), @@ -99,10 +103,13 @@ async fn smoke_test() { ) .await .unwrap(); + data.keys .remove(&CONTROLFILE_KEY) .expect("must be present") - .expect("should not error") + .collect_pending_ios() + .await + .expect("must not error") .img .take() .expect("tenant harness writes the control file") @@ -121,7 +128,7 @@ async fn smoke_test() { // on accesses when the layer is evicted, it will automatically be downloaded. 
let img_after = { - let mut data = ValuesReconstructState::default(); + let mut data = ValuesReconstructState::new(io_concurrency.clone()); layer .get_values_reconstruct_data( controlfile_keyspace.clone(), @@ -135,7 +142,9 @@ async fn smoke_test() { data.keys .remove(&CONTROLFILE_KEY) .expect("must be present") - .expect("should not error") + .collect_pending_ios() + .await + .expect("must not error") .img .take() .expect("tenant harness writes the control file") diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index e83b516d79..5d348ac474 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -20,6 +20,7 @@ use camino::Utf8Path; use chrono::{DateTime, Utc}; use enumset::EnumSet; use fail::fail_point; +use futures::{stream::FuturesUnordered, StreamExt}; use handle::ShardTimelineId; use offload::OffloadError; use once_cell::sync::Lazy; @@ -74,6 +75,7 @@ use std::{ ops::{Deref, Range}, }; +use crate::l0_flush::{self, L0FlushGlobalState}; use crate::{ aux_file::AuxFileSizeEstimator, page_service::TenantManagerTypes, @@ -81,7 +83,10 @@ use crate::{ config::AttachmentMode, layer_map::{LayerMap, SearchResult}, metadata::TimelineMetadata, - storage_layer::{inmemory_layer::IndexEntry, PersistentLayerDesc}, + storage_layer::{ + inmemory_layer::IndexEntry, IoConcurrency, PersistentLayerDesc, + ValueReconstructSituation, + }, }, walingest::WalLagCooldown, walredo, @@ -102,10 +107,6 @@ use crate::{ use crate::{ disk_usage_eviction_task::EvictionCandidate, tenant::storage_layer::delta_layer::DeltaEntry, }; -use crate::{ - l0_flush::{self, L0FlushGlobalState}, - metrics::GetKind, -}; use crate::{ metrics::ScanLatencyOngoingRecording, tenant::timeline::logical_size::CurrentLogicalSize, }; @@ -1005,9 +1006,7 @@ impl Timeline { ranges: vec![key..key.next()], }; - // Initialise the reconstruct state for the key with the cache - // entry returned above. - let mut reconstruct_state = ValuesReconstructState::new(); + let mut reconstruct_state = ValuesReconstructState::new(IoConcurrency::sequential()); let vectored_res = self .get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx) @@ -1050,6 +1049,7 @@ impl Timeline { &self, keyspace: KeySpace, lsn: Lsn, + io_concurrency: super::storage_layer::IoConcurrency, ctx: &RequestContext, ) -> Result>, GetVectoredError> { if !lsn.is_valid() { @@ -1084,7 +1084,7 @@ impl Timeline { .get_vectored_impl( keyspace.clone(), lsn, - &mut ValuesReconstructState::new(), + &mut ValuesReconstructState::new(io_concurrency), ctx, ) .await; @@ -1109,6 +1109,7 @@ impl Timeline { keyspace: KeySpace, lsn: Lsn, ctx: &RequestContext, + io_concurrency: super::storage_layer::IoConcurrency, ) -> Result>, GetVectoredError> { if !lsn.is_valid() { return Err(GetVectoredError::InvalidLsn(lsn)); @@ -1140,7 +1141,7 @@ impl Timeline { .get_vectored_impl( keyspace.clone(), lsn, - &mut ValuesReconstructState::default(), + &mut ValuesReconstructState::new(io_concurrency), ctx, ) .await; @@ -1159,39 +1160,56 @@ impl Timeline { reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, ) -> Result>, GetVectoredError> { - let get_kind = if keyspace.total_raw_size() == 1 { - GetKind::Singular - } else { - GetKind::Vectored + let traversal_res: Result<(), _> = self + .get_vectored_reconstruct_data(keyspace.clone(), lsn, reconstruct_state, ctx) + .await; + if let Err(err) = traversal_res { + // Wait for all the spawned IOs to complete. + // See comments on `spawn_io` inside `storage_layer` for more details. 
+ let mut collect_futs = std::mem::take(&mut reconstruct_state.keys) + .into_values() + .map(|state| state.collect_pending_ios()) + .collect::>(); + while collect_futs.next().await.is_some() {} + return Err(err); }; - let get_data_timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME - .for_get_kind(get_kind) - .start_timer(); - self.get_vectored_reconstruct_data(keyspace.clone(), lsn, reconstruct_state, ctx) - .await?; - get_data_timer.stop_and_record(); - - let reconstruct_timer = crate::metrics::RECONSTRUCT_TIME - .for_get_kind(get_kind) - .start_timer(); - let mut results: BTreeMap> = BTreeMap::new(); let layers_visited = reconstruct_state.get_layers_visited(); - for (key, res) in std::mem::take(&mut reconstruct_state.keys) { - match res { - Err(err) => { - results.insert(key, Err(err)); - } - Ok(state) => { - let state = ValueReconstructState::from(state); + let futs = FuturesUnordered::new(); + for (key, state) in std::mem::take(&mut reconstruct_state.keys) { + futs.push({ + let walredo_self = self.myself.upgrade().expect("&self method holds the arc"); + async move { + assert_eq!(state.situation, ValueReconstructSituation::Complete); - let reconstruct_res = self.reconstruct_value(key, lsn, state).await; - results.insert(key, reconstruct_res); + let converted = match state.collect_pending_ios().await { + Ok(ok) => ok, + Err(err) => { + return (key, Err(err)); + } + }; + + // The walredo module expects the records to be descending in terms of Lsn. + // And we submit the IOs in that order, so, there shuold be no need to sort here. + debug_assert!( + converted + .records + .is_sorted_by_key(|(lsn, _)| std::cmp::Reverse(*lsn)), + "{converted:?}" + ); + + ( + key, + walredo_self.reconstruct_value(key, lsn, converted).await, + ) } - } + }); } - reconstruct_timer.stop_and_record(); + + let results = futs + .collect::>>() + .await; // For aux file keys (v1 or v2) the vectored read path does not return an error // when they're missing. Instead they are omitted from the resulting btree @@ -2873,6 +2891,14 @@ impl Timeline { crate::metrics::initial_logical_size::START_CALCULATION.retry(circumstances) }; + let io_concurrency = IoConcurrency::spawn_from_conf( + self_ref.conf, + self_ref + .gate + .enter() + .map_err(|_| CalculateLogicalSizeError::Cancelled)?, + ); + let calculated_size = self_ref .logical_size_calculation_task( initial_part_end, @@ -2882,7 +2908,11 @@ impl Timeline { .await?; self_ref - .trigger_aux_file_size_computation(initial_part_end, background_ctx) + .trigger_aux_file_size_computation( + initial_part_end, + background_ctx, + io_concurrency, + ) .await?; // TODO: add aux file size to logical size @@ -4115,6 +4145,7 @@ impl Timeline { /// Create image layers for Postgres data. Assumes the caller passes a partition that is not too large, /// so that at most one image layer will be produced from this function. 
+ #[allow(clippy::too_many_arguments)] async fn create_image_layer_for_rel_blocks( self: &Arc, partition: &KeySpace, @@ -4123,6 +4154,7 @@ impl Timeline { ctx: &RequestContext, img_range: Range, start: Key, + io_concurrency: IoConcurrency, ) -> Result { let mut wrote_keys = false; @@ -4151,7 +4183,12 @@ impl Timeline { || (last_key_in_range && key_request_accum.raw_size() > 0) { let results = self - .get_vectored(key_request_accum.consume_keyspace(), lsn, ctx) + .get_vectored( + key_request_accum.consume_keyspace(), + lsn, + io_concurrency.clone(), + ctx, + ) .await?; if self.cancel.is_cancelled() { @@ -4230,9 +4267,10 @@ impl Timeline { img_range: Range, mode: ImageLayerCreationMode, start: Key, + io_concurrency: IoConcurrency, ) -> Result { // Metadata keys image layer creation. - let mut reconstruct_state = ValuesReconstructState::default(); + let mut reconstruct_state = ValuesReconstructState::new(io_concurrency); let begin = Instant::now(); let data = self .get_vectored_impl(partition.clone(), lsn, &mut reconstruct_state, ctx) @@ -4449,6 +4487,13 @@ impl Timeline { ))) }); + let io_concurrency = IoConcurrency::spawn_from_conf( + self.conf, + self.gate + .enter() + .map_err(|_| CreateImageLayersError::Cancelled)?, + ); + if !compact_metadata { let ImageLayerCreationOutcome { image, @@ -4461,6 +4506,7 @@ impl Timeline { ctx, img_range, start, + io_concurrency, ) .await?; @@ -4479,6 +4525,7 @@ impl Timeline { img_range, mode, start, + io_concurrency, ) .await?; start = next_start_key; @@ -5746,13 +5793,14 @@ impl Timeline { self: &Arc, lsn: Lsn, ctx: &RequestContext, + io_concurrency: IoConcurrency, ) -> anyhow::Result> { let mut all_data = Vec::new(); let guard = self.layers.read().await; for layer in guard.layer_map()?.iter_historic_layers() { if !layer.is_delta() && layer.image_layer_lsn() == lsn { let layer = guard.get_from_desc(&layer); - let mut reconstruct_data = ValuesReconstructState::default(); + let mut reconstruct_data = ValuesReconstructState::new(io_concurrency.clone()); layer .get_values_reconstruct_data( KeySpace::single(Key::MIN..Key::MAX), @@ -5761,8 +5809,9 @@ impl Timeline { ctx, ) .await?; - for (k, v) in reconstruct_data.keys { - all_data.push((k, v?.img.unwrap().1)); + for (k, v) in std::mem::take(&mut reconstruct_data.keys) { + let v = v.collect_pending_ios().await?; + all_data.push((k, v.img.unwrap().1)); } } } diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 06a21f6b3c..57fc415d06 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -42,8 +42,8 @@ use crate::tenant::storage_layer::merge_iterator::MergeIterator; use crate::tenant::storage_layer::{ AsLayerDesc, PersistentLayerDesc, PersistentLayerKey, ValueReconstructState, }; -use crate::tenant::timeline::ImageLayerCreationOutcome; use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter}; +use crate::tenant::timeline::{ImageLayerCreationOutcome, IoConcurrency}; use crate::tenant::timeline::{Layer, ResidentLayer}; use crate::tenant::{gc_block, DeltaLayer, MaybeOffloaded}; use crate::virtual_file::{MaybeFatalIo, VirtualFile}; @@ -3170,6 +3170,7 @@ impl TimelineAdaptor { ctx, key_range.clone(), start, + IoConcurrency::sequential(), ) .await?; diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs index dfe2352310..47fb4a276b 100644 --- a/pageserver/src/tenant/vectored_blob_io.rs +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ 
-35,6 +35,7 @@ use crate::virtual_file::{self, VirtualFile}; pub struct BlobMeta { pub key: Key, pub lsn: Lsn, + pub will_init: bool, } /// A view into the vectored blobs read buffer. @@ -310,7 +311,15 @@ pub enum BlobFlag { /// * Iterate over the collected blobs and coalesce them into reads at the end pub struct VectoredReadPlanner { // Track all the blob offsets. Start offsets must be ordered. - blobs: BTreeMap>, + // Values in the value tuples are: + // ( + // lsn of the blob, + // start offset of the blob in the underlying file, + // end offset of the blob in the underlying file, + // whether the blob initializes the page image or not + // see [`pageserver_api::record::NeonWalRecord::will_init`] + // ) + blobs: BTreeMap>, // Arguments for previous blob passed into [`VectoredReadPlanner::handle`] prev: Option<(Key, Lsn, u64, BlobFlag)>, @@ -371,12 +380,12 @@ impl VectoredReadPlanner { match flag { BlobFlag::None => { let blobs_for_key = self.blobs.entry(key).or_default(); - blobs_for_key.push((lsn, start_offset, end_offset)); + blobs_for_key.push((lsn, start_offset, end_offset, false)); } BlobFlag::ReplaceAll => { let blobs_for_key = self.blobs.entry(key).or_default(); blobs_for_key.clear(); - blobs_for_key.push((lsn, start_offset, end_offset)); + blobs_for_key.push((lsn, start_offset, end_offset, true)); } BlobFlag::Ignore => {} } @@ -387,11 +396,17 @@ impl VectoredReadPlanner { let mut reads = Vec::new(); for (key, blobs_for_key) in self.blobs { - for (lsn, start_offset, end_offset) in blobs_for_key { + for (lsn, start_offset, end_offset, will_init) in blobs_for_key { let extended = match &mut current_read_builder { - Some(read_builder) => { - read_builder.extend(start_offset, end_offset, BlobMeta { key, lsn }) - } + Some(read_builder) => read_builder.extend( + start_offset, + end_offset, + BlobMeta { + key, + lsn, + will_init, + }, + ), None => VectoredReadExtended::No, }; @@ -399,7 +414,11 @@ impl VectoredReadPlanner { let next_read_builder = ChunkedVectoredReadBuilder::new( start_offset, end_offset, - BlobMeta { key, lsn }, + BlobMeta { + key, + lsn, + will_init, + }, self.max_read_size, ); @@ -527,7 +546,7 @@ impl<'a> VectoredBlobReader<'a> { pub struct StreamingVectoredReadPlanner { read_builder: Option, // Arguments for previous blob passed into [`StreamingVectoredReadPlanner::handle`] - prev: Option<(Key, Lsn, u64)>, + prev: Option<(Key, Lsn, u64, bool)>, /// Max read size per batch. This is not a strict limit. If there are [0, 100) and [100, 200), while the `max_read_size` is 150, /// we will produce a single batch instead of split them. 
max_read_size: u64, @@ -550,27 +569,47 @@ impl StreamingVectoredReadPlanner { } } - pub fn handle(&mut self, key: Key, lsn: Lsn, offset: u64) -> Option { + pub fn handle( + &mut self, + key: Key, + lsn: Lsn, + offset: u64, + will_init: bool, + ) -> Option { // Implementation note: internally lag behind by one blob such that // we have a start and end offset when initialising [`VectoredRead`] - let (prev_key, prev_lsn, prev_offset) = match self.prev { + let (prev_key, prev_lsn, prev_offset, prev_will_init) = match self.prev { None => { - self.prev = Some((key, lsn, offset)); + self.prev = Some((key, lsn, offset, will_init)); return None; } Some(prev) => prev, }; - let res = self.add_blob(prev_key, prev_lsn, prev_offset, offset, false); + let res = self.add_blob( + prev_key, + prev_lsn, + prev_offset, + offset, + false, + prev_will_init, + ); - self.prev = Some((key, lsn, offset)); + self.prev = Some((key, lsn, offset, will_init)); res } pub fn handle_range_end(&mut self, offset: u64) -> Option { - let res = if let Some((prev_key, prev_lsn, prev_offset)) = self.prev { - self.add_blob(prev_key, prev_lsn, prev_offset, offset, true) + let res = if let Some((prev_key, prev_lsn, prev_offset, prev_will_init)) = self.prev { + self.add_blob( + prev_key, + prev_lsn, + prev_offset, + offset, + true, + prev_will_init, + ) } else { None }; @@ -587,10 +626,19 @@ impl StreamingVectoredReadPlanner { start_offset: u64, end_offset: u64, is_last_blob_in_read: bool, + will_init: bool, ) -> Option { match &mut self.read_builder { Some(read_builder) => { - let extended = read_builder.extend(start_offset, end_offset, BlobMeta { key, lsn }); + let extended = read_builder.extend( + start_offset, + end_offset, + BlobMeta { + key, + lsn, + will_init, + }, + ); assert_eq!(extended, VectoredReadExtended::Yes); } None => { @@ -598,7 +646,11 @@ impl StreamingVectoredReadPlanner { Some(ChunkedVectoredReadBuilder::new_streaming( start_offset, end_offset, - BlobMeta { key, lsn }, + BlobMeta { + key, + lsn, + will_init, + }, )) }; } @@ -812,7 +864,7 @@ mod tests { let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 1000); let mut reads = Vec::new(); for (key, lsn, offset, _) in blob_descriptions.clone() { - reads.extend(planner.handle(key, lsn, offset)); + reads.extend(planner.handle(key, lsn, offset, false)); } reads.extend(planner.handle_range_end(652 * 1024)); @@ -850,7 +902,7 @@ mod tests { let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 2); let mut reads = Vec::new(); for (key, lsn, offset, _) in blob_descriptions.clone() { - reads.extend(planner.handle(key, lsn, offset)); + reads.extend(planner.handle(key, lsn, offset, false)); } reads.extend(planner.handle_range_end(652 * 1024)); @@ -875,7 +927,7 @@ mod tests { { let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 1); let mut reads = Vec::new(); - reads.extend(planner.handle(key, lsn, 0)); + reads.extend(planner.handle(key, lsn, 0, false)); reads.extend(planner.handle_range_end(652 * 1024)); assert_eq!(reads.len(), 1); validate_read(&reads[0], &[(key, lsn, 0, BlobFlag::None)]); @@ -883,8 +935,8 @@ mod tests { { let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 1); let mut reads = Vec::new(); - reads.extend(planner.handle(key, lsn, 0)); - reads.extend(planner.handle(key, lsn, 128 * 1024)); + reads.extend(planner.handle(key, lsn, 0, false)); + reads.extend(planner.handle(key, lsn, 128 * 1024, false)); reads.extend(planner.handle_range_end(652 * 1024)); assert_eq!(reads.len(), 2); validate_read(&reads[0], 
&[(key, lsn, 0, BlobFlag::None)]); @@ -893,8 +945,8 @@ mod tests { { let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 2); let mut reads = Vec::new(); - reads.extend(planner.handle(key, lsn, 0)); - reads.extend(planner.handle(key, lsn, 128 * 1024)); + reads.extend(planner.handle(key, lsn, 0, false)); + reads.extend(planner.handle(key, lsn, 128 * 1024, false)); reads.extend(planner.handle_range_end(652 * 1024)); assert_eq!(reads.len(), 1); validate_read( @@ -923,6 +975,7 @@ mod tests { let meta = BlobMeta { key: Key::MIN, lsn: Lsn(0), + will_init: false, }; for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() { diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index ad7bcc0714..e0283d99e0 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -499,7 +499,13 @@ impl WalIngest { let content = modification .tline - .get_rel_page_at_lsn(src_rel, blknum, Version::Modified(modification), ctx) + .get_rel_page_at_lsn( + src_rel, + blknum, + Version::Modified(modification), + ctx, + crate::tenant::storage_layer::IoConcurrency::sequential(), + ) .await?; modification.put_rel_page_image(dst_rel, blknum, content)?; num_blocks_copied += 1; @@ -1489,6 +1495,7 @@ mod tests { use super::*; use crate::tenant::harness::*; use crate::tenant::remote_timeline_client::{remote_initdb_archive_path, INITDB_PATH}; + use crate::tenant::storage_layer::IoConcurrency; use postgres_ffi::RELSEG_SIZE; use crate::DEFAULT_PG_VERSION; @@ -1532,6 +1539,7 @@ mod tests { #[tokio::test] async fn test_relsize() -> Result<()> { let (tenant, ctx) = TenantHarness::create("test_relsize").await?.load().await; + let io_concurrency = IoConcurrency::spawn_for_test(); let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx) .await?; @@ -1599,7 +1607,13 @@ mod tests { // Check page contents at each LSN assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x20)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + 0, + Version::Lsn(Lsn(0x20)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img("foo blk 0 at 2") @@ -1607,7 +1621,13 @@ mod tests { assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x30)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + 0, + Version::Lsn(Lsn(0x30)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img("foo blk 0 at 3") @@ -1615,14 +1635,26 @@ mod tests { assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x40)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + 0, + Version::Lsn(Lsn(0x40)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x40)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + 1, + Version::Lsn(Lsn(0x40)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img("foo blk 1 at 4") @@ -1630,21 +1662,39 @@ mod tests { assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x50)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + 0, + Version::Lsn(Lsn(0x50)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x50)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + 1, + Version::Lsn(Lsn(0x50)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img("foo blk 1 
at 4") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + 2, + Version::Lsn(Lsn(0x50)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img("foo blk 2 at 5") @@ -1667,14 +1717,26 @@ mod tests { ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x60)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + 0, + Version::Lsn(Lsn(0x60)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x60)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + 1, + Version::Lsn(Lsn(0x60)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img("foo blk 1 at 4") @@ -1689,7 +1751,13 @@ mod tests { ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + 2, + Version::Lsn(Lsn(0x50)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img("foo blk 2 at 5") @@ -1722,14 +1790,26 @@ mod tests { ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x70)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + 0, + Version::Lsn(Lsn(0x70)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, ZERO_PAGE ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x70)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + 1, + Version::Lsn(Lsn(0x70)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img("foo blk 1") @@ -1750,7 +1830,13 @@ mod tests { for blk in 2..1500 { assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blk, Version::Lsn(Lsn(0x80)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + blk, + Version::Lsn(Lsn(0x80)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, ZERO_PAGE @@ -1758,7 +1844,13 @@ mod tests { } assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1500, Version::Lsn(Lsn(0x80)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + 1500, + Version::Lsn(Lsn(0x80)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img("foo blk 1500") @@ -1851,6 +1943,7 @@ mod tests { .await? 
.load() .await; + let io_concurrency = IoConcurrency::spawn_for_test(); let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx) .await?; @@ -1903,7 +1996,13 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(lsn), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + blkno, + Version::Lsn(lsn), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img(&data) @@ -1931,7 +2030,13 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x60)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + blkno, + Version::Lsn(Lsn(0x60)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img(&data) @@ -1950,7 +2055,13 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x50)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + blkno, + Version::Lsn(Lsn(0x50)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img(&data) @@ -1987,7 +2098,13 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x80)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + blkno, + Version::Lsn(Lsn(0x80)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img(&data) diff --git a/test_runner/fixtures/common_types.py b/test_runner/fixtures/common_types.py index 6c22b31e00..c82c7578d1 100644 --- a/test_runner/fixtures/common_types.py +++ b/test_runner/fixtures/common_types.py @@ -208,6 +208,10 @@ class ShardIndex: shard_count=int(input[2:4], 16), ) + @property + def is_sharded(self) -> bool: + return self.shard_count != 0 + class TenantShardId: def __init__(self, tenant_id: TenantId, shard_number: int, shard_count: int): diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index fa541bad17..fd7e193778 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -126,12 +126,8 @@ PAGESERVER_GLOBAL_METRICS: tuple[str, ...] 
= ( "pageserver_page_cache_read_accesses_total", "pageserver_page_cache_size_current_bytes", "pageserver_page_cache_size_max_bytes", - "pageserver_getpage_reconstruct_seconds_bucket", - "pageserver_getpage_reconstruct_seconds_count", - "pageserver_getpage_reconstruct_seconds_sum", *[f"pageserver_basebackup_query_seconds_{x}" for x in ["bucket", "count", "sum"]], *histogram("pageserver_smgr_query_seconds_global"), - *histogram("pageserver_getpage_get_reconstruct_data_seconds"), *histogram("pageserver_wait_lsn_seconds"), *histogram("pageserver_remote_operation_seconds"), *histogram("pageserver_io_operations_seconds"), diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index d79c2a5ea8..af427b92d2 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -313,6 +313,10 @@ class PgProtocol: """ return self.safe_psql(query, log_query=log_query)[0][0] + def show_timeline_id(self) -> TimelineId: + """SHOW neon.timeline_id""" + return TimelineId(cast("str", self.safe_psql("show neon.timeline_id")[0][0])) + class PageserverWalReceiverProtocol(StrEnum): VANILLA = "vanilla" @@ -387,6 +391,7 @@ class NeonEnvBuilder: storage_controller_port_override: int | None = None, pageserver_virtual_file_io_mode: str | None = None, pageserver_wal_receiver_protocol: PageserverWalReceiverProtocol | None = None, + pageserver_get_vectored_concurrent_io: str | None = None, ): self.repo_dir = repo_dir self.rust_log_override = rust_log_override @@ -426,6 +431,9 @@ class NeonEnvBuilder: self.storage_controller_config: dict[Any, Any] | None = None self.pageserver_virtual_file_io_engine: str | None = pageserver_virtual_file_io_engine + self.pageserver_get_vectored_concurrent_io: str | None = ( + pageserver_get_vectored_concurrent_io + ) self.pageserver_default_tenant_config_compaction_algorithm: dict[str, Any] | None = ( pageserver_default_tenant_config_compaction_algorithm @@ -452,6 +460,7 @@ class NeonEnvBuilder: self.test_name = test_name self.compatibility_neon_binpath = compatibility_neon_binpath self.compatibility_pg_distrib_dir = compatibility_pg_distrib_dir + self.test_may_use_compatibility_snapshot_binaries = False self.version_combination = combination self.mixdir = self.test_output_dir / "mixdir_neon" if self.version_combination is not None: @@ -463,6 +472,7 @@ class NeonEnvBuilder: ), "the environment variable COMPATIBILITY_POSTGRES_DISTRIB_DIR is required when using mixed versions" self.mixdir.mkdir(mode=0o755, exist_ok=True) self._mix_versions() + self.test_may_use_compatibility_snapshot_binaries = True def init_configs(self, default_remote_storage_if_missing: bool = True) -> NeonEnv: # Cannot create more than one environment from one builder @@ -1062,6 +1072,7 @@ class NeonEnv: self.pageserver_virtual_file_io_engine = config.pageserver_virtual_file_io_engine self.pageserver_virtual_file_io_mode = config.pageserver_virtual_file_io_mode self.pageserver_wal_receiver_protocol = config.pageserver_wal_receiver_protocol + self.pageserver_get_vectored_concurrent_io = config.pageserver_get_vectored_concurrent_io # Create the neon_local's `NeonLocalInitConf` cfg: dict[str, Any] = { @@ -1121,6 +1132,20 @@ class NeonEnv: "max_batch_size": 32, } + # Concurrent IO (https://github.com/neondatabase/neon/issues/9378): + # enable concurrent IO by default in tests and benchmarks. + # Compat tests are exempt because old versions fail to parse the new config. 
+ get_vectored_concurrent_io = self.pageserver_get_vectored_concurrent_io + if config.test_may_use_compatibility_snapshot_binaries: + log.info( + "Forcing use of binary-built-in default to avoid forward-compatibility related test failures" + ) + get_vectored_concurrent_io = None + if get_vectored_concurrent_io is not None: + ps_cfg["get_vectored_concurrent_io"] = { + "mode": self.pageserver_get_vectored_concurrent_io, + } + if self.pageserver_virtual_file_io_engine is not None: ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine if config.pageserver_default_tenant_config_compaction_algorithm is not None: @@ -1455,6 +1480,7 @@ def neon_simple_env( pageserver_virtual_file_io_engine: str, pageserver_default_tenant_config_compaction_algorithm: dict[str, Any] | None, pageserver_virtual_file_io_mode: str | None, + pageserver_get_vectored_concurrent_io: str | None, ) -> Iterator[NeonEnv]: """ Simple Neon environment, with 1 safekeeper and 1 pageserver. No authentication, no fsync. @@ -1487,6 +1513,7 @@ def neon_simple_env( pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine, pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm, pageserver_virtual_file_io_mode=pageserver_virtual_file_io_mode, + pageserver_get_vectored_concurrent_io=pageserver_get_vectored_concurrent_io, combination=combination, ) as builder: env = builder.init_start() @@ -1513,6 +1540,7 @@ def neon_env_builder( pageserver_default_tenant_config_compaction_algorithm: dict[str, Any] | None, record_property: Callable[[str, object], None], pageserver_virtual_file_io_mode: str | None, + pageserver_get_vectored_concurrent_io: str | None, ) -> Iterator[NeonEnvBuilder]: """ Fixture to create a Neon environment for test. 
@@ -1555,6 +1583,7 @@ def neon_env_builder( test_overlay_dir=test_overlay_dir, pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm, pageserver_virtual_file_io_mode=pageserver_virtual_file_io_mode, + pageserver_get_vectored_concurrent_io=pageserver_get_vectored_concurrent_io, ) as builder: yield builder # Propogate `preserve_database_files` to make it possible to use in other fixtures, diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py index f57c0f801f..3404c16f55 100644 --- a/test_runner/fixtures/parametrize.py +++ b/test_runner/fixtures/parametrize.py @@ -44,6 +44,11 @@ def pageserver_virtual_file_io_mode() -> str | None: return os.getenv("PAGESERVER_VIRTUAL_FILE_IO_MODE") +@pytest.fixture(scope="function", autouse=True) +def pageserver_get_vectored_concurrent_io() -> str | None: + return os.getenv("PAGESERVER_GET_VECTORED_CONCURRENT_IO") + + def get_pageserver_default_tenant_config_compaction_algorithm() -> dict[str, Any] | None: toml_table = os.getenv("PAGESERVER_DEFAULT_TENANT_CONFIG_COMPACTION_ALGORITHM") if toml_table is None: diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index a6eaaf6c4c..ac44630d30 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -251,6 +251,8 @@ def test_forward_compatibility( os.environ.get("ALLOW_FORWARD_COMPATIBILITY_BREAKAGE", "false").lower() == "true" ) + neon_env_builder.test_may_use_compatibility_snapshot_binaries = True + try: neon_env_builder.num_safekeepers = 3 From c283aaaf8d66dd04ce463733cf6545269f70f4c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?JC=20Gr=C3=BCnhage?= Date: Wed, 22 Jan 2025 17:09:41 +0100 Subject: [PATCH 242/583] Tag images from docker-hub in promote-images-prod (#10475) ## Problem https://github.com/neondatabase/neon/actions/runs/12896686483/job/35961290336#step:5:107 showed that `promote-images-prod` was missing another dependency. ## Summary of changes Modify `promote-images-prod` to tag based on docker-hub images, so that `promote-images-prod` does not rely on `promote-images-dev`. The result should be the exact same, but allows the two jobs to run in parallel. 
--- .github/workflows/build_and_test.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 4fc81dccaa..b1230879d3 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -892,14 +892,14 @@ jobs: run: | for repo in neondatabase 369495373322.dkr.ecr.eu-central-1.amazonaws.com; do docker buildx imagetools create -t $repo/neon:latest \ - $repo/neon:${{ needs.tag.outputs.build-tag }} + neondatabase/neon:${{ needs.tag.outputs.build-tag }} for version in ${VERSIONS}; do docker buildx imagetools create -t $repo/compute-node-${version}:latest \ - $repo/compute-node-${version}:${{ needs.tag.outputs.build-tag }} + neondatabase/compute-node-${version}:${{ needs.tag.outputs.build-tag }} docker buildx imagetools create -t $repo/vm-compute-node-${version}:latest \ - $repo/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }} + neondatabase/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }} done done docker buildx imagetools create -t neondatabase/neon-test-extensions-v16:latest \ From f1473dd438ebfeb2eeae7f9d619cd0478c47e470 Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Wed, 22 Jan 2025 17:34:57 +0100 Subject: [PATCH 243/583] Fix the connection error for extension tests (#10480) ## Problem The trust connection to the compute required for `pg_anon` was removed. However, the PGPASSWORD environment variable was not added to `docker-compose.yml`. This caused connection errors, which were interpreted as success due to errors in the bash script. ## Summary of changes The environment variable was added, and the logic in the bash script was fixed. --- docker-compose/docker-compose.yml | 2 ++ docker-compose/run-tests.sh | 5 ++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/docker-compose/docker-compose.yml b/docker-compose/docker-compose.yml index 6e15fdbe0d..4f0a887c27 100644 --- a/docker-compose/docker-compose.yml +++ b/docker-compose/docker-compose.yml @@ -185,6 +185,8 @@ services: neon-test-extensions: profiles: ["test-extensions"] image: ${REPOSITORY:-neondatabase}/neon-test-extensions-v${PG_TEST_VERSION:-16}:${TAG:-latest} + environment: + - PGPASSWORD=cloud_admin entrypoint: - "/bin/bash" - "-c" diff --git a/docker-compose/run-tests.sh b/docker-compose/run-tests.sh index 3fc0b90071..9873187b62 100644 --- a/docker-compose/run-tests.sh +++ b/docker-compose/run-tests.sh @@ -7,7 +7,10 @@ LIST=$( (echo -e "${SKIP//","/"\n"}"; ls -d -- *-src) | sort | uniq -u) for d in ${LIST} do [ -d "${d}" ] || continue - psql -c "select 1" >/dev/null || break + if ! psql -w -c "select 1" >/dev/null; then + FAILED="${d} ${FAILED}" + break + fi USE_PGXS=1 make -C "${d}" installcheck || FAILED="${d} ${FAILED}" done [ -z "${FAILED}" ] && exit 0 From c60b91369aef527f63aebc99d1e6e148dbece506 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 22 Jan 2025 19:52:16 +0100 Subject: [PATCH 244/583] Expose safekeeper APIs for creation and deletion (#10478) Add APIs for timeline creation and deletion to the safekeeper client crate. Going to be used later in #10440. Split off from #10440. 
Part of https://github.com/neondatabase/neon/issues/9011 --- safekeeper/client/src/mgmt_api.rs | 32 ++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/safekeeper/client/src/mgmt_api.rs b/safekeeper/client/src/mgmt_api.rs index 5727f32509..f65bfaa6d5 100644 --- a/safekeeper/client/src/mgmt_api.rs +++ b/safekeeper/client/src/mgmt_api.rs @@ -4,7 +4,7 @@ //! united. use reqwest::{IntoUrl, Method, StatusCode}; -use safekeeper_api::models::TimelineStatus; +use safekeeper_api::models::{TimelineCreateRequest, TimelineStatus}; use std::error::Error as _; use utils::{ http::error::HttpErrorBody, @@ -76,6 +76,28 @@ impl Client { } } + pub async fn create_timeline(&self, req: &TimelineCreateRequest) -> Result { + let uri = format!( + "{}/v1/tenant/{}/timeline/{}", + self.mgmt_api_endpoint, req.tenant_id, req.timeline_id + ); + let resp = self.post(&uri, req).await?; + resp.json().await.map_err(Error::ReceiveBody) + } + + pub async fn delete_timeline( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{}/timeline/{}", + self.mgmt_api_endpoint, tenant_id, timeline_id + ); + let resp = self.request(Method::DELETE, &uri, ()).await?; + resp.json().await.map_err(Error::ReceiveBody) + } + pub async fn timeline_status( &self, tenant_id: TenantId, @@ -107,6 +129,14 @@ impl Client { self.get(&uri).await } + async fn post( + &self, + uri: U, + body: B, + ) -> Result { + self.request(Method::POST, uri, body).await + } + async fn get(&self, uri: U) -> Result { self.request(Method::GET, uri, ()).await } From 0af40b5494a56ca19b982ad7b53045768fd79eb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 22 Jan 2025 20:45:12 +0100 Subject: [PATCH 245/583] Only churn rows once in test_scrubber_physical_gc_ancestors (#10481) ## Problem PR #10457 was supposed to fix the flakiness of `test_scrubber_physical_gc_ancestors`, but instead it made it even more flaky. However, the original error causes disappeared, now to be replaced by key not found errors. See this for a longer explanation: https://github.com/neondatabase/neon/issues/10391#issuecomment-2608018967 ## Solution This does one churn rows after all compactions, and before we do any timeline gc's. That way, we remain more accessible at older lsn's. --- test_runner/regress/test_storage_scrubber.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index a782e85567..1304d302b7 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -271,8 +271,14 @@ def test_scrubber_physical_gc_ancestors(neon_env_builder: NeonEnvBuilder, shard_ ps.http_client().timeline_compact( shard, timeline_id, force_image_layer_creation=True, wait_until_uploaded=True ) - # Add some WAL so that we don't gc at the latest remote consistent lsn - workload.churn_rows(1) + + # Add some WAL so that we don't gc at the latest remote consistent lsn + workload.churn_rows(10) + + # Now gc the old stuff away + for shard in shards: + ps = env.get_tenant_pageserver(shard) + assert ps is not None ps.http_client().timeline_gc(shard, timeline_id, 0) # We will use a min_age_secs=1 threshold for deletion, let it pass From 92d95b08cfba6973bc735538fe5778f40b0dd45c Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." 
<4198311+skyzh@users.noreply.github.com> Date: Wed, 22 Jan 2025 19:15:46 -0500 Subject: [PATCH 246/583] fix(pageserver): extend split job key range to the end (#10484) ## Problem Not really a bug fix, but hopefully can reproduce https://github.com/neondatabase/neon/issues/10482 more. If the layer map does not contain layers that end at exactly the end range of the compaction job, the current split algorithm will produce the last job that ends at the maximum layer key. This patch extends it all the way to the compaction job end key. For example, the user requests a compaction of 0000...FFFF. However, we only have a layer 0000..3000 in the layer map, and the split job will have a range of 0000..3000 instead of 0000..FFFF. This is not a correctness issue but it would be better to fix it so that we can get consistent job splits. ## Summary of changes Compaction job split will always cover the full specified key range. Signed-off-by: Alex Chi Z --- pageserver/src/tenant/timeline/compaction.rs | 6 ++++++ test_runner/regress/test_compaction.py | 9 +++++---- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 57fc415d06..4d5dc2d8a9 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -2212,6 +2212,12 @@ impl Timeline { } else { end }; + let end = if ranges_num == idx + 1 { + // extend the compaction range to the end of the key range if it's the last partition + end.max(job.compact_key_range.end) + } else { + end + }; info!( "splitting compaction job: {}..{}, estimated_size={}", start, end, total_size diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index d0a2349ccf..fde26e1533 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -150,8 +150,7 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder, with_b child_workloads: list[Workload] = [] for i in range(1, churn_rounds + 1): - if i % 10 == 0: - log.info(f"Running churn round {i}/{churn_rounds} ...") + log.info(f"Running churn round {i}/{churn_rounds} ...") if i % 10 == 5 and with_branches == "with_branches": branch_name = f"child-{i}" branch_timeline_id = env.create_branch(branch_name) @@ -172,8 +171,10 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder, with_b "sub_compaction_max_job_size_mb": 16, }, ) - - workload.churn_rows(row_count, env.pageserver.id) + # do not wait for upload so that we can see if gc_compaction works well with data being ingested + workload.churn_rows(row_count, env.pageserver.id, upload=False) + time.sleep(1) + workload.validate(env.pageserver.id) def compaction_finished(): queue_depth = len(ps_http.timeline_compact_info(tenant_id, timeline_id)) From 8e8df1b4539403b294a02332bbc14252b81b3cc9 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 23 Jan 2025 11:02:15 +0000 Subject: [PATCH 247/583] Disable logical replication subscribers (#10249) Drop logical replication subscribers before compute starts on a non-main branch. Add new compute_ctl spec flag: drop_subscriptions_before_start If it is set, drop all the subscriptions from the compute node before it starts. To avoid race on compute start, use new GUC neon.disable_logical_replication_subscribers to temporarily disable logical replication workers until we drop the subscriptions. 
Ensure that we drop subscriptions exactly once when endpoint starts on a new branch. It is essential, because otherwise, we may drop not only inherited, but newly created subscriptions. We cannot rely only on spec.drop_subscriptions_before_start flag, because if for some reason compute restarts inside VM, it will start again with the same spec and flag value. To handle this, we save the fact of the operation in the database in the neon.drop_subscriptions_done table. If the table does not exist, we assume that the operation was never performed, so we must do it. If table exists, we check if the operation was performed on the current timeline. fixes: https://github.com/neondatabase/neon/issues/8790 --- compute_tools/src/compute.rs | 104 ++++++-- compute_tools/src/config.rs | 7 + compute_tools/src/spec_apply.rs | 24 +- ...or_drop_dbs.sql => drop_subscriptions.sql} | 0 .../src/sql/finalize_drop_subscriptions.sql | 21 ++ control_plane/src/bin/neon_local.rs | 1 + control_plane/src/endpoint.rs | 7 + libs/compute_api/src/spec.rs | 7 + pgxn/neon/neon.c | 10 + test_runner/fixtures/neon_cli.py | 3 + test_runner/fixtures/neon_fixtures.py | 2 + .../regress/test_subscriber_branching.py | 242 ++++++++++++++++++ vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/postgres-v17 | 2 +- vendor/revisions.json | 8 +- 17 files changed, 413 insertions(+), 31 deletions(-) rename compute_tools/src/sql/{drop_subscription_for_drop_dbs.sql => drop_subscriptions.sql} (100%) create mode 100644 compute_tools/src/sql/finalize_drop_subscriptions.sql create mode 100644 test_runner/regress/test_subscriber_branching.py diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 1ac97a378b..fd76e404c6 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -41,14 +41,14 @@ use crate::local_proxy; use crate::pg_helpers::*; use crate::spec::*; use crate::spec_apply::ApplySpecPhase::{ - CreateAndAlterDatabases, CreateAndAlterRoles, CreateAvailabilityCheck, CreateSuperUser, - DropInvalidDatabases, DropRoles, HandleNeonExtension, HandleOtherExtensions, - RenameAndDeleteDatabases, RenameRoles, RunInEachDatabase, + CreateAndAlterDatabases, CreateAndAlterRoles, CreateAvailabilityCheck, CreateSchemaNeon, + CreateSuperUser, DropInvalidDatabases, DropRoles, FinalizeDropLogicalSubscriptions, + HandleNeonExtension, HandleOtherExtensions, RenameAndDeleteDatabases, RenameRoles, + RunInEachDatabase, }; use crate::spec_apply::PerDatabasePhase; use crate::spec_apply::PerDatabasePhase::{ - ChangeSchemaPerms, DeleteDBRoleReferences, DropSubscriptionsForDeletedDatabases, - HandleAnonExtension, + ChangeSchemaPerms, DeleteDBRoleReferences, DropLogicalSubscriptions, HandleAnonExtension, }; use crate::spec_apply::{apply_operations, MutableApplyContext, DB}; use crate::sync_sk::{check_if_synced, ping_safekeeper}; @@ -340,6 +340,15 @@ impl ComputeNode { self.state.lock().unwrap().status } + pub fn get_timeline_id(&self) -> Option { + self.state + .lock() + .unwrap() + .pspec + .as_ref() + .map(|s| s.timeline_id) + } + // Remove `pgdata` directory and create it again with right permissions. fn create_pgdata(&self) -> Result<()> { // Ignore removal error, likely it is a 'No such file or directory (os error 2)'. @@ -929,6 +938,48 @@ impl ComputeNode { .map(|role| (role.name.clone(), role)) .collect::>(); + // Check if we need to drop subscriptions before starting the endpoint. + // + // It is important to do this operation exactly once when endpoint starts on a new branch. 
+ // Otherwise, we may drop not inherited, but newly created subscriptions. + // + // We cannot rely only on spec.drop_subscriptions_before_start flag, + // because if for some reason compute restarts inside VM, + // it will start again with the same spec and flag value. + // + // To handle this, we save the fact of the operation in the database + // in the neon.drop_subscriptions_done table. + // If the table does not exist, we assume that the operation was never performed, so we must do it. + // If table exists, we check if the operation was performed on the current timelilne. + // + let mut drop_subscriptions_done = false; + + if spec.drop_subscriptions_before_start { + let timeline_id = self.get_timeline_id().context("timeline_id must be set")?; + let query = format!("select 1 from neon.drop_subscriptions_done where timeline_id = '{}'", timeline_id); + + info!("Checking if drop subscription operation was already performed for timeline_id: {}", timeline_id); + + drop_subscriptions_done = match + client.simple_query(&query).await { + Ok(result) => { + matches!(&result[0], postgres::SimpleQueryMessage::Row(_)) + }, + Err(e) => + { + match e.code() { + Some(&SqlState::UNDEFINED_TABLE) => false, + _ => { + // We don't expect any other error here, except for the schema/table not existing + error!("Error checking if drop subscription operation was already performed: {}", e); + return Err(e.into()); + } + } + } + } + }; + + let jwks_roles = Arc::new( spec.as_ref() .local_proxy_config @@ -996,7 +1047,7 @@ impl ComputeNode { jwks_roles.clone(), concurrency_token.clone(), db, - [DropSubscriptionsForDeletedDatabases].to_vec(), + [DropLogicalSubscriptions].to_vec(), ); Ok(spawn(fut)) @@ -1024,6 +1075,7 @@ impl ComputeNode { CreateAndAlterRoles, RenameAndDeleteDatabases, CreateAndAlterDatabases, + CreateSchemaNeon, ] { info!("Applying phase {:?}", &phase); apply_operations( @@ -1064,6 +1116,17 @@ impl ComputeNode { } let conf = Arc::new(conf); + let mut phases = vec![ + DeleteDBRoleReferences, + ChangeSchemaPerms, + HandleAnonExtension, + ]; + + if spec.drop_subscriptions_before_start && !drop_subscriptions_done { + info!("Adding DropLogicalSubscriptions phase because drop_subscriptions_before_start is set"); + phases.push(DropLogicalSubscriptions); + } + let fut = Self::apply_spec_sql_db( spec.clone(), conf, @@ -1071,12 +1134,7 @@ impl ComputeNode { jwks_roles.clone(), concurrency_token.clone(), db, - [ - DeleteDBRoleReferences, - ChangeSchemaPerms, - HandleAnonExtension, - ] - .to_vec(), + phases, ); Ok(spawn(fut)) @@ -1088,12 +1146,20 @@ impl ComputeNode { handle.await??; } - for phase in vec![ + let mut phases = vec![ HandleOtherExtensions, - HandleNeonExtension, + HandleNeonExtension, // This step depends on CreateSchemaNeon CreateAvailabilityCheck, DropRoles, - ] { + ]; + + // This step depends on CreateSchemaNeon + if spec.drop_subscriptions_before_start && !drop_subscriptions_done { + info!("Adding FinalizeDropLogicalSubscriptions phase because drop_subscriptions_before_start is set"); + phases.push(FinalizeDropLogicalSubscriptions); + } + + for phase in phases { debug!("Applying phase {:?}", &phase); apply_operations( spec.clone(), @@ -1463,6 +1529,14 @@ impl ComputeNode { Ok(()) }, )?; + + let postgresql_conf_path = pgdata_path.join("postgresql.conf"); + if config::line_in_file( + &postgresql_conf_path, + "neon.disable_logical_replication_subscribers=false", + )? 
{ + info!("updated postgresql.conf to set neon.disable_logical_replication_subscribers=false"); + } self.pg_reload_conf()?; } self.post_apply_config()?; diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index b257c8a68f..e1bdfffa54 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -129,6 +129,13 @@ pub fn write_postgres_conf( writeln!(file, "neon.extension_server_port={}", extension_server_port)?; + if spec.drop_subscriptions_before_start { + writeln!(file, "neon.disable_logical_replication_subscribers=true")?; + } else { + // be explicit about the default value + writeln!(file, "neon.disable_logical_replication_subscribers=false")?; + } + // This is essential to keep this line at the end of the file, // because it is intended to override any settings above. writeln!(file, "include_if_exists = 'compute_ctl_temp_override.conf'")?; diff --git a/compute_tools/src/spec_apply.rs b/compute_tools/src/spec_apply.rs index 7401de2e60..5ee9c5fbd8 100644 --- a/compute_tools/src/spec_apply.rs +++ b/compute_tools/src/spec_apply.rs @@ -47,7 +47,7 @@ pub enum PerDatabasePhase { DeleteDBRoleReferences, ChangeSchemaPerms, HandleAnonExtension, - DropSubscriptionsForDeletedDatabases, + DropLogicalSubscriptions, } #[derive(Clone, Debug)] @@ -58,11 +58,13 @@ pub enum ApplySpecPhase { CreateAndAlterRoles, RenameAndDeleteDatabases, CreateAndAlterDatabases, + CreateSchemaNeon, RunInEachDatabase { db: DB, subphase: PerDatabasePhase }, HandleOtherExtensions, HandleNeonExtension, CreateAvailabilityCheck, DropRoles, + FinalizeDropLogicalSubscriptions, } pub struct Operation { @@ -331,7 +333,7 @@ async fn get_operations<'a>( // NB: there could be other db states, which prevent us from dropping // the database. For example, if db is used by any active subscription // or replication slot. - // Such cases are handled in the DropSubscriptionsForDeletedDatabases + // Such cases are handled in the DropLogicalSubscriptions // phase. We do all the cleanup before actually dropping the database. 
let drop_db_query: String = format!( "DROP DATABASE IF EXISTS {} WITH (FORCE)", @@ -442,13 +444,19 @@ async fn get_operations<'a>( Ok(Box::new(operations)) } + ApplySpecPhase::CreateSchemaNeon => Ok(Box::new(once(Operation { + query: String::from("CREATE SCHEMA IF NOT EXISTS neon"), + comment: Some(String::from( + "create schema for neon extension and utils tables", + )), + }))), ApplySpecPhase::RunInEachDatabase { db, subphase } => { match subphase { - PerDatabasePhase::DropSubscriptionsForDeletedDatabases => { + PerDatabasePhase::DropLogicalSubscriptions => { match &db { DB::UserDB(db) => { let drop_subscription_query: String = format!( - include_str!("sql/drop_subscription_for_drop_dbs.sql"), + include_str!("sql/drop_subscriptions.sql"), datname_str = escape_literal(&db.name), ); @@ -666,10 +674,6 @@ async fn get_operations<'a>( } ApplySpecPhase::HandleNeonExtension => { let operations = vec![ - Operation { - query: String::from("CREATE SCHEMA IF NOT EXISTS neon"), - comment: Some(String::from("init: add schema for extension")), - }, Operation { query: String::from("CREATE EXTENSION IF NOT EXISTS neon WITH SCHEMA neon"), comment: Some(String::from( @@ -712,5 +716,9 @@ async fn get_operations<'a>( Ok(Box::new(operations)) } + ApplySpecPhase::FinalizeDropLogicalSubscriptions => Ok(Box::new(once(Operation { + query: String::from(include_str!("sql/finalize_drop_subscriptions.sql")), + comment: None, + }))), } } diff --git a/compute_tools/src/sql/drop_subscription_for_drop_dbs.sql b/compute_tools/src/sql/drop_subscriptions.sql similarity index 100% rename from compute_tools/src/sql/drop_subscription_for_drop_dbs.sql rename to compute_tools/src/sql/drop_subscriptions.sql diff --git a/compute_tools/src/sql/finalize_drop_subscriptions.sql b/compute_tools/src/sql/finalize_drop_subscriptions.sql new file mode 100644 index 0000000000..4bb291876f --- /dev/null +++ b/compute_tools/src/sql/finalize_drop_subscriptions.sql @@ -0,0 +1,21 @@ +DO $$ +BEGIN + IF NOT EXISTS( + SELECT 1 + FROM pg_catalog.pg_tables + WHERE tablename = 'drop_subscriptions_done' + AND schemaname = 'neon' + ) + THEN + CREATE TABLE neon.drop_subscriptions_done + (id serial primary key, timeline_id text); + END IF; + + -- preserve the timeline_id of the last drop_subscriptions run + -- to ensure that the cleanup of a timeline is executed only once. 
+ -- use upsert to avoid the table bloat in case of cascade branching (branch of a branch) + INSERT INTO neon.drop_subscriptions_done VALUES (1, current_setting('neon.timeline_id')) + ON CONFLICT (id) DO UPDATE + SET timeline_id = current_setting('neon.timeline_id'); +END +$$ diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index c73debae4c..ba67ffa2dd 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -1357,6 +1357,7 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res args.pg_version, mode, !args.update_catalog, + false, )?; } EndpointCmd::Start(args) => { diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index b8027abf7c..bc86d09103 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -76,6 +76,7 @@ pub struct EndpointConf { http_port: u16, pg_version: u32, skip_pg_catalog_updates: bool, + drop_subscriptions_before_start: bool, features: Vec, } @@ -143,6 +144,7 @@ impl ComputeControlPlane { pg_version: u32, mode: ComputeMode, skip_pg_catalog_updates: bool, + drop_subscriptions_before_start: bool, ) -> Result> { let pg_port = pg_port.unwrap_or_else(|| self.get_port()); let http_port = http_port.unwrap_or_else(|| self.get_port() + 1); @@ -162,6 +164,7 @@ impl ComputeControlPlane { // with this we basically test a case of waking up an idle compute, where // we also skip catalog updates in the cloud. skip_pg_catalog_updates, + drop_subscriptions_before_start, features: vec![], }); @@ -177,6 +180,7 @@ impl ComputeControlPlane { pg_port, pg_version, skip_pg_catalog_updates, + drop_subscriptions_before_start, features: vec![], })?, )?; @@ -240,6 +244,7 @@ pub struct Endpoint { // Optimizations skip_pg_catalog_updates: bool, + drop_subscriptions_before_start: bool, // Feature flags features: Vec, } @@ -291,6 +296,7 @@ impl Endpoint { tenant_id: conf.tenant_id, pg_version: conf.pg_version, skip_pg_catalog_updates: conf.skip_pg_catalog_updates, + drop_subscriptions_before_start: conf.drop_subscriptions_before_start, features: conf.features, }) } @@ -625,6 +631,7 @@ impl Endpoint { shard_stripe_size: Some(shard_stripe_size), local_proxy_config: None, reconfigure_concurrency: 1, + drop_subscriptions_before_start: self.drop_subscriptions_before_start, }; let spec_path = self.endpoint_path().join("spec.json"); std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?; diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 54d6a1d38f..b3f18dc6da 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -138,6 +138,13 @@ pub struct ComputeSpec { /// enough spare connections for reconfiguration process to succeed. #[serde(default = "default_reconfigure_concurrency")] pub reconfigure_concurrency: usize, + + /// If set to true, the compute_ctl will drop all subscriptions before starting the + /// compute. This is needed when we start an endpoint on a branch, so that child + /// would not compete with parent branch subscriptions + /// over the same replication content from publisher. + #[serde(default)] // Default false + pub drop_subscriptions_before_start: bool, } /// Feature flag to signal `compute_ctl` to enable certain experimental functionality. 
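For context on the `#[serde(default)]` attribute added above: an older spec JSON that omits the new field still deserializes, with the flag falling back to `false`. Below is a minimal standalone sketch of that behavior, using a stand-in struct rather than the real `ComputeSpec`, and assuming the `serde` and `serde_json` crates are available:

```rust
use serde::Deserialize;

// Stand-in for the spec struct; only the new flag is modeled here.
#[derive(Deserialize, Debug)]
struct SpecSketch {
    #[serde(default)] // a missing field deserializes to bool::default(), i.e. false
    drop_subscriptions_before_start: bool,
}

fn main() {
    // An "old" spec without the field still parses; the flag defaults to false.
    let old: SpecSketch = serde_json::from_str("{}").unwrap();
    assert!(!old.drop_subscriptions_before_start);

    // A spec for starting an endpoint on a new branch can opt in explicitly.
    let json = r#"{"drop_subscriptions_before_start": true}"#;
    let branch_start: SpecSketch = serde_json::from_str(json).unwrap();
    assert!(branch_start.drop_subscriptions_before_start);
}
```

This is why the flag can be introduced without breaking specs produced before the change.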
diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c
index ff08f9164d..ce2938cfd5 100644
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -19,6 +19,7 @@
 #include "access/xlogrecovery.h"
 #endif
 #include "replication/logical.h"
+#include "replication/logicallauncher.h"
 #include "replication/slot.h"
 #include "replication/walsender.h"
 #include "storage/proc.h"
@@ -434,6 +435,15 @@ _PG_init(void)
 	restore_running_xacts_callback = RestoreRunningXactsFromClog;
+	DefineCustomBoolVariable(
+							 "neon.disable_logical_replication_subscribers",
+							 "Disables incoming logical replication",
+							 NULL,
+							 &disable_logical_replication_subscribers,
+							 false,
+							 PGC_SIGHUP,
+							 0,
+							 NULL, NULL, NULL);
 	DefineCustomBoolVariable(
 							 "neon.allow_replica_misconfig",
diff --git a/test_runner/fixtures/neon_cli.py b/test_runner/fixtures/neon_cli.py
index adbd6414a7..33d422c590 100644
--- a/test_runner/fixtures/neon_cli.py
+++ b/test_runner/fixtures/neon_cli.py
@@ -523,6 +523,7 @@ class NeonLocalCli(AbstractNeonCli):
         remote_ext_config: str | None = None,
         pageserver_id: int | None = None,
         allow_multiple: bool = False,
+        create_test_user: bool = False,
         basebackup_request_tries: int | None = None,
         env: dict[str, str] | None = None,
     ) -> subprocess.CompletedProcess[str]:
@@ -544,6 +545,8 @@ class NeonLocalCli(AbstractNeonCli):
             args.extend(["--pageserver-id", str(pageserver_id)])
         if allow_multiple:
             args.extend(["--allow-multiple"])
+        if create_test_user:
+            args.extend(["--create-test-user"])
 
         res = self.raw_cli(args, extra_env_vars)
         res.check_returncode()
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index af427b92d2..388c1eb046 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -3918,6 +3918,7 @@ class Endpoint(PgProtocol, LogUtils):
         pageserver_id: int | None = None,
         safekeepers: list[int] | None = None,
         allow_multiple: bool = False,
+        create_test_user: bool = False,
         basebackup_request_tries: int | None = None,
         env: dict[str, str] | None = None,
     ) -> Self:
@@ -3939,6 +3940,7 @@ class Endpoint(PgProtocol, LogUtils):
             remote_ext_config=remote_ext_config,
             pageserver_id=pageserver_id,
             allow_multiple=allow_multiple,
+            create_test_user=create_test_user,
             basebackup_request_tries=basebackup_request_tries,
             env=env,
         )
diff --git a/test_runner/regress/test_subscriber_branching.py b/test_runner/regress/test_subscriber_branching.py
new file mode 100644
index 0000000000..645572da8e
--- /dev/null
+++ b/test_runner/regress/test_subscriber_branching.py
@@ -0,0 +1,242 @@
+from __future__ import annotations
+
+import time
+
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import NeonEnv, logical_replication_sync
+from fixtures.utils import query_scalar, wait_until
+
+
+# This test checks that branching a timeline with logical subscriptions
+# does not affect logical replication for the parent.
+# An endpoint on a new branch will drop all existing subscriptions at the start,
+# so it will not receive any changes.
+# If needed, the user can create new subscriptions on the child branch.
+def test_subscriber_branching(neon_simple_env: NeonEnv):
+    env = neon_simple_env
+    env.create_branch("publisher")
+    pub = env.endpoints.create("publisher")
+    pub.respec(
+        skip_pg_catalog_updates=False,
+        create_test_user=True,
+    )
+    pub.start(create_test_user=True)
+
+    env.create_branch("subscriber")
+    sub = env.endpoints.create("subscriber")
+    # Pass create_test_user flag to get properly filled spec.users and spec.databases fields.
+ # + # This test checks the per-database operations that happen at compute start + # and these operations are applied to the databases that are present in the spec. + sub.respec( + skip_pg_catalog_updates=False, + create_test_user=True, + ) + sub.start(create_test_user=True) + + pub.wait_for_migrations() + sub.wait_for_migrations() + + n_records = 1000 + + def check_that_changes_propagated(): + scur.execute("SELECT count(*) FROM t") + res = scur.fetchall() + assert res[0][0] == n_records + + def insert_data(pub, start): + with pub.cursor(dbname="neondb", user="test", password="pubtestpwd") as pcur: + for i in range(start, start + n_records): + pcur.execute("INSERT into t values (%s,random()*100000)", (i,)) + + # create_test_user creates a user without password + # but psycopg2 execute() requires a password + with sub.cursor() as scur: + scur.execute("ALTER USER test WITH PASSWORD 'testpwd'") + with pub.cursor() as pcur: + # Create a test user to avoid using superuser + pcur.execute("ALTER USER test WITH PASSWORD 'pubtestpwd'") + # If we don't do this, creating the subscription will fail + pub.edit_hba(["host all test 0.0.0.0/0 md5"]) + + with pub.cursor(dbname="neondb", user="test", password="pubtestpwd") as pcur: + pcur.execute("CREATE TABLE t (pk integer primary key, sk integer)") + pcur.execute("CREATE PUBLICATION pub FOR TABLE t") + + with sub.cursor(dbname="neondb", user="test", password="testpwd") as scur: + scur.execute("CREATE TABLE t (pk integer primary key, sk integer)") + pub_conn = ( + f"host=localhost port={pub.pg_port} dbname=neondb user=test password=pubtestpwd" + ) + query = f"CREATE SUBSCRIPTION sub CONNECTION '{pub_conn}' PUBLICATION pub" + scur.execute(query) + time.sleep(2) # let initial table sync complete + + insert_data(pub, 0) + + with sub.cursor(dbname="neondb", user="test", password="testpwd") as scur: + wait_until(check_that_changes_propagated) + latest_end_lsn = query_scalar( + scur, "select latest_end_lsn from pg_catalog.pg_stat_subscription; " + ) + last_insert_lsn = query_scalar(scur, "select pg_current_wal_insert_lsn();") + + log.info(f"latest_end_lsn = {latest_end_lsn}") + log.info(f"last_insert_lsn = {last_insert_lsn}") + + # stop the parent subscriber so that it doesn't interfere with the test + sub.stop() + + # 1. 
good scenario: + # create subscriber_child_1 + # it will not get changes from publisher, because drop_subscriptions_before_start is set to True + sub_child_1_timeline_id = env.create_branch( + "subscriber_child_1", + ancestor_branch_name="subscriber", + ancestor_start_lsn=last_insert_lsn, + ) + sub_child_1 = env.endpoints.create("subscriber_child_1") + # Pass drop_subscriptions_before_start flag + sub_child_1.respec( + skip_pg_catalog_updates=False, + create_test_user=True, + drop_subscriptions_before_start=True, + ) + sub_child_1.start(create_test_user=True) + + # ensure that subscriber_child_1 sees all the data + with sub_child_1.cursor(dbname="neondb", user="test", password="testpwd") as scur: + scur.execute("SELECT count(*) FROM t") + res = scur.fetchall() + assert res[0][0] == n_records + + # ensure that there are no subscriptions in this database + scur.execute("SELECT 1 FROM pg_catalog.pg_subscription WHERE subname = 'sub'") + assert len(scur.fetchall()) == 0 + + # ensure that drop_subscriptions_done happened on this timeline + with sub_child_1.cursor() as scur_postgres: + scur_postgres.execute("SELECT timeline_id from neon.drop_subscriptions_done") + res = scur_postgres.fetchall() + assert len(res) == 1 + assert str(sub_child_1_timeline_id) == res[0][0] + + old_n_records = n_records + # insert more data on publisher + insert_data(pub, n_records) + n_records += n_records + + pcur.execute("SELECT count(*) FROM t") + res = pcur.fetchall() + assert res[0][0] == n_records + + # ensure that subscriber_child_1 doesn't see the new data + with sub_child_1.cursor(dbname="neondb", user="test", password="testpwd") as scur: + scur.execute("SELECT count(*) FROM t") + res = scur.fetchall() + assert res[0][0] == old_n_records + + # reenable logical replication on subscriber_child_1 + # using new publication + # ensure that new publication works as expected + with sub_child_1.cursor(dbname="neondb", user="test", password="testpwd") as scur: + scur.execute("TRUNCATE t") + + # create new subscription + # with new pub name + pcur.execute("CREATE PUBLICATION pub_new FOR TABLE t") + query = f"CREATE SUBSCRIPTION sub_new CONNECTION '{pub_conn}' PUBLICATION pub_new" + scur.execute(query) + + wait_until(check_that_changes_propagated) + + scur.execute("SELECT count(*) FROM t") + res = scur.fetchall() + assert res[0][0] == n_records + + # ensure that new publication works as expected after compute restart + # first restart with drop_subscriptions_before_start=True + # to emulate the case when compute restarts within the VM with stale spec + sub_child_1.stop() + sub_child_1.respec( + skip_pg_catalog_updates=False, + create_test_user=True, + drop_subscriptions_before_start=True, + ) + sub_child_1.start(create_test_user=True) + + with sub_child_1.cursor(dbname="neondb", user="test", password="testpwd") as scur: + # ensure that even though the flag is set, we didn't drop new subscription + scur.execute("SELECT 1 FROM pg_catalog.pg_subscription WHERE subname = 'sub_new'") + assert len(scur.fetchall()) == 1 + + # ensure that drop_subscriptions_done happened on this timeline + with sub_child_1.cursor() as scur_postgres: + scur_postgres.execute("SELECT timeline_id from neon.drop_subscriptions_done") + res = scur_postgres.fetchall() + assert len(res) == 1 + assert str(sub_child_1_timeline_id) == res[0][0] + + sub_child_1.stop() + sub_child_1.respec( + skip_pg_catalog_updates=False, + create_test_user=True, + drop_subscriptions_before_start=False, + ) + sub_child_1.start(create_test_user=True) + + # insert more data 
on publisher + insert_data(pub, n_records) + n_records += n_records + with sub_child_1.cursor(dbname="neondb", user="test", password="testpwd") as scur: + # ensure that there is a subscriptions in this database + scur.execute("SELECT 1 FROM pg_catalog.pg_subscription WHERE subname = 'sub_new'") + assert len(scur.fetchall()) == 1 + + wait_until(check_that_changes_propagated) + scur.execute("SELECT count(*) FROM t") + res = scur.fetchall() + assert res[0][0] == n_records + + # ensure that drop_subscriptions_done happened on this timeline + with sub_child_1.cursor() as scur_postgres: + scur_postgres.execute("SELECT timeline_id from neon.drop_subscriptions_done") + res = scur_postgres.fetchall() + assert len(res) == 1 + assert str(sub_child_1_timeline_id) == res[0][0] + + # wake the sub and ensure that it catches up with the new data + sub.start(create_test_user=True) + with sub.cursor(dbname="neondb", user="test", password="testpwd") as scur: + logical_replication_sync(sub, pub) + wait_until(check_that_changes_propagated) + scur.execute("SELECT count(*) FROM t") + res = scur.fetchall() + assert res[0][0] == n_records + + # test that we can create a branch of a branch + sub_child_2_timeline_id = env.create_branch( + "subscriber_child_2", + ancestor_branch_name="subscriber_child_1", + ) + sub_child_2 = env.endpoints.create("subscriber_child_2") + # Pass drop_subscriptions_before_start flag + sub_child_2.respec( + skip_pg_catalog_updates=False, + drop_subscriptions_before_start=True, + ) + sub_child_2.start(create_test_user=True) + + # ensure that subscriber_child_2 does not inherit subscription from child_1 + with sub_child_2.cursor(dbname="neondb", user="test", password="testpwd") as scur: + # ensure that there are no subscriptions in this database + scur.execute("SELECT count(*) FROM pg_catalog.pg_subscription") + res = scur.fetchall() + assert res[0][0] == 0 + + # ensure that drop_subscriptions_done happened on this timeline + with sub_child_2.cursor() as scur_postgres: + scur_postgres.execute("SELECT timeline_id from neon.drop_subscriptions_done") + res = scur_postgres.fetchall() + assert len(res) == 1 + assert str(sub_child_2_timeline_id) == res[0][0] diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 46082f2088..5f3b3afdd7 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 46082f20884f087a2d974b33ac65d63af26142bd +Subproject commit 5f3b3afdd7c24b4a0fd63ecb3288fab472fcc633 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index dd0b28d6fb..935292e883 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit dd0b28d6fbad39e227f3b77296fcca879af8b3a9 +Subproject commit 935292e883298187f112db6e9c7f765037ddcf64 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index d674efd776..061d563779 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit d674efd776f59d78e8fa1535bd2f95c3e6984fca +Subproject commit 061d56377961ba56998e41b7d5d5e975919ad301 diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index a8dd6e779d..4276717f6e 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit a8dd6e779dde907778006adb436b557ad652fb97 +Subproject commit 4276717f6e91023e504de355f4f21d4824074de8 diff --git a/vendor/revisions.json b/vendor/revisions.json index c899dbaa5a..a104be8ae0 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,18 +1,18 @@ { "v17": [ "17.2", - "a8dd6e779dde907778006adb436b557ad652fb97" + 
"4276717f6e91023e504de355f4f21d4824074de8" ], "v16": [ "16.6", - "d674efd776f59d78e8fa1535bd2f95c3e6984fca" + "061d56377961ba56998e41b7d5d5e975919ad301" ], "v15": [ "15.10", - "dd0b28d6fbad39e227f3b77296fcca879af8b3a9" + "935292e883298187f112db6e9c7f765037ddcf64" ], "v14": [ "14.15", - "46082f20884f087a2d974b33ac65d63af26142bd" + "5f3b3afdd7c24b4a0fd63ecb3288fab472fcc633" ] } From 3702ec889faa37859e48a7d170fec59dc76b720b Mon Sep 17 00:00:00 2001 From: Mikhail Kot Date: Thu, 23 Jan 2025 13:22:31 +0000 Subject: [PATCH 248/583] Enable postgres_fdw (#10426) Update compute image to include postgres_fdw #3720 --- compute/compute-node.Dockerfile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 706c947008..a80c701b45 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -67,6 +67,9 @@ RUN cd postgres && \ # Enable some of contrib extensions echo 'trusted = true' >> /usr/local/pgsql/share/extension/autoinc.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/dblink.control && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgres_fdw.control && \ + file=/usr/local/pgsql/share/extension/postgres_fdw--1.0.sql && [ -e $file ] && \ + echo 'GRANT USAGE ON FOREIGN DATA WRAPPER postgres_fdw TO neon_superuser;' >> $file && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/bloom.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/earthdistance.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/insert_username.control && \ From b6c0f66619f0117480dd550b99d6cc57ef0192b7 Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Thu, 23 Jan 2025 15:52:07 +0100 Subject: [PATCH 249/583] CI(autocomment): add the lfc state (#10121) ## Problem Currently, the report does not contain the LFC state of the failed tests. ## Summary of changes Added the LFC state to the link to the allure report. 
--------- Co-authored-by: Alexander Bayandin --- scripts/comment-test-report.js | 11 +++++++++-- scripts/ingest_regress_test_result-new-format.py | 2 +- test_runner/fixtures/parametrize.py | 4 +++- 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/scripts/comment-test-report.js b/scripts/comment-test-report.js index e8e0b3c23a..96a0ea3267 100755 --- a/scripts/comment-test-report.js +++ b/scripts/comment-test-report.js @@ -84,6 +84,12 @@ const parseReportJson = async ({ reportJsonUrl, fetch }) => { } else { arch = "unknown" } + let lfcState = "" + if (test.parameters.includes("'with-lfc'")) { + lfcState = "with-lfc" + } else { + lfcState = "without-lfc" + } // Removing build type and PostgreSQL version from the test name to make it shorter const testName = test.name.replace(new RegExp(`${buildType}-pg${pgVersion}-?`), "").replace("[]", "") @@ -91,6 +97,7 @@ const parseReportJson = async ({ reportJsonUrl, fetch }) => { test.pgVersion = pgVersion test.buildType = buildType test.arch = arch + test.lfcState = lfcState if (test.status === "passed") { passedTests[pgVersion][testName].push(test) @@ -157,7 +164,7 @@ const reportSummary = async (params) => { const links = [] for (const test of tests) { const allureLink = `${reportUrl}#suites/${test.parentUid}/${test.uid}` - links.push(`[${test.buildType}-${test.arch}](${allureLink})`) + links.push(`[${test.buildType}-${test.arch}-${test.lfcState}](${allureLink})`) } summary += `- \`${testName}\`: ${links.join(", ")}\n` } @@ -188,7 +195,7 @@ const reportSummary = async (params) => { const links = [] for (const test of tests) { const allureLink = `${reportUrl}#suites/${test.parentUid}/${test.uid}/retries` - links.push(`[${test.buildType}-${test.arch}](${allureLink})`) + links.push(`[${test.buildType}-${test.arch}-${test.lfcState}](${allureLink})`) } summary += `- \`${testName}\`: ${links.join(", ")}\n` } diff --git a/scripts/ingest_regress_test_result-new-format.py b/scripts/ingest_regress_test_result-new-format.py index 064c516718..ad2baf56bb 100644 --- a/scripts/ingest_regress_test_result-new-format.py +++ b/scripts/ingest_regress_test_result-new-format.py @@ -134,7 +134,7 @@ def ingest_test_result( if p["name"].startswith("__") } arch = parameters.get("arch", "UNKNOWN").strip("'") - lfc = parameters.get("lfc", "False") == "True" + lfc = parameters.get("lfc", "without-lfc").strip("'") == "with-lfc" build_type, pg_version, unparametrized_name = parse_test_name(test["name"]) labels = {label["name"]: label["value"] for label in test["labels"]} diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py index 3404c16f55..1acb1af23b 100644 --- a/test_runner/fixtures/parametrize.py +++ b/test_runner/fixtures/parametrize.py @@ -121,6 +121,8 @@ def pytest_runtest_makereport(*args, **kwargs): }.get(os.uname().machine, "UNKNOWN") arch = os.getenv("RUNNER_ARCH", uname_m) allure.dynamic.parameter("__arch", arch) - allure.dynamic.parameter("__lfc", os.getenv("USE_LFC") != "false") + allure.dynamic.parameter( + "__lfc", "with-lfc" if os.getenv("USE_LFC") != "false" else "without-lfc" + ) yield From ca6d72ba2a54aeb6041af26f4baeaf0e46d5c01b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 23 Jan 2025 17:43:04 +0100 Subject: [PATCH 250/583] Increase reconciler timeout after shard split (#10490) Sometimes, especially when the host running the tests is overloaded, we can run into reconcile timeouts in `test_timeline_ancestor_detach_idempotent_success`, making the test flaky. 
By increasing the timeouts from 30 seconds to 120 seconds, we can address the flakiness. Fixes #10464 --- test_runner/regress/test_timeline_detach_ancestor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index 5234d8278f..612a767480 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -607,7 +607,7 @@ def test_timeline_ancestor_detach_idempotent_success( if shards_after > 1: # FIXME: should this be in the neon_env_builder.init_start? - env.storage_controller.reconcile_until_idle() + env.storage_controller.reconcile_until_idle(timeout_secs=120) client = env.storage_controller.pageserver_api() else: client = env.pageserver.http_client() @@ -636,7 +636,7 @@ def test_timeline_ancestor_detach_idempotent_success( # Do a shard split # This is a reproducer for https://github.com/neondatabase/neon/issues/9667 env.storage_controller.tenant_shard_split(env.initial_tenant, shards_after) - env.storage_controller.reconcile_until_idle() + env.storage_controller.reconcile_until_idle(timeout_secs=120) first_reparenting_response = client.detach_ancestor(env.initial_tenant, first_branch) assert set(first_reparenting_response) == {reparented1, reparented2} From 616648258967ac59b5f8e3f83e0b5091e933ff86 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Thu, 23 Jan 2025 21:47:20 +0100 Subject: [PATCH 251/583] feat(compute): Automatically create release PRs (#10495) We've finally transitioned to using a separate `release-compute` branch. Now, we can finally automatically create release PRs on Fri and release them during the following week. Part of neondatabase/cloud#11698 --- .github/workflows/release.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 3c1af1d9c6..919846ce44 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -3,8 +3,9 @@ name: Create Release Branch on: schedule: # It should be kept in sync with if-condition in jobs - - cron: '0 6 * * FRI' # Storage release - cron: '0 6 * * THU' # Proxy release + - cron: '0 6 * * FRI' # Storage release + - cron: '0 7 * * FRI' # Compute release workflow_dispatch: inputs: create-storage-release-branch: @@ -55,7 +56,7 @@ jobs: ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }} create-compute-release-branch: - if: inputs.create-compute-release-branch + if: ${{ github.event.schedule == '0 7 * * FRI' || inputs.create-compute-release-branch }} permissions: contents: write From 8d47a60de27ef8163fdc63cdc91cf3e807c8a14e Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Thu, 23 Jan 2025 16:54:44 -0500 Subject: [PATCH 252/583] fix(pageserver): handle dup layers during gc-compaction (#10430) ## Problem If gc-compaction decides to rewrite an image layer, it will now cause index_part to lose reference to that layer. In details, * Assume there's only one image layer of key 0000...AAAA at LSN 0x100 and generation 0xA in the system. * gc-compaction kicks in at gc-horizon 0x100, and then produce 0000...AAAA at LSN 0x100 and generation 0xB. * It submits a compaction result update into the index part that unlinks 0000-AAAA-100-A and adds 0000-AAAA-100-B On the remote storage / local disk side, this is fine -- it unlinks things correctly and uploads the new file. 
However, the `index_part.json` itself doesn't record generations. The buggy procedure is as follows: 1. upload the new file 2. update the index part to remove the old file and add the new file 3. remove the new file Therefore, the correct update result process for gc-compaction should be as follows: * When modifying the layer map, delete the old one and upload the new one. * When updating the index, uploading the new one in the index without deleting the old one. ## Summary of changes * Modify `finish_gc_compaction` to correctly order insertions and deletions. * Update the way gc-compaction uploads the layer files. * Add new tests. --------- Signed-off-by: Alex Chi Z --- pageserver/src/tenant/timeline/compaction.rs | 46 +++- .../src/tenant/timeline/layer_manager.rs | 39 ++- test_runner/regress/test_compaction.py | 239 +++++++++++++++++- 3 files changed, 309 insertions(+), 15 deletions(-) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 4d5dc2d8a9..28c3381318 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -436,12 +436,14 @@ impl KeyHistoryRetention { if dry_run { return true; } - let guard = tline.layers.read().await; - if !guard.contains_key(key) { - return false; + let layer_generation; + { + let guard = tline.layers.read().await; + if !guard.contains_key(key) { + return false; + } + layer_generation = guard.get_from_key(key).metadata().generation; } - let layer_generation = guard.get_from_key(key).metadata().generation; - drop(guard); if layer_generation == tline.generation { info!( key=%key, @@ -2138,6 +2140,11 @@ impl Timeline { self.get_gc_compaction_watermark() }; + if compact_below_lsn == Lsn::INVALID { + tracing::warn!("no layers to compact with gc: gc_cutoff not generated yet, skipping gc bottom-most compaction"); + return Ok(vec![]); + } + // Split compaction job to about 4GB each const GC_COMPACT_MAX_SIZE_MB: u64 = 4 * 1024; let sub_compaction_max_job_size_mb = @@ -2338,6 +2345,11 @@ impl Timeline { // each of the retain_lsn. Therefore, if the user-provided `compact_lsn_range.end` is larger than the real gc cutoff, we will use // the real cutoff. let mut gc_cutoff = if compact_lsn_range.end == Lsn::MAX { + if real_gc_cutoff == Lsn::INVALID { + // If the gc_cutoff is not generated yet, we should not compact anything. + tracing::warn!("no layers to compact with gc: gc_cutoff not generated yet, skipping gc bottom-most compaction"); + return Ok(()); + } real_gc_cutoff } else { compact_lsn_range.end @@ -2869,7 +2881,7 @@ impl Timeline { "produced {} delta layers and {} image layers, {} layers are kept", produced_delta_layers_len, produced_image_layers_len, - layer_selection.len() + keep_layers.len() ); // Step 3: Place back to the layer map. @@ -2915,8 +2927,28 @@ impl Timeline { // be batched into `schedule_compaction_update`. let disk_consistent_lsn = self.disk_consistent_lsn.load(); self.schedule_uploads(disk_consistent_lsn, None)?; + // If a layer gets rewritten throughout gc-compaction, we need to keep that layer only in `compact_to` instead + // of `compact_from`. 
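+        // Note: index_part.json does not record layer generations, so scheduling a deletion for the old
+        // generation of a rewritten key would also drop the index reference to the newly uploaded
+        // generation. Rewritten layers are therefore filtered out of `compact_from` below, and the index
+        // update only adds the new generation without unlinking the old key.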
+ let compact_from = { + let mut compact_from = Vec::new(); + let mut compact_to_set = HashMap::new(); + for layer in &compact_to { + compact_to_set.insert(layer.layer_desc().key(), layer); + } + for layer in &layer_selection { + if let Some(to) = compact_to_set.get(&layer.layer_desc().key()) { + tracing::info!( + "skipping delete {} because found same layer key at different generation {}", + layer, to + ); + } else { + compact_from.push(layer.clone()); + } + } + compact_from + }; self.remote_client - .schedule_compaction_update(&layer_selection, &compact_to)?; + .schedule_compaction_update(&compact_from, &compact_to)?; drop(gc_lock); diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index 3888e7f86a..f1cef7778c 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -337,16 +337,45 @@ impl OpenLayerManager { compact_to: &[ResidentLayer], metrics: &TimelineMetrics, ) { - // We can simply reuse compact l0 logic. Use a different function name to indicate a different type of layer map modification. - self.finish_compact_l0(compact_from, compact_to, metrics) + // gc-compaction could contain layer rewrites. We need to delete the old layers and insert the new ones. + + // Match the old layers with the new layers + let mut add_layers = HashMap::new(); + let mut rewrite_layers = HashMap::new(); + let mut drop_layers = HashMap::new(); + for layer in compact_from { + drop_layers.insert(layer.layer_desc().key(), layer.clone()); + } + for layer in compact_to { + if let Some(old_layer) = drop_layers.remove(&layer.layer_desc().key()) { + rewrite_layers.insert(layer.layer_desc().key(), (old_layer.clone(), layer.clone())); + } else { + add_layers.insert(layer.layer_desc().key(), layer.clone()); + } + } + let add_layers = add_layers.values().cloned().collect::>(); + let drop_layers = drop_layers.values().cloned().collect::>(); + let rewrite_layers = rewrite_layers.values().cloned().collect::>(); + + self.rewrite_layers_inner(&rewrite_layers, &drop_layers, &add_layers, metrics); } /// Called post-compaction when some previous generation image layers were trimmed. 
- pub(crate) fn rewrite_layers( + pub fn rewrite_layers( &mut self, rewrite_layers: &[(Layer, ResidentLayer)], drop_layers: &[Layer], metrics: &TimelineMetrics, + ) { + self.rewrite_layers_inner(rewrite_layers, drop_layers, &[], metrics); + } + + fn rewrite_layers_inner( + &mut self, + rewrite_layers: &[(Layer, ResidentLayer)], + drop_layers: &[Layer], + add_layers: &[ResidentLayer], + metrics: &TimelineMetrics, ) { let mut updates = self.layer_map.batch_update(); for (old_layer, new_layer) in rewrite_layers { @@ -382,6 +411,10 @@ impl OpenLayerManager { for l in drop_layers { Self::delete_historic_layer(l, &mut updates, &mut self.layer_fmgr); } + for l in add_layers { + Self::insert_historic_layer(l.as_ref().clone(), &mut updates, &mut self.layer_fmgr); + metrics.record_new_file_metrics(l.layer_desc().file_size); + } updates.flush(); } diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index fde26e1533..2edfc884ad 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -1,6 +1,8 @@ from __future__ import annotations import json +import math +import random import time from enum import StrEnum @@ -128,11 +130,6 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder, with_b } env = neon_env_builder.init_start(initial_tenant_conf=SMOKE_CONF) - env.pageserver.allowed_errors.append( - r".*failed to acquire partition lock during gc-compaction.*" - ) - env.pageserver.allowed_errors.append(r".*repartition() called concurrently.*") - tenant_id = env.initial_tenant timeline_id = env.initial_timeline @@ -147,6 +144,10 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder, with_b log.info("Writing initial data ...") workload.write_rows(row_count, env.pageserver.id) + ps_http.timeline_gc( + tenant_id, timeline_id, None + ) # Force refresh gc info to have gc_cutoff generated + child_workloads: list[Workload] = [] for i in range(1, churn_rounds + 1): @@ -198,6 +199,230 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder, with_b ps_http.timeline_gc(tenant_id, timeline_id, None) +@pytest.mark.parametrize( + "compaction_mode", + ["before_restart", "after_restart"], +) +def test_pageserver_gc_compaction_idempotent( + neon_env_builder: NeonEnvBuilder, compaction_mode: str +): + """ + Do gc-compaction twice without writing any new data and see if anything breaks. + We run this test in two modes: + - before_restart: run two gc-compactions before pageserver restart + - after_restart: run one gc-compaction before and one after pageserver restart + """ + SMOKE_CONF = { + # Run both gc and gc-compaction. + "gc_period": "5s", + "compaction_period": "5s", + # No PiTR interval and small GC horizon + "pitr_interval": "0s", + "gc_horizon": 1024, + "lsn_lease_length": "0s", + } + + env = neon_env_builder.init_start(initial_tenant_conf=SMOKE_CONF) + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + # Only in testing mode: the warning is expected because we rewrite a layer file of different generations. + # We could potentially patch the sanity-check code to not emit the warning in the future. 
+ env.pageserver.allowed_errors.append(".*was unlinked but was not dangling.*") + + row_count = 10000 + + ps_http = env.pageserver.http_client() + + workload = Workload(env, tenant_id, timeline_id) + workload.init(env.pageserver.id) + + workload.write_rows(row_count, env.pageserver.id) + + child_workloads: list[Workload] = [] + + def compaction_finished(): + queue_depth = len(ps_http.timeline_compact_info(tenant_id, timeline_id)) + assert queue_depth == 0 + + workload.churn_rows(row_count, env.pageserver.id) + env.create_branch("child_branch") # so that we have a retain_lsn + workload.churn_rows(row_count, env.pageserver.id) + # compact 3 times if mode is before_restart + n_compactions = 3 if compaction_mode == "before_restart" else 1 + for _ in range(n_compactions): + # Force refresh gc info to have gc_cutoff generated + ps_http.timeline_gc(tenant_id, timeline_id, None) + ps_http.timeline_compact( + tenant_id, + timeline_id, + enhanced_gc_bottom_most_compaction=True, + body={ + "scheduled": True, + "sub_compaction": True, + "compact_key_range": { + "start": "000000000000000000000000000000000000", + "end": "030000000000000000000000000000000000", + }, + "sub_compaction_max_job_size_mb": 16, + }, + ) + wait_until(compaction_finished, timeout=60) + if compaction_mode == "after_restart": + env.pageserver.restart(True) + ps_http.timeline_gc( + tenant_id, timeline_id, None + ) # Force refresh gc info to have gc_cutoff generated + for _ in range(3): + ps_http.timeline_compact( + tenant_id, + timeline_id, + enhanced_gc_bottom_most_compaction=True, + body={ + "scheduled": True, + "sub_compaction": True, + "compact_key_range": { + "start": "000000000000000000000000000000000000", + "end": "030000000000000000000000000000000000", + }, + "sub_compaction_max_job_size_mb": 16, + }, + ) + wait_until(compaction_finished, timeout=60) + + # ensure gc_compaction is scheduled and it's actually running (instead of skipping due to no layers picked) + env.pageserver.assert_log_contains( + "scheduled_compact_timeline.*picked .* layers for compaction" + ) + + # ensure we hit the duplicated layer key warning at least once: we did two compactions consecutively, + # and the second one should have hit the duplicated layer key warning. + if compaction_mode == "before_restart": + env.pageserver.assert_log_contains("duplicated layer key in the same generation") + else: + env.pageserver.assert_log_contains("same layer key at different generation") + + log.info("Validating at workload end ...") + workload.validate(env.pageserver.id) + for child_workload in child_workloads: + log.info(f"Validating at branch {child_workload.branch_name}") + child_workload.validate(env.pageserver.id) + + # Run a legacy compaction+gc to ensure gc-compaction can coexist with legacy compaction. + ps_http.timeline_checkpoint(tenant_id, timeline_id, wait_until_uploaded=True) + ps_http.timeline_gc(tenant_id, timeline_id, None) + + +@skip_in_debug_build("only run with release build") +def test_pageserver_gc_compaction_interrupt(neon_env_builder: NeonEnvBuilder): + """ + Force interrupt a gc-compaction and see if anything breaks. + """ + SMOKE_CONF = { + # Run both gc and gc-compaction. 
+ "gc_period": "5s", + "compaction_period": "5s", + # No PiTR interval and small GC horizon + "pitr_interval": "0s", + "gc_horizon": "1024", + "lsn_lease_length": "0s", + } + + env = neon_env_builder.init_start(initial_tenant_conf=SMOKE_CONF) + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + # Only in testing mode: the warning is expected because we rewrite a layer file of different generations. + # We could potentially patch the sanity-check code to not emit the warning in the future. + env.pageserver.allowed_errors.append(".*was unlinked but was not dangling.*") + + row_count = 10000 + churn_rounds = 20 + + ps_http = env.pageserver.http_client() + + workload = Workload(env, tenant_id, timeline_id) + workload.init(env.pageserver.id) + + log.info("Writing initial data ...") + workload.write_rows(row_count, env.pageserver.id) + + def compaction_finished(): + queue_depth = len(ps_http.timeline_compact_info(tenant_id, timeline_id)) + assert queue_depth == 0 + + expected_compaction_time_seconds = 5.0 + ps_http.timeline_gc( + tenant_id, timeline_id, None + ) # Force refresh gc info to have gc_cutoff generated + for i in range(1, churn_rounds + 1): + log.info(f"Running churn round {i}/{churn_rounds} ...") + workload.churn_rows(row_count, env.pageserver.id) + ps_http.timeline_compact( + tenant_id, + timeline_id, + enhanced_gc_bottom_most_compaction=True, + body={ + "scheduled": True, + "sub_compaction": True, + "compact_key_range": { + "start": "000000000000000000000000000000000000", + "end": "030000000000000000000000000000000000", + }, + "sub_compaction_max_job_size_mb": 16, + }, + ) + # sleep random seconds between 0 and max(compaction_time); if the result is 0, wait until the compaction is complete + # This would hopefully trigger the restart at different periods of the compaction: + # - while we are doing the compaction + # - while we finished the compaction but not yet uploaded the metadata + # - after we uploaded the metadata + time_to_sleep = random.randint(0, max(5, math.ceil(expected_compaction_time_seconds))) + if time_to_sleep == 0 or i == 1: + start = time.time() + wait_until(compaction_finished, timeout=60) + end = time.time() + expected_compaction_time_seconds = end - start + log.info( + f"expected_compaction_time_seconds updated to {expected_compaction_time_seconds} seconds" + ) + else: + time.sleep(time_to_sleep) + env.pageserver.restart(True) + ps_http.timeline_gc( + tenant_id, timeline_id, None + ) # Force refresh gc info to have gc_cutoff generated + ps_http.timeline_compact( + tenant_id, + timeline_id, + enhanced_gc_bottom_most_compaction=True, + body={ + "scheduled": True, + "sub_compaction": True, + "compact_key_range": { + "start": "000000000000000000000000000000000000", + "end": "030000000000000000000000000000000000", + }, + "sub_compaction_max_job_size_mb": 16, + }, + ) + workload.validate(env.pageserver.id) + + wait_until(compaction_finished, timeout=60) + + # ensure gc_compaction is scheduled and it's actually running (instead of skipping due to no layers picked) + env.pageserver.assert_log_contains( + "scheduled_compact_timeline.*picked .* layers for compaction" + ) + + log.info("Validating at workload end ...") + workload.validate(env.pageserver.id) + + # Run a legacy compaction+gc to ensure gc-compaction can coexist with legacy compaction. + ps_http.timeline_checkpoint(tenant_id, timeline_id, wait_until_uploaded=True) + ps_http.timeline_gc(tenant_id, timeline_id, None) + + # Stripe sizes in number of pages. 
TINY_STRIPES = 16 LARGE_STRIPES = 32768 @@ -238,7 +463,9 @@ def test_sharding_compaction( "pitr_interval": "0s", # disable background compaction and GC. We invoke it manually when we want it to happen. "gc_period": "0s", + "gc_horizon": f"{128 * 1024}", "compaction_period": "0s", + "lsn_lease_length": "0s", # create image layers eagerly: we want to exercise image layer creation in this test. "image_creation_threshold": "1", "image_layer_creation_check_threshold": 0, @@ -313,6 +540,8 @@ def test_sharding_compaction( for shard in env.storage_controller.locate(tenant_id): pageserver = env.get_pageserver(shard["node_id"]) tenant_shard_id = shard["shard_id"] + # Force refresh gc info to have gc_cutoff generated + pageserver.http_client().timeline_gc(tenant_shard_id, timeline_id, None) pageserver.http_client().timeline_compact( tenant_shard_id, timeline_id, From 9e55d798036d646cbc27cd0bb2c849ed2e48ce85 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Fri, 24 Jan 2025 09:35:35 +0100 Subject: [PATCH 253/583] Reapply "pageserver: revert flush backpressure" (#10270) (#10402) This reapplies #10135. Just removing this flush backpressure without further mitigations caused read amp increases during bulk ingestion (predictably), so it was reverted. We will replace it by compaction-based backpressure. ## Problem In #8550, we made the flush loop wait for uploads after every layer. This was to avoid unbounded buildup of uploads, and to reduce compaction debt. However, the approach has several problems: * It prevents upload parallelism. * It prevents flush and upload pipelining. * It slows down ingestion even when there is no need to backpressure. * It does not directly backpressure based on compaction debt and read amplification. We will instead implement compaction-based backpressure in a PR immediately following this removal (#5415). Touches #5415. Touches #10095. ## Summary of changes Remove waiting on the upload queue in the flush loop. 
--- pageserver/src/metrics.rs | 25 +---------- pageserver/src/tenant/timeline.rs | 38 ++++------------- test_runner/fixtures/metrics.py | 1 - test_runner/regress/test_branching.py | 13 ++---- test_runner/regress/test_remote_storage.py | 48 ---------------------- 5 files changed, 13 insertions(+), 112 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 02467cb6f7..985614b6cf 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -3,7 +3,7 @@ use metrics::{ register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec, register_int_counter, register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge, register_uint_gauge_vec, - Counter, CounterVec, Gauge, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair, + Counter, CounterVec, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair, IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec, }; use once_cell::sync::Lazy; @@ -395,15 +395,6 @@ pub(crate) static WAIT_LSN_TIME: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -static FLUSH_WAIT_UPLOAD_TIME: Lazy = Lazy::new(|| { - register_gauge_vec!( - "pageserver_flush_wait_upload_seconds", - "Time spent waiting for preceding uploads during layer flush", - &["tenant_id", "shard_id", "timeline_id"] - ) - .expect("failed to define a metric") -}); - static LAST_RECORD_LSN: Lazy = Lazy::new(|| { register_int_gauge_vec!( "pageserver_last_record_lsn", @@ -2575,7 +2566,6 @@ pub(crate) struct TimelineMetrics { shard_id: String, timeline_id: String, pub flush_time_histo: StorageTimeMetrics, - pub flush_wait_upload_time_gauge: Gauge, pub compact_time_histo: StorageTimeMetrics, pub create_images_time_histo: StorageTimeMetrics, pub logical_size_histo: StorageTimeMetrics, @@ -2621,9 +2611,6 @@ impl TimelineMetrics { &shard_id, &timeline_id, ); - let flush_wait_upload_time_gauge = FLUSH_WAIT_UPLOAD_TIME - .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) - .unwrap(); let compact_time_histo = StorageTimeMetrics::new( StorageTimeOperation::Compact, &tenant_id, @@ -2769,7 +2756,6 @@ impl TimelineMetrics { shard_id, timeline_id, flush_time_histo, - flush_wait_upload_time_gauge, compact_time_histo, create_images_time_histo, logical_size_histo, @@ -2819,14 +2805,6 @@ impl TimelineMetrics { self.resident_physical_size_gauge.get() } - pub(crate) fn flush_wait_upload_time_gauge_add(&self, duration: f64) { - self.flush_wait_upload_time_gauge.add(duration); - crate::metrics::FLUSH_WAIT_UPLOAD_TIME - .get_metric_with_label_values(&[&self.tenant_id, &self.shard_id, &self.timeline_id]) - .unwrap() - .add(duration); - } - pub(crate) fn shutdown(&self) { let was_shutdown = self .shutdown @@ -2844,7 +2822,6 @@ impl TimelineMetrics { let shard_id = &self.shard_id; let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = DISK_CONSISTENT_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]); - let _ = FLUSH_WAIT_UPLOAD_TIME.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = STANDBY_HORIZON.remove_label_values(&[tenant_id, shard_id, timeline_id]); { RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get()); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 5d348ac474..56f61abc45 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -150,19 +150,15 @@ use 
self::layer_manager::LayerManager; use self::logical_size::LogicalSize; use self::walreceiver::{WalReceiver, WalReceiverConf}; +use super::config::TenantConf; +use super::remote_timeline_client::index::IndexPart; +use super::remote_timeline_client::RemoteTimelineClient; +use super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline}; +use super::storage_layer::{LayerFringe, LayerVisibilityHint, ReadableLayer}; +use super::upload_queue::NotInitialized; +use super::GcError; use super::{ - config::TenantConf, storage_layer::LayerVisibilityHint, upload_queue::NotInitialized, - MaybeOffloaded, -}; -use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf}; -use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe}; -use super::{ - remote_timeline_client::RemoteTimelineClient, remote_timeline_client::WaitCompletionError, - storage_layer::ReadableLayer, -}; -use super::{ - secondary::heatmap::{HeatMapLayer, HeatMapTimeline}, - GcError, + debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf, MaybeOffloaded, }; #[cfg(test)] @@ -3886,24 +3882,6 @@ impl Timeline { // release lock on 'layers' }; - // Backpressure mechanism: wait with continuation of the flush loop until we have uploaded all layer files. - // This makes us refuse ingest until the new layers have been persisted to the remote - let start = Instant::now(); - self.remote_client - .wait_completion() - .await - .map_err(|e| match e { - WaitCompletionError::UploadQueueShutDownOrStopped - | WaitCompletionError::NotInitialized( - NotInitialized::ShuttingDown | NotInitialized::Stopped, - ) => FlushLayerError::Cancelled, - WaitCompletionError::NotInitialized(NotInitialized::Uninitialized) => { - FlushLayerError::Other(anyhow!(e).into()) - } - })?; - let duration = start.elapsed().as_secs_f64(); - self.metrics.flush_wait_upload_time_gauge_add(duration); - // FIXME: between create_delta_layer and the scheduling of the upload in `update_metadata_file`, // a compaction can delete the file and then it won't be available for uploads any more. // We still schedule the upload, resulting in an error, but ideally we'd somehow avoid this diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index fd7e193778..37859901d4 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -165,7 +165,6 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] 
= ( "pageserver_evictions_with_low_residence_duration_total", "pageserver_aux_file_estimated_size", "pageserver_valid_lsn_lease_count", - "pageserver_flush_wait_upload_seconds", counter("pageserver_tenant_throttling_count_accounted_start"), counter("pageserver_tenant_throttling_count_accounted_finish"), counter("pageserver_tenant_throttling_wait_usecs_sum"), diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py index 34e4e994cb..a4056404f0 100644 --- a/test_runner/regress/test_branching.py +++ b/test_runner/regress/test_branching.py @@ -19,6 +19,7 @@ from fixtures.pageserver.utils import wait_until_tenant_active from fixtures.utils import query_scalar from performance.test_perf_pgbench import get_scales_matrix from requests import RequestException +from requests.exceptions import RetryError # Test branch creation @@ -176,11 +177,8 @@ def test_cannot_create_endpoint_on_non_uploaded_timeline(neon_env_builder: NeonE env.neon_cli.mappings_map_branch(initial_branch, env.initial_tenant, env.initial_timeline) - with pytest.raises(RuntimeError, match="ERROR: Not found: Timeline"): - env.endpoints.create_start( - initial_branch, tenant_id=env.initial_tenant, basebackup_request_tries=2 - ) - ps_http.configure_failpoints(("before-upload-index-pausable", "off")) + with pytest.raises(RuntimeError, match="is not active, state: Loading"): + env.endpoints.create_start(initial_branch, tenant_id=env.initial_tenant) finally: env.pageserver.stop(immediate=True) @@ -221,10 +219,7 @@ def test_cannot_branch_from_non_uploaded_branch(neon_env_builder: NeonEnvBuilder branch_id = TimelineId.generate() - with pytest.raises( - PageserverApiException, - match="Cannot branch off the timeline that's not present in pageserver", - ): + with pytest.raises(RetryError, match="too many 503 error responses"): ps_http.timeline_create( env.pg_version, env.initial_tenant, diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 76a42ef4a2..52b6b254aa 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -784,54 +784,6 @@ def test_empty_branch_remote_storage_upload_on_restart(neon_env_builder: NeonEnv create_thread.join() -def test_paused_upload_stalls_checkpoint( - neon_env_builder: NeonEnvBuilder, -): - """ - This test checks that checkpoints block on uploads to remote storage. - """ - neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) - - env = neon_env_builder.init_start( - initial_tenant_conf={ - # Set a small compaction threshold - "compaction_threshold": "3", - # Disable GC - "gc_period": "0s", - # disable PITR - "pitr_interval": "0s", - } - ) - - env.pageserver.allowed_errors.append( - f".*PUT.* path=/v1/tenant/{env.initial_tenant}/timeline.* request was dropped before completing" - ) - - tenant_id = env.initial_tenant - timeline_id = env.initial_timeline - - client = env.pageserver.http_client() - layers_at_creation = client.layer_map_info(tenant_id, timeline_id) - deltas_at_creation = len(layers_at_creation.delta_layers()) - assert ( - deltas_at_creation == 1 - ), "are you fixing #5863? make sure we end up with 2 deltas at the end of endpoint lifecycle" - - # Make new layer uploads get stuck. - # Note that timeline creation waits for the initial layers to reach remote storage. - # So at this point, the `layers_at_creation` are in remote storage. 
- client.configure_failpoints(("before-upload-layer-pausable", "pause")) - - with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: - # Build two tables with some data inside - endpoint.safe_psql("CREATE TABLE foo AS SELECT x FROM generate_series(1, 10000) g(x)") - wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) - - with pytest.raises(ReadTimeout): - client.timeline_checkpoint(tenant_id, timeline_id, timeout=5) - client.configure_failpoints(("before-upload-layer-pausable", "off")) - - def wait_upload_queue_empty( client: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId ): From ddb9ae1214a1ab19300514e5487569471b8a551a Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Fri, 24 Jan 2025 10:47:28 +0100 Subject: [PATCH 254/583] pageserver: add compaction backpressure for layer flushes (#10405) ## Problem There is no direct backpressure for compaction and L0 read amplification. This allows a large buildup of compaction debt and read amplification. Resolves #5415. Requires #10402. ## Summary of changes Delay layer flushes based on the number of level 0 delta layers: * `l0_flush_delay_threshold`: delay flushes such that they take 2x as long (default `2 * compaction_threshold`). * `l0_flush_stall_threshold`: stall flushes until level 0 delta layers drop below threshold (default `4 * compaction_threshold`). If either threshold is reached, ephemeral layer rolls also synchronously wait for layer flushes to propagate this backpressure up into WAL ingestion. This will bound the number of frozen layers to 1 once backpressure kicks in, since all other frozen layers must flush before the rolled layer. ## Analysis This will significantly change the compute backpressure characteristics. Recall the three compute backpressure knobs: * `max_replication_write_lag`: 500 MB (based on Pageserver `last_received_lsn`). * `max_replication_flush_lag`: 10 GB (based on Pageserver `disk_consistent_lsn`). * `max_replication_apply_lag`: disabled (based on Pageserver `remote_consistent_lsn`). Previously, the Pageserver would keep ingesting WAL and build up ephemeral layers and L0 layers until the compute hit `max_replication_flush_lag` at 10 GB and began backpressuring. Now, once we delay/stall WAL ingestion, the compute will begin backpressuring after `max_replication_write_lag`, i.e. 500 MB. This is probably a good thing (we're not building up a ton of compaction debt), but we should consider tuning these settings. `max_replication_flush_lag` probably doesn't serve a purpose anymore, and we should consider removing it. Furthermore, the removal of the upload barrier in #10402 will mean that we no longer backpressure flushes based on S3 uploads, since `max_replication_apply_lag` is disabled. We should consider enabling this as well. ### When and what do we compact? Default compaction settings: * `compaction_threshold`: 10 L0 delta layers. * `compaction_period`: 20 seconds (between each compaction loop check). * `checkpoint_distance`: 256 MB (size of L0 delta layers). * `l0_flush_delay_threshold`: 20 L0 delta layers. * `l0_flush_stall_threshold`: 40 L0 delta layers. Compaction characteristics: * Minimum compaction volume: 10 layers * 256 MB = 2.5 GB. * Additional compaction volume (assuming 128 MB/s WAL): 128 MB/s * 20 seconds = 2.5 GB (10 L0 layers). * Required compaction bandwidth: 5.0 GB / 20 seconds = 256 MB/s. ### When do we hit `max_replication_write_lag`? 
Depending on how fast compaction and flushes happens, the compute will backpressure somewhere between `l0_flush_delay_threshold` or `l0_flush_stall_threshold` + `max_replication_write_lag`. * Minimum compute backpressure lag: 20 layers * 256 MB + 500 MB = 5.6 GB * Maximum compute backpressure lag: 40 layers * 256 MB + 500 MB = 10.0 GB This seems like a reasonable range to me. --- control_plane/src/pageserver.rs | 10 + libs/pageserver_api/src/config.rs | 13 +- libs/pageserver_api/src/models.rs | 16 ++ pageserver/src/metrics.rs | 12 +- pageserver/src/tenant.rs | 2 + pageserver/src/tenant/config.rs | 26 ++ pageserver/src/tenant/timeline.rs | 257 +++++++++++++++--- .../fixtures/pageserver/allowed_errors.py | 7 +- test_runner/performance/test_layer_map.py | 2 + .../regress/test_attach_tenant_config.py | 2 + test_runner/regress/test_branch_and_gc.py | 2 + test_runner/regress/test_compatibility.py | 2 +- test_runner/regress/test_recovery.py | 5 +- test_runner/regress/test_remote_storage.py | 2 + test_runner/regress/test_timeline_size.py | 2 +- test_runner/regress/test_vm_bits.py | 3 + 16 files changed, 311 insertions(+), 52 deletions(-) diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index b33b2877b3..967810ee06 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -352,6 +352,16 @@ impl PageServerNode { .map(serde_json::from_str) .transpose() .context("Failed to parse 'compaction_algorithm' json")?, + l0_flush_delay_threshold: settings + .remove("l0_flush_delay_threshold") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'l0_flush_delay_threshold' as an integer")?, + l0_flush_stall_threshold: settings + .remove("l0_flush_stall_threshold") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'l0_flush_stall_threshold' as an integer")?, gc_horizon: settings .remove("gc_horizon") .map(|x| x.parse::()) diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 4982c6233d..5866145690 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -254,9 +254,18 @@ pub struct TenantConfigToml { // Duration::ZERO means automatic compaction is disabled. #[serde(with = "humantime_serde")] pub compaction_period: Duration, - // Level0 delta layer threshold for compaction. + /// Level0 delta layer threshold for compaction. pub compaction_threshold: usize, pub compaction_algorithm: crate::models::CompactionAlgorithmSettings, + /// Level0 delta layer threshold at which to delay layer flushes for compaction backpressure, + /// such that they take 2x as long, and start waiting for layer flushes during ephemeral layer + /// rolls. This helps compaction keep up with WAL ingestion, and avoids read amplification + /// blowing up. Should be >compaction_threshold. If None, defaults to 2 * compaction_threshold. + /// 0 to disable. + pub l0_flush_delay_threshold: Option, + /// Level0 delta layer threshold at which to stall layer flushes. 0 to disable. If None, + /// defaults to 4 * compaction_threshold. Must be >compaction_threshold to avoid deadlock. + pub l0_flush_stall_threshold: Option, // Determines how much history is retained, to allow // branching and read replicas at an older point in time. // The unit is #of bytes of WAL. 
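As an illustration of the defaults documented in the hunk above, the effective thresholds resolve roughly as in the following condensed sketch. The function and parameter names here are placeholders; the authoritative logic is `Timeline::get_l0_flush_delay_threshold` and `Timeline::get_l0_flush_stall_threshold` in `pageserver/src/tenant/timeline.rs` below, which additionally bail out when compaction is disabled or failing.

```rust
/// Condensed sketch of how the L0 flush backpressure thresholds resolve from
/// the tenant/global config; names are illustrative only, not the patch itself.
fn effective_l0_thresholds(
    compaction_threshold: usize,   // default: 10 L0 delta layers
    delay_override: Option<usize>, // configured `l0_flush_delay_threshold`, if any
    stall_override: Option<usize>, // configured `l0_flush_stall_threshold`, if any
) -> (Option<usize>, Option<usize>) {
    let resolve = |override_value: Option<usize>, default_factor: usize| {
        // Unset means "default_factor * compaction_threshold".
        let threshold = override_value.unwrap_or(default_factor * compaction_threshold);
        // 0 disables this backpressure mechanism entirely; otherwise never
        // backpressure below the compaction threshold itself.
        (threshold != 0).then(|| threshold.max(compaction_threshold))
    };
    (resolve(delay_override, 2), resolve(stall_override, 4))
}
```

With the defaults above (`compaction_threshold = 10`, `checkpoint_distance = 256 MB`), this yields flush delays starting at 20 L0 layers (~5 GB of L0) and stalls at 40 layers (~10 GB), matching the analysis in the commit message.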
@@ -552,6 +561,8 @@ impl Default for TenantConfigToml { compaction_algorithm: crate::models::CompactionAlgorithmSettings { kind: DEFAULT_COMPACTION_ALGORITHM, }, + l0_flush_delay_threshold: None, + l0_flush_stall_threshold: None, gc_horizon: DEFAULT_GC_HORIZON, gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD) .expect("cannot parse default gc period"), diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index fd4879087f..16473415b4 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -462,6 +462,10 @@ pub struct TenantConfigPatch { #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub compaction_algorithm: FieldPatch, #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub l0_flush_delay_threshold: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub l0_flush_stall_threshold: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub gc_horizon: FieldPatch, #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub gc_period: FieldPatch, @@ -518,6 +522,8 @@ pub struct TenantConfig { pub compaction_threshold: Option, // defer parsing compaction_algorithm, like eviction_policy pub compaction_algorithm: Option, + pub l0_flush_delay_threshold: Option, + pub l0_flush_stall_threshold: Option, pub gc_horizon: Option, pub gc_period: Option, pub image_creation_threshold: Option, @@ -551,6 +557,8 @@ impl TenantConfig { mut compaction_period, mut compaction_threshold, mut compaction_algorithm, + mut l0_flush_delay_threshold, + mut l0_flush_stall_threshold, mut gc_horizon, mut gc_period, mut image_creation_threshold, @@ -583,6 +591,12 @@ impl TenantConfig { patch.compaction_period.apply(&mut compaction_period); patch.compaction_threshold.apply(&mut compaction_threshold); patch.compaction_algorithm.apply(&mut compaction_algorithm); + patch + .l0_flush_delay_threshold + .apply(&mut l0_flush_delay_threshold); + patch + .l0_flush_stall_threshold + .apply(&mut l0_flush_stall_threshold); patch.gc_horizon.apply(&mut gc_horizon); patch.gc_period.apply(&mut gc_period); patch @@ -635,6 +649,8 @@ impl TenantConfig { compaction_period, compaction_threshold, compaction_algorithm, + l0_flush_delay_threshold, + l0_flush_stall_threshold, gc_horizon, gc_period, image_creation_threshold, diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 985614b6cf..5247a4a2ac 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -38,6 +38,9 @@ pub(crate) enum StorageTimeOperation { #[strum(serialize = "layer flush")] LayerFlush, + #[strum(serialize = "layer flush delay")] + LayerFlushDelay, + #[strum(serialize = "compact")] Compact, @@ -2508,7 +2511,6 @@ impl Drop for AlwaysRecordingStorageTimeMetricsTimer { impl AlwaysRecordingStorageTimeMetricsTimer { /// Returns the elapsed duration of the timer. 
- #[allow(unused)] pub fn elapsed(&self) -> Duration { self.0.as_ref().expect("not dropped yet").elapsed() } @@ -2566,6 +2568,7 @@ pub(crate) struct TimelineMetrics { shard_id: String, timeline_id: String, pub flush_time_histo: StorageTimeMetrics, + pub flush_delay_histo: StorageTimeMetrics, pub compact_time_histo: StorageTimeMetrics, pub create_images_time_histo: StorageTimeMetrics, pub logical_size_histo: StorageTimeMetrics, @@ -2611,6 +2614,12 @@ impl TimelineMetrics { &shard_id, &timeline_id, ); + let flush_delay_histo = StorageTimeMetrics::new( + StorageTimeOperation::LayerFlushDelay, + &tenant_id, + &shard_id, + &timeline_id, + ); let compact_time_histo = StorageTimeMetrics::new( StorageTimeOperation::Compact, &tenant_id, @@ -2756,6 +2765,7 @@ impl TimelineMetrics { shard_id, timeline_id, flush_time_histo, + flush_delay_histo, compact_time_histo, create_images_time_histo, logical_size_histo, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index a273ef5d01..efe89cb982 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -5453,6 +5453,8 @@ pub(crate) mod harness { compaction_period: Some(tenant_conf.compaction_period), compaction_threshold: Some(tenant_conf.compaction_threshold), compaction_algorithm: Some(tenant_conf.compaction_algorithm), + l0_flush_delay_threshold: tenant_conf.l0_flush_delay_threshold, + l0_flush_stall_threshold: tenant_conf.l0_flush_stall_threshold, gc_horizon: Some(tenant_conf.gc_horizon), gc_period: Some(tenant_conf.gc_period), image_creation_threshold: Some(tenant_conf.image_creation_threshold), diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 3db1445f6e..c870ca97b8 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -281,6 +281,14 @@ pub struct TenantConfOpt { #[serde(default)] pub compaction_algorithm: Option, + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default)] + pub l0_flush_delay_threshold: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default)] + pub l0_flush_stall_threshold: Option, + #[serde(skip_serializing_if = "Option::is_none")] #[serde(default)] pub gc_horizon: Option, @@ -394,6 +402,12 @@ impl TenantConfOpt { .as_ref() .unwrap_or(&global_conf.compaction_algorithm) .clone(), + l0_flush_delay_threshold: self + .l0_flush_delay_threshold + .or(global_conf.l0_flush_delay_threshold), + l0_flush_stall_threshold: self + .l0_flush_stall_threshold + .or(global_conf.l0_flush_stall_threshold), gc_horizon: self.gc_horizon.unwrap_or(global_conf.gc_horizon), gc_period: self.gc_period.unwrap_or(global_conf.gc_period), image_creation_threshold: self @@ -458,6 +472,8 @@ impl TenantConfOpt { mut compaction_period, mut compaction_threshold, mut compaction_algorithm, + mut l0_flush_delay_threshold, + mut l0_flush_stall_threshold, mut gc_horizon, mut gc_period, mut image_creation_threshold, @@ -496,6 +512,12 @@ impl TenantConfOpt { .apply(&mut compaction_period); patch.compaction_threshold.apply(&mut compaction_threshold); patch.compaction_algorithm.apply(&mut compaction_algorithm); + patch + .l0_flush_delay_threshold + .apply(&mut l0_flush_delay_threshold); + patch + .l0_flush_stall_threshold + .apply(&mut l0_flush_stall_threshold); patch.gc_horizon.apply(&mut gc_horizon); patch .gc_period @@ -566,6 +588,8 @@ impl TenantConfOpt { compaction_period, compaction_threshold, compaction_algorithm, + l0_flush_delay_threshold, + l0_flush_stall_threshold, gc_horizon, gc_period, image_creation_threshold, @@ -623,6 +647,8 @@ impl 
From for models::TenantConfig { compaction_target_size: value.compaction_target_size, compaction_period: value.compaction_period.map(humantime), compaction_threshold: value.compaction_threshold, + l0_flush_delay_threshold: value.l0_flush_delay_threshold, + l0_flush_stall_threshold: value.l0_flush_stall_threshold, gc_horizon: value.gc_horizon, gc_period: value.gc_period.map(humantime), image_creation_threshold: value.image_creation_threshold, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 56f61abc45..fffa2c8e2b 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -22,11 +22,11 @@ use enumset::EnumSet; use fail::fail_point; use futures::{stream::FuturesUnordered, StreamExt}; use handle::ShardTimelineId; +use layer_manager::Shutdown; use offload::OffloadError; use once_cell::sync::Lazy; use pageserver_api::models::PageTraceEvent; use pageserver_api::{ - config::tenant_conf_defaults::DEFAULT_COMPACTION_THRESHOLD, key::{ KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE, SPARSE_RANGE, @@ -60,20 +60,14 @@ use utils::{ }; use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta}; -use std::sync::atomic::Ordering as AtomicOrdering; -use std::sync::OnceLock; -use std::sync::{Arc, Mutex, RwLock, Weak}; +use std::array; +use std::cmp::{max, min}; +use std::collections::btree_map::Entry; +use std::collections::{BTreeMap, HashMap, HashSet}; +use std::ops::{ControlFlow, Deref, Range}; +use std::sync::atomic::{AtomicBool, AtomicU64, Ordering as AtomicOrdering}; +use std::sync::{Arc, Mutex, OnceLock, RwLock, Weak}; use std::time::{Duration, Instant, SystemTime}; -use std::{ - array, - collections::{BTreeMap, HashMap, HashSet}, - sync::atomic::AtomicU64, -}; -use std::{cmp::min, ops::ControlFlow}; -use std::{ - collections::btree_map::Entry, - ops::{Deref, Range}, -}; use crate::l0_flush::{self, L0FlushGlobalState}; use crate::{ @@ -404,6 +398,9 @@ pub struct Timeline { /// Timeline deletion will acquire both compaction and gc locks in whatever order. compaction_lock: tokio::sync::Mutex<()>, + /// If true, the last compaction failed. + compaction_failed: AtomicBool, + /// Make sure we only have one running gc at a time. /// /// Must only be taken in two places: @@ -1698,13 +1695,25 @@ impl Timeline { return Ok(false); } - match self.get_compaction_algorithm_settings().kind { + let result = match self.get_compaction_algorithm_settings().kind { CompactionAlgorithm::Tiered => { self.compact_tiered(cancel, ctx).await?; Ok(false) } CompactionAlgorithm::Legacy => self.compact_legacy(cancel, options, ctx).await, - } + }; + + // Signal compaction failure to avoid L0 flush stalls when it's broken. + let compaction_failed = match result { + Ok(_) => false, + Err(CompactionError::Offload(_)) => false, // doesn't halt compaction + Err(CompactionError::ShuttingDown) => false, // not a failure + Err(CompactionError::Other(_)) => true, + }; + self.compaction_failed + .store(compaction_failed, AtomicOrdering::Relaxed); + + result } /// Mutate the timeline with a [`TimelineWriter`]. 
@@ -2133,6 +2142,13 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout) } + fn get_compaction_period(&self) -> Duration { + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); + tenant_conf + .compaction_period + .unwrap_or(self.conf.default_tenant_conf.compaction_period) + } + fn get_compaction_target_size(&self) -> u64 { let tenant_conf = self.tenant_conf.load(); tenant_conf @@ -2149,6 +2165,84 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.compaction_threshold) } + fn get_l0_flush_delay_threshold(&self) -> Option { + // Default to delay L0 flushes at 2x compaction threshold. + const DEFAULT_L0_FLUSH_DELAY_FACTOR: usize = 2; + + // If compaction is disabled, don't delay. + if self.get_compaction_period() == Duration::ZERO { + return None; + } + + let compaction_threshold = self.get_compaction_threshold(); + let tenant_conf = self.tenant_conf.load(); + let l0_flush_delay_threshold = tenant_conf + .tenant_conf + .l0_flush_delay_threshold + .or(self.conf.default_tenant_conf.l0_flush_delay_threshold) + .unwrap_or(DEFAULT_L0_FLUSH_DELAY_FACTOR * compaction_threshold); + + // 0 disables backpressure. + if l0_flush_delay_threshold == 0 { + return None; + } + + // Clamp the flush delay threshold to the compaction threshold; it doesn't make sense to + // backpressure flushes below this. + // TODO: the tenant config should have validation to prevent this instead. + debug_assert!(l0_flush_delay_threshold >= compaction_threshold); + Some(max(l0_flush_delay_threshold, compaction_threshold)) + } + + fn get_l0_flush_stall_threshold(&self) -> Option { + // Default to stall L0 flushes at 4x compaction threshold. + const DEFAULT_L0_FLUSH_STALL_FACTOR: usize = 4; + + // If compaction is disabled, don't stall. + if self.get_compaction_period() == Duration::ZERO { + return None; + } + + // If compaction is failing, don't stall and try to keep the tenant alive. This may not be a + // good idea: read amp can grow unbounded, leading to terrible performance, and we may take + // on unbounded compaction debt that can take a long time to fix once compaction comes back + // online. At least we'll delay flushes, slowing down the growth and buying some time. + if self.compaction_failed.load(AtomicOrdering::Relaxed) { + return None; + } + + let compaction_threshold = self.get_compaction_threshold(); + let tenant_conf = self.tenant_conf.load(); + let l0_flush_stall_threshold = tenant_conf + .tenant_conf + .l0_flush_stall_threshold + .or(self.conf.default_tenant_conf.l0_flush_stall_threshold); + + // Tests sometimes set compaction_threshold=1 to generate lots of layer files, and don't + // handle the 20-second compaction delay. Some (e.g. `test_backward_compatibility`) can't + // easily adjust the L0 backpressure settings, so just disable stalls in this case. + if cfg!(feature = "testing") + && compaction_threshold == 1 + && l0_flush_stall_threshold.is_none() + { + return None; + } + + let l0_flush_stall_threshold = l0_flush_stall_threshold + .unwrap_or(DEFAULT_L0_FLUSH_STALL_FACTOR * compaction_threshold); + + // 0 disables backpressure. + if l0_flush_stall_threshold == 0 { + return None; + } + + // Clamp the flush stall threshold to the compaction threshold; it doesn't make sense to + // backpressure flushes below this. + // TODO: the tenant config should have validation to prevent this instead. 
+ debug_assert!(l0_flush_stall_threshold >= compaction_threshold); + Some(max(l0_flush_stall_threshold, compaction_threshold)) + } + fn get_image_creation_threshold(&self) -> usize { let tenant_conf = self.tenant_conf.load(); tenant_conf @@ -2385,6 +2479,7 @@ impl Timeline { gate: Gate::default(), compaction_lock: tokio::sync::Mutex::default(), + compaction_failed: AtomicBool::default(), gc_lock: tokio::sync::Mutex::default(), standby_horizon: AtomicLsn::new(0), @@ -3600,6 +3695,12 @@ impl Timeline { mut layer_flush_start_rx: tokio::sync::watch::Receiver<(u64, Lsn)>, ctx: &RequestContext, ) { + // Subscribe to L0 delta layer updates, for compaction backpressure. + let mut watch_l0 = match self.layers.read().await.layer_map() { + Ok(lm) => lm.watch_level0_deltas(), + Err(Shutdown) => return, + }; + info!("started flush loop"); loop { tokio::select! { @@ -3630,43 +3731,62 @@ impl Timeline { break Ok(()); } - let timer = self.metrics.flush_time_histo.start_timer(); - - let num_frozen_layers; - let frozen_layer_total_size; - let layer_to_flush = { - let guard = self.layers.read().await; - let Ok(lm) = guard.layer_map() else { + // Fetch the next layer to flush, if any. + let (layer, l0_count, frozen_count, frozen_size) = { + let layers = self.layers.read().await; + let Ok(lm) = layers.layer_map() else { info!("dropping out of flush loop for timeline shutdown"); return; }; - num_frozen_layers = lm.frozen_layers.len(); - frozen_layer_total_size = lm + let l0_count = lm.level0_deltas().len(); + let frozen_count = lm.frozen_layers.len(); + let frozen_size: u64 = lm .frozen_layers .iter() .map(|l| l.estimated_in_mem_size()) - .sum::(); - lm.frozen_layers.front().cloned() - // drop 'layers' lock to allow concurrent reads and writes + .sum(); + let layer = lm.frozen_layers.front().cloned(); + (layer, l0_count, frozen_count, frozen_size) + // drop 'layers' lock }; - let Some(layer_to_flush) = layer_to_flush else { + let Some(layer) = layer else { break Ok(()); }; - if num_frozen_layers - > std::cmp::max( - self.get_compaction_threshold(), - DEFAULT_COMPACTION_THRESHOLD, - ) - && frozen_layer_total_size >= /* 128 MB */ 128000000 - { - tracing::warn!( - "too many frozen layers: {num_frozen_layers} layers with estimated in-mem size of {frozen_layer_total_size} bytes", - ); - } - match self.flush_frozen_layer(layer_to_flush, ctx).await { - Ok(this_layer_to_lsn) => { - flushed_to_lsn = std::cmp::max(flushed_to_lsn, this_layer_to_lsn); + + // Stall flushes to backpressure if compaction can't keep up. This is propagated up + // to WAL ingestion by having ephemeral layer rolls wait for flushes. + // + // NB: the compaction loop only checks `compaction_threshold` every 20 seconds, so + // we can end up stalling before compaction even starts. Consider making it more + // responsive (e.g. via `watch_level0_deltas`). + if let Some(stall_threshold) = self.get_l0_flush_stall_threshold() { + if l0_count >= stall_threshold { + warn!( + "stalling layer flushes for compaction backpressure at {l0_count} \ + L0 layers ({frozen_count} frozen layers with {frozen_size} bytes)" + ); + let stall_timer = self + .metrics + .flush_delay_histo + .start_timer() + .record_on_drop(); + tokio::select! { + result = watch_l0.wait_for(|l0| *l0 < stall_threshold) => { + if let Ok(l0) = result.as_deref() { + let delay = stall_timer.elapsed().as_secs_f64(); + info!("resuming layer flushes at {l0} L0 layers after {delay:.3}s"); + } + }, + _ = self.cancel.cancelled() => {}, + } + continue; // check again } + } + + // Flush the layer. 
+ let flush_timer = self.metrics.flush_time_histo.start_timer(); + match self.flush_frozen_layer(layer, ctx).await { + Ok(layer_lsn) => flushed_to_lsn = max(flushed_to_lsn, layer_lsn), Err(FlushLayerError::Cancelled) => { info!("dropping out of flush loop for timeline shutdown"); return; @@ -3680,7 +3800,30 @@ impl Timeline { break err.map(|_| ()); } } - timer.stop_and_record(); + let flush_duration = flush_timer.stop_and_record(); + + // Delay the next flush to backpressure if compaction can't keep up. We delay by the + // flush duration such that the flush takes 2x as long. This is propagated up to WAL + // ingestion by having ephemeral layer rolls wait for flushes. + if let Some(delay_threshold) = self.get_l0_flush_delay_threshold() { + if l0_count >= delay_threshold { + let delay = flush_duration.as_secs_f64(); + info!( + "delaying layer flush by {delay:.3}s for compaction backpressure at \ + {l0_count} L0 layers ({frozen_count} frozen layers with {frozen_size} bytes)" + ); + let _delay_timer = self + .metrics + .flush_delay_histo + .start_timer() + .record_on_drop(); + tokio::select! { + _ = tokio::time::sleep(flush_duration) => {}, + _ = watch_l0.wait_for(|l0| *l0 < delay_threshold) => {}, + _ = self.cancel.cancelled() => {}, + } + } + } }; // Unsharded tenants should never advance their LSN beyond the end of the @@ -5910,13 +6053,37 @@ impl TimelineWriter<'_> { async fn roll_layer(&mut self, freeze_at: Lsn) -> Result<(), FlushLayerError> { let current_size = self.write_guard.as_ref().unwrap().current_size; + // If layer flushes are backpressured due to compaction not keeping up, wait for the flush + // to propagate the backpressure up into WAL ingestion. + let l0_count = self + .tl + .layers + .read() + .await + .layer_map()? + .level0_deltas() + .len(); + let wait_thresholds = [ + self.get_l0_flush_delay_threshold(), + self.get_l0_flush_stall_threshold(), + ]; + let wait_threshold = wait_thresholds.into_iter().flatten().min(); + // self.write_guard will be taken by the freezing - self.tl + let flush_id = self + .tl .freeze_inmem_layer_at(freeze_at, &mut self.write_guard) .await?; assert!(self.write_guard.is_none()); + if let Some(wait_threshold) = wait_threshold { + if l0_count >= wait_threshold { + info!("layer roll waiting for flush due to compaction backpressure at {l0_count} L0 layers"); + self.tl.wait_flush_completion(flush_id).await?; + } + } + if current_size >= self.get_checkpoint_distance() * 2 { warn!("Flushed oversized open layer with size {}", current_size) } diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index 5059039678..748ac0d569 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -99,8 +99,11 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = ( ".*WARN.*path=/v1/utilization .*request was dropped before completing", # Can happen during shutdown ".*scheduling deletion on drop failed: queue is in state Stopped.*", - # Too many frozen layers error is normal during intensive benchmarks - ".*too many frozen layers.*", + # L0 flush backpressure delays are expected under heavy ingest load. We want to exercise + # this backpressure in tests. 
+ ".*delaying layer flush by \\S+ for compaction backpressure.*", + ".*stalling layer flushes for compaction backpressure.*", + ".*layer roll waiting for flush due to compaction backpressure.*", ) diff --git a/test_runner/performance/test_layer_map.py b/test_runner/performance/test_layer_map.py index 8a4ad2d399..9b159c5fcf 100644 --- a/test_runner/performance/test_layer_map.py +++ b/test_runner/performance/test_layer_map.py @@ -23,6 +23,8 @@ def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark): "checkpoint_distance": "16384", "compaction_period": "1 s", "compaction_threshold": "1", + "l0_flush_delay_threshold": "0", + "l0_flush_stall_threshold": "0", "compaction_target_size": "16384", } ) diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index b8d47346a3..1fdba223ad 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -139,6 +139,8 @@ def test_fully_custom_config(positive_env: NeonEnv): fully_custom_config = { "compaction_period": "1h", "compaction_threshold": 13, + "l0_flush_delay_threshold": 25, + "l0_flush_stall_threshold": 42, "compaction_target_size": 1048576, "checkpoint_distance": 10000, "checkpoint_timeout": "13m", diff --git a/test_runner/regress/test_branch_and_gc.py b/test_runner/regress/test_branch_and_gc.py index fccfbc7f09..0e28231a86 100644 --- a/test_runner/regress/test_branch_and_gc.py +++ b/test_runner/regress/test_branch_and_gc.py @@ -64,6 +64,8 @@ def test_branch_and_gc(neon_simple_env: NeonEnv): # tweak the default settings to allow quickly create image layers and L1 layers "compaction_period": "1 s", "compaction_threshold": "2", + "l0_flush_delay_threshold": "20", + "l0_flush_stall_threshold": "40", "image_creation_threshold": "1", # Disable PITR, this test will set an explicit space-based GC limit "pitr_interval": "0 s", diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index ac44630d30..cdc6c0053d 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -143,7 +143,7 @@ def test_create_snapshot( env = neon_env_builder.init_start( initial_tenant_conf={ - # Miniature layers to enable generating non-trivial layer map without writing lots of data + # Miniature layers to enable generating non-trivial layer map without writing lots of data. "checkpoint_distance": f"{128 * 1024}", "compaction_threshold": "1", "compaction_target_size": f"{128 * 1024}", diff --git a/test_runner/regress/test_recovery.py b/test_runner/regress/test_recovery.py index b43a443149..dab01fcd1a 100644 --- a/test_runner/regress/test_recovery.py +++ b/test_runner/regress/test_recovery.py @@ -11,10 +11,13 @@ from fixtures.neon_fixtures import NeonEnvBuilder # Test pageserver recovery after crash # def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder): - # Override default checkpointer settings to run it more often + # Override default checkpointer settings to run it more often. + # This also creates a bunch more L0 layers, so disable backpressure. 
env = neon_env_builder.init_start( initial_tenant_conf={ "checkpoint_distance": "1048576", + "l0_flush_delay_threshold": "0", + "l0_flush_stall_threshold": "0", } ) env.pageserver.is_testing_enabled_or_skip() diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 52b6b254aa..f6bc6f6f41 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -539,6 +539,8 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue( # small checkpointing and compaction targets to ensure we generate many operations "checkpoint_distance": f"{64 * 1024}", "compaction_threshold": "1", + "l0_flush_delay_threshold": "0", + "l0_flush_stall_threshold": "0", "compaction_target_size": f"{64 * 1024}", # large horizon to avoid automatic GC (our assert on gc_result below relies on that) "gc_horizon": f"{1024 ** 4}", diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 95bf9106cd..e2fdacdbfc 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -440,7 +440,7 @@ def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder env = neon_env_builder.init_start( initial_tenant_conf={ "checkpoint_distance": "100000", - "compaction_period": "10m", + "compaction_period": "0s", } ) pageserver_http = env.pageserver.http_client() diff --git a/test_runner/regress/test_vm_bits.py b/test_runner/regress/test_vm_bits.py index d9e59c71f4..4865178ca8 100644 --- a/test_runner/regress/test_vm_bits.py +++ b/test_runner/regress/test_vm_bits.py @@ -203,6 +203,9 @@ def test_vm_bit_clear_on_heap_lock_blackbox(neon_env_builder: NeonEnvBuilder): "checkpoint_distance": f"{128 * 1024}", "compaction_target_size": f"{128 * 1024}", "compaction_threshold": "1", + # disable L0 backpressure + "l0_flush_delay_threshold": "0", + "l0_flush_stall_threshold": "0", # create image layers eagerly, so that GC can remove some layers "image_creation_threshold": "1", # set PITR interval to be small, so we can do GC From de8276488d10f386166b7063e1d894cbfe8a450f Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Fri, 24 Jan 2025 10:34:57 +0000 Subject: [PATCH 255/583] tests: enable wal reader fanout in tests (#10301) Note: this has to merge after the release is cut on `2025-01-17` for compat tests to start passing. ## Problem SK wal reader fan-out is not enabled in tests by default. ## Summary of changes Enable it. --- test_runner/fixtures/neon_fixtures.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 388c1eb046..7e3cc19829 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -4390,6 +4390,7 @@ class Safekeeper(LogUtils): "1s", "--eviction-min-resident", "10s", + "--wal-reader-fanout", ] self.extra_opts = extra_opts From c286fea01896aabc0761b5d880246e93fe60ed4d Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Fri, 24 Jan 2025 11:44:48 +0100 Subject: [PATCH 256/583] Print logs in extensions test in another step to improve readability (#10483) ## Problem The containers' log output is mixed with the tests' output, so you must scroll up to find the error. ## Summary of changes Printing of containers' logs moved to a separate step. 
--- .github/workflows/build_and_test.yml | 4 ++-- docker-compose/docker-compose.yml | 4 ++-- docker-compose/docker_compose_test.sh | 8 +------- 3 files changed, 5 insertions(+), 11 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index b1230879d3..81127f7870 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -820,8 +820,8 @@ jobs: - name: Print logs and clean up if: always() run: | - docker compose -f ./docker-compose/docker-compose.yml logs || 0 - docker compose -f ./docker-compose/docker-compose.yml down + docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml logs || true + docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml down promote-images-dev: needs: [ check-permissions, tag, vm-compute-node-image, neon-image ] diff --git a/docker-compose/docker-compose.yml b/docker-compose/docker-compose.yml index 4f0a887c27..95e4b6fde7 100644 --- a/docker-compose/docker-compose.yml +++ b/docker-compose/docker-compose.yml @@ -150,8 +150,8 @@ services: - REPOSITORY=${REPOSITORY:-neondatabase} - COMPUTE_IMAGE=compute-node-v${PG_VERSION:-16} - TAG=${TAG:-latest} - - http_proxy=$http_proxy - - https_proxy=$https_proxy + - http_proxy=${http_proxy:-} + - https_proxy=${https_proxy:-} environment: - PG_VERSION=${PG_VERSION:-16} #- RUST_BACKTRACE=1 diff --git a/docker-compose/docker_compose_test.sh b/docker-compose/docker_compose_test.sh index 4f1ae64873..f42aca673b 100755 --- a/docker-compose/docker_compose_test.sh +++ b/docker-compose/docker_compose_test.sh @@ -22,7 +22,6 @@ PSQL_OPTION="-h localhost -U cloud_admin -p 55433 -d postgres" cleanup() { echo "show container information" docker ps - docker compose --profile test-extensions -f $COMPOSE_FILE logs echo "stop containers..." docker compose --profile test-extensions -f $COMPOSE_FILE down } @@ -41,7 +40,6 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do cnt=`expr $cnt + 3` if [ $cnt -gt 60 ]; then echo "timeout before the compute is ready." - cleanup exit 1 fi if docker compose --profile test-extensions -f $COMPOSE_FILE logs "compute_is_ready" | grep -q "accepting connections"; then @@ -63,11 +61,9 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/ext-src/pg_hint_plan-src/ rm -rf $TMPDIR # We are running tests now - if docker exec -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,pg_graphql-src,kq_imcx-src,wal2json_2_5-src \ + if ! docker exec -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,pg_graphql-src,kq_imcx-src,wal2json_2_5-src \ $TEST_CONTAINER_NAME /run-tests.sh | tee testout.txt then - cleanup - else FAILED=$(tail -1 testout.txt) for d in $FAILED do @@ -77,9 +73,7 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do cat $d/regression.out $d/regression.diffs || true done rm -rf $FAILED - cleanup exit 1 fi fi - cleanup done From dcc437da1dc2225c3d724bbeb44b53ebeab84089 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?JC=20Gr=C3=BCnhage?= Date: Fri, 24 Jan 2025 12:03:39 +0100 Subject: [PATCH 257/583] Make promote-images-prod depend on promote-images-dev (#10494) ## Problem After talking about it again with @bayandin again this should replace the changes from https://github.com/neondatabase/neon/pull/10475. 
While the previous changes worked, they are less visually clear in what happens, and we might end up in a situation where we update `latest`, but don't actually have the tagged image pushed that contains the same changes. The latter would result in potentially hard to debug situations. ## Summary of changes Revert c283aaaf8d66dd04ce463733cf6545269f70f4c9 and make promote-images-prod depend on promote-images-dev instead. --- .github/workflows/build_and_test.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 81127f7870..32b99d9c38 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -859,7 +859,7 @@ jobs: done promote-images-prod: - needs: [ check-permissions, tag, test-images, vm-compute-node-image ] + needs: [ check-permissions, tag, test-images, promote-images-dev ] runs-on: ubuntu-22.04 if: github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' @@ -892,14 +892,14 @@ jobs: run: | for repo in neondatabase 369495373322.dkr.ecr.eu-central-1.amazonaws.com; do docker buildx imagetools create -t $repo/neon:latest \ - neondatabase/neon:${{ needs.tag.outputs.build-tag }} + $repo/neon:${{ needs.tag.outputs.build-tag }} for version in ${VERSIONS}; do docker buildx imagetools create -t $repo/compute-node-${version}:latest \ - neondatabase/compute-node-${version}:${{ needs.tag.outputs.build-tag }} + $repo/compute-node-${version}:${{ needs.tag.outputs.build-tag }} docker buildx imagetools create -t $repo/vm-compute-node-${version}:latest \ - neondatabase/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }} + $repo/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }} done done docker buildx imagetools create -t neondatabase/neon-test-extensions-v16:latest \ From d8ab6ddb0fee7912eafa7012435bcdd014543663 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 24 Jan 2025 14:43:52 +0200 Subject: [PATCH 258/583] Check if relation has storage in calculate_relation_size (#10477) ## Problem Parent of partitioned table has no storage, it relfilelocator is zero. It cab be incorrectly hashed and produce wrong results. See https://github.com/neondatabase/postgres/pull/518 ## Summary of changes This problem is already addressed in pg17. Add the same check for all other PG versions. 
Postgres PRs: https://github.com/neondatabase/postgres/pull/566 https://github.com/neondatabase/postgres/pull/565 https://github.com/neondatabase/postgres/pull/564 Co-authored-by: Konstantin Knizhnik --- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/revisions.json | 6 +++--- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 5f3b3afdd7..c0aedfd3ca 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 5f3b3afdd7c24b4a0fd63ecb3288fab472fcc633 +Subproject commit c0aedfd3cac447510a2db843b561f0c52901b679 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 935292e883..355a7c69d3 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 935292e883298187f112db6e9c7f765037ddcf64 +Subproject commit 355a7c69d3f907f3612eb406cc7b9c2f55d59b59 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 061d563779..3cf7ce1afa 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 061d56377961ba56998e41b7d5d5e975919ad301 +Subproject commit 3cf7ce1afab75027716d14223f95ddb300754162 diff --git a/vendor/revisions.json b/vendor/revisions.json index a104be8ae0..dba0e67fb4 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -5,14 +5,14 @@ ], "v16": [ "16.6", - "061d56377961ba56998e41b7d5d5e975919ad301" + "3cf7ce1afab75027716d14223f95ddb300754162" ], "v15": [ "15.10", - "935292e883298187f112db6e9c7f765037ddcf64" + "355a7c69d3f907f3612eb406cc7b9c2f55d59b59" ], "v14": [ "14.15", - "5f3b3afdd7c24b4a0fd63ecb3288fab472fcc633" + "c0aedfd3cac447510a2db843b561f0c52901b679" ] } From ef2a2555b1c001ee6af9bf4bc9896aa6bd09b9ed Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Fri, 24 Jan 2025 14:55:05 +0100 Subject: [PATCH 259/583] pageserver: tighten compaction failure detection (#10502) ## Problem If compaction fails, we disable L0 flush stalls to avoid persistent stalls. However, the logic would unset the failure marker on offload failures or shutdown. This can lead to sudden L0 flush stalls if we try and fail to offload a timeline with compaction failures, or if there is some kind of shutdown race. Touches #10405. ## Summary of changes Don't touch the compaction failure marker on offload failures or shutdown. --- pageserver/src/tenant/timeline.rs | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index fffa2c8e2b..ee43512501 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1704,14 +1704,16 @@ impl Timeline { }; // Signal compaction failure to avoid L0 flush stalls when it's broken. - let compaction_failed = match result { - Ok(_) => false, - Err(CompactionError::Offload(_)) => false, // doesn't halt compaction - Err(CompactionError::ShuttingDown) => false, // not a failure - Err(CompactionError::Other(_)) => true, + match result { + Ok(_) => self.compaction_failed.store(false, AtomicOrdering::Relaxed), + Err(CompactionError::Other(_)) => { + self.compaction_failed.store(true, AtomicOrdering::Relaxed) + } + // Don't change the current value on offload failure or shutdown. We don't want to + // abruptly stall nor resume L0 flushes in these cases. 
+ Err(CompactionError::Offload(_)) => {} + Err(CompactionError::ShuttingDown) => {} }; - self.compaction_failed - .store(compaction_failed, AtomicOrdering::Relaxed); result } From 7000aaaf75ddb4451236a9dd41e71247864c2095 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Fri, 24 Jan 2025 14:55:48 +0000 Subject: [PATCH 260/583] chore: fix h2 stubgen (#10491) ## Problem ## Summary of changes --------- Co-authored-by: Alexander Bayandin --- poetry.lock | 190 +++++++++++++++++--- pyproject.toml | 3 +- test_runner/stubs/h2/__init__.pyi | 1 + test_runner/stubs/h2/config.pyi | 25 +-- test_runner/stubs/h2/connection.pyi | 162 +++++++---------- test_runner/stubs/h2/errors.pyi | 30 ++-- test_runner/stubs/h2/events.pyi | 8 +- test_runner/stubs/h2/exceptions.pyi | 11 +- test_runner/stubs/h2/frame_buffer.pyi | 19 +- test_runner/stubs/h2/settings.pyi | 70 ++++---- test_runner/stubs/h2/stream.pyi | 240 ++++++++++---------------- test_runner/stubs/h2/utilities.pyi | 41 +++-- test_runner/stubs/h2/windows.pyi | 11 +- 13 files changed, 426 insertions(+), 385 deletions(-) diff --git a/poetry.lock b/poetry.lock index 2cd2bc6383..c471d3e69c 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.5 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.0.1 and should not be changed by hand. [[package]] name = "aiohappyeyeballs" @@ -6,6 +6,7 @@ version = "2.3.5" description = "Happy Eyeballs for asyncio" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "aiohappyeyeballs-2.3.5-py3-none-any.whl", hash = "sha256:4d6dea59215537dbc746e93e779caea8178c866856a721c9c660d7a5a7b8be03"}, {file = "aiohappyeyeballs-2.3.5.tar.gz", hash = "sha256:6fa48b9f1317254f122a07a131a86b71ca6946ca989ce6326fff54a99a920105"}, @@ -17,6 +18,7 @@ version = "3.10.11" description = "Async http client/server framework (asyncio)" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "aiohttp-3.10.11-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:5077b1a5f40ffa3ba1f40d537d3bec4383988ee51fbba6b74aa8fb1bc466599e"}, {file = "aiohttp-3.10.11-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8d6a14a4d93b5b3c2891fca94fa9d41b2322a68194422bef0dd5ec1e57d7d298"}, @@ -128,6 +130,7 @@ version = "1.4.0" description = "Postgres integration with asyncio." 
optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "aiopg-1.4.0-py3-none-any.whl", hash = "sha256:aea46e8aff30b039cfa818e6db4752c97656e893fc75e5a5dc57355a9e9dedbd"}, {file = "aiopg-1.4.0.tar.gz", hash = "sha256:116253bef86b4d954116716d181e9a0294037f266718b2e1c9766af995639d71"}, @@ -146,6 +149,7 @@ version = "1.3.1" description = "aiosignal: a list of registered asynchronous callbacks" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "aiosignal-1.3.1-py3-none-any.whl", hash = "sha256:f8376fb07dd1e86a584e4fcdec80b36b7f81aac666ebc724e2c090300dd83b17"}, {file = "aiosignal-1.3.1.tar.gz", hash = "sha256:54cd96e15e1649b75d6c87526a6ff0b6c1b0dd3459f43d9ca11d48c339b68cfc"}, @@ -160,6 +164,7 @@ version = "2.13.2" description = "Allure pytest integration" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "allure-pytest-2.13.2.tar.gz", hash = "sha256:22243159e8ec81ce2b5254b4013802198821b1b42f118f69d4a289396607c7b3"}, {file = "allure_pytest-2.13.2-py3-none-any.whl", hash = "sha256:17de9dbee7f61c8e66a5b5e818b00e419dbcea44cb55c24319401ba813220690"}, @@ -175,6 +180,7 @@ version = "2.13.2" description = "Common module for integrate allure with python-based frameworks" optional = false python-versions = ">=3.6" +groups = ["main"] files = [ {file = "allure-python-commons-2.13.2.tar.gz", hash = "sha256:8a03681330231b1deadd86b97ff68841c6591320114ae638570f1ed60d7a2033"}, {file = "allure_python_commons-2.13.2-py3-none-any.whl", hash = "sha256:2bb3646ec3fbf5b36d178a5e735002bc130ae9f9ba80f080af97d368ba375051"}, @@ -190,6 +196,7 @@ version = "0.6.0" description = "Reusable constraint types to use with typing.Annotated" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "annotated_types-0.6.0-py3-none-any.whl", hash = "sha256:0641064de18ba7a25dee8f96403ebc39113d0cb953a01429249d5c7564666a43"}, {file = "annotated_types-0.6.0.tar.gz", hash = "sha256:563339e807e53ffd9c267e99fc6d9ea23eb8443c08f112651963e24e22f84a5d"}, @@ -201,6 +208,7 @@ version = "4.13.1" description = "ANTLR 4.13.1 runtime for Python 3" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "antlr4-python3-runtime-4.13.1.tar.gz", hash = "sha256:3cd282f5ea7cfb841537fe01f143350fdb1c0b1ce7981443a2fa8513fddb6d1a"}, {file = "antlr4_python3_runtime-4.13.1-py3-none-any.whl", hash = "sha256:78ec57aad12c97ac039ca27403ad61cb98aaec8a3f9bb8144f889aa0fa28b943"}, @@ -212,6 +220,7 @@ version = "4.3.0" description = "High level compatibility layer for multiple asynchronous event loop implementations" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "anyio-4.3.0-py3-none-any.whl", hash = "sha256:048e05d0f6caeed70d731f3db756d35dcc1f35747c8c403364a8332c630441b8"}, {file = "anyio-4.3.0.tar.gz", hash = "sha256:f75253795a87df48568485fd18cdd2a3fa5c4f7c5be8e5e36637733fce06fed6"}, @@ -232,6 +241,7 @@ version = "4.0.3" description = "Timeout context manager for asyncio programs" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "async-timeout-4.0.3.tar.gz", hash = "sha256:4640d96be84d82d02ed59ea2b7105a0f7b33abe8703703cd0ab0bf87c427522f"}, {file = "async_timeout-4.0.3-py3-none-any.whl", hash = "sha256:7405140ff1230c310e51dc27b3145b9092d659ce68ff733fb0cefe3ee42be028"}, @@ -243,6 +253,7 @@ version = "0.30.0" description = "An asyncio PostgreSQL driver" optional = false python-versions = ">=3.8.0" +groups = ["main"] files = [ {file = "asyncpg-0.30.0-cp310-cp310-macosx_10_9_x86_64.whl", 
hash = "sha256:bfb4dd5ae0699bad2b233672c8fc5ccbd9ad24b89afded02341786887e37927e"}, {file = "asyncpg-0.30.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:dc1f62c792752a49f88b7e6f774c26077091b44caceb1983509edc18a2222ec0"}, @@ -306,6 +317,7 @@ version = "21.4.0" description = "Classes Without Boilerplate" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +groups = ["main"] files = [ {file = "attrs-21.4.0-py2.py3-none-any.whl", hash = "sha256:2d27e3784d7a565d36ab851fe94887c5eccd6a463168875832a1be79c82828b4"}, {file = "attrs-21.4.0.tar.gz", hash = "sha256:626ba8234211db98e869df76230a137c4c40a12d72445c45d5f5b716f076e2fd"}, @@ -323,6 +335,7 @@ version = "1.88.0" description = "AWS SAM Translator is a library that transform SAM templates into AWS CloudFormation templates" optional = false python-versions = "!=4.0,<=4.0,>=3.8" +groups = ["main"] files = [ {file = "aws_sam_translator-1.88.0-py3-none-any.whl", hash = "sha256:aa93d498d8de3fb3d485c316155b1628144b823bbc176099a20de06df666fcac"}, {file = "aws_sam_translator-1.88.0.tar.gz", hash = "sha256:e77c65f3488566122277accd44a0f1ec018e37403e0d5fe25120d96e537e91a7"}, @@ -343,6 +356,7 @@ version = "2.10.0" description = "The AWS X-Ray SDK for Python (the SDK) enables Python developers to record and emit information from within their applications to the AWS X-Ray service." optional = false python-versions = "*" +groups = ["main"] files = [ {file = "aws-xray-sdk-2.10.0.tar.gz", hash = "sha256:9b14924fd0628cf92936055864655354003f0b1acc3e1c3ffde6403d0799dd7a"}, {file = "aws_xray_sdk-2.10.0-py2.py3-none-any.whl", hash = "sha256:7551e81a796e1a5471ebe84844c40e8edf7c218db33506d046fec61f7495eda4"}, @@ -358,6 +372,7 @@ version = "2.2.1" description = "Function decoration for backoff and retry" optional = false python-versions = ">=3.7,<4.0" +groups = ["main"] files = [ {file = "backoff-2.2.1-py3-none-any.whl", hash = "sha256:63579f9a0628e06278f7e47b7d7d5b6ce20dc65c5e96a6f3ca99a6adca0396e8"}, {file = "backoff-2.2.1.tar.gz", hash = "sha256:03f829f5bb1923180821643f8753b0502c3b682293992485b0eef2807afa5cba"}, @@ -369,6 +384,7 @@ version = "1.34.11" description = "The AWS SDK for Python" optional = false python-versions = ">= 3.8" +groups = ["main"] files = [ {file = "boto3-1.34.11-py3-none-any.whl", hash = "sha256:1af021e0c6e3040e8de66d403e963566476235bb70f9a8e3f6784813ac2d8026"}, {file = "boto3-1.34.11.tar.gz", hash = "sha256:31c130a40ec0631059b77d7e87f67ad03ff1685a5b37638ac0c4687026a3259d"}, @@ -388,6 +404,7 @@ version = "1.26.16" description = "Type annotations for boto3 1.26.16 generated with mypy-boto3-builder 7.11.11" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "boto3-stubs-1.26.16.tar.gz", hash = "sha256:618253ae19f1480785759bcaee8c8b10ed3fc037027247c26a3461a50f58406d"}, {file = "boto3_stubs-1.26.16-py3-none-any.whl", hash = "sha256:8cf2925bc3e1349c93eb0f49c1061affc5ca314d69eeb335349037969d0787ed"}, @@ -732,6 +749,7 @@ version = "1.34.11" description = "Low-level, data-driven core of boto 3." 
optional = false python-versions = ">= 3.8" +groups = ["main"] files = [ {file = "botocore-1.34.11-py3-none-any.whl", hash = "sha256:1ff1398b6ea670e1c01ac67a33af3da854f8e700d3528289c04f319c330d8250"}, {file = "botocore-1.34.11.tar.gz", hash = "sha256:51905c3d623c60df5dc5794387de7caf886d350180a01a3dfa762e903edb45a9"}, @@ -751,6 +769,7 @@ version = "1.27.38" description = "Type annotations for botocore 1.27.38 generated with mypy-boto3-builder 7.10.1" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "botocore-stubs-1.27.38.tar.gz", hash = "sha256:408e8b86b5d171b58f81c74ca9d3b5317a5a8e2d3bc2073aa841ac13b8939e56"}, {file = "botocore_stubs-1.27.38-py3-none-any.whl", hash = "sha256:7add7641e9a479a9c8366893bb522fd9ca3d58714201e43662a200a148a1bc38"}, @@ -765,6 +784,7 @@ version = "2024.7.4" description = "Python package for providing Mozilla's CA Bundle." optional = false python-versions = ">=3.6" +groups = ["main"] files = [ {file = "certifi-2024.7.4-py3-none-any.whl", hash = "sha256:c198e21b1289c2ab85ee4e67bb4b4ef3ead0892059901a8d5b622f24a1101e90"}, {file = "certifi-2024.7.4.tar.gz", hash = "sha256:5a1e7645bc0ec61a09e26c36f6106dd4cf40c6db3a1fb6352b0244e7fb057c7b"}, @@ -776,6 +796,7 @@ version = "1.17.1" description = "Foreign Function Interface for Python calling C code." optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "cffi-1.17.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:df8b1c11f177bc2313ec4b2d46baec87a5f3e71fc8b45dab2ee7cae86d9aba14"}, {file = "cffi-1.17.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8f2cdc858323644ab277e9bb925ad72ae0e67f69e804f4898c070998d50b1a67"}, @@ -855,6 +876,7 @@ version = "0.87.1" description = "Checks CloudFormation templates for practices and behaviour that could potentially be improved" optional = false python-versions = "!=4.0,<=4.0,>=3.8" +groups = ["main"] files = [ {file = "cfn_lint-0.87.1-py3-none-any.whl", hash = "sha256:d450f450635fc223b6f66880ccac52a5fd1a52966fa1705f1ba52b88dfed3071"}, {file = "cfn_lint-0.87.1.tar.gz", hash = "sha256:b3ce9d3e5e0eadcea5d584c8ccaa00bf2a990a36a64d7ffd8683bc60b7e4f06f"}, @@ -878,6 +900,7 @@ version = "2.1.0" description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." 
optional = false python-versions = ">=3.6.0" +groups = ["main"] files = [ {file = "charset-normalizer-2.1.0.tar.gz", hash = "sha256:575e708016ff3a5e3681541cb9d79312c416835686d054a23accb873b254f413"}, {file = "charset_normalizer-2.1.0-py3-none-any.whl", hash = "sha256:5189b6f22b01957427f35b6a08d9a0bc45b46d3788ef5a92e978433c7a35f8a5"}, @@ -892,6 +915,7 @@ version = "8.1.3" description = "Composable command line interface toolkit" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "click-8.1.3-py3-none-any.whl", hash = "sha256:bb4d8133cb15a609f44e8213d9b391b0809795062913b383c62be0ee95b1db48"}, {file = "click-8.1.3.tar.gz", hash = "sha256:7682dc8afb30297001674575ea00d1814d808d6a36af415a82bd481d37ba7b8e"}, @@ -906,6 +930,7 @@ version = "0.7.17" description = "ClickHouse Database Core Driver for Python, Pandas, and Superset" optional = false python-versions = "~=3.8" +groups = ["main"] files = [ {file = "clickhouse-connect-0.7.17.tar.gz", hash = "sha256:854f1f9f3e024e7f89ae5d57cd3289d7a4c3dc91a9f24c4d233014f0ea19cb2d"}, {file = "clickhouse_connect-0.7.17-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:aca36f5f28be1ada2981fce87724bbf451f267c918015baec59e527de3c9c882"}, @@ -996,6 +1021,8 @@ version = "0.4.5" description = "Cross-platform colored terminal text." optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +groups = ["main"] +markers = "sys_platform == \"win32\" or platform_system == \"Windows\"" files = [ {file = "colorama-0.4.5-py2.py3-none-any.whl", hash = "sha256:854bf444933e37f5824ae7bfc1e98d5bce2ebe4160d46b5edf346a89358e99da"}, {file = "colorama-0.4.5.tar.gz", hash = "sha256:e6c6b4334fc50988a639d9b98aa429a0b57da6e17b9a44f0451f930b6967b7a4"}, @@ -1007,6 +1034,7 @@ version = "43.0.1" description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "cryptography-43.0.1-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:8385d98f6a3bf8bb2d65a73e17ed87a3ba84f6991c155691c51112075f9ffc5d"}, {file = "cryptography-43.0.1-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:27e613d7077ac613e399270253259d9d53872aaf657471473ebfc9a52935c062"}, @@ -1056,6 +1084,7 @@ version = "7.1.0" description = "A Python library for the Docker Engine API." optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "docker-7.1.0-py3-none-any.whl", hash = "sha256:c96b93b7f0a746f9e77d325bcfb87422a3d8bd4f03136ae8a85b37f1898d5fc0"}, {file = "docker-7.1.0.tar.gz", hash = "sha256:ad8c70e6e3f8926cb8a92619b832b4ea5299e2831c14284663184e200546fa6c"}, @@ -1078,6 +1107,7 @@ version = "1.9.0" description = "execnet: rapid multi-Python deployment" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +groups = ["main"] files = [ {file = "execnet-1.9.0-py2.py3-none-any.whl", hash = "sha256:a295f7cc774947aac58dde7fdc85f4aa00c42adf5d8f5468fc630c1acf30a142"}, {file = "execnet-1.9.0.tar.gz", hash = "sha256:8f694f3ba9cc92cab508b152dcfe322153975c29bda272e2fd7f3f00f36e47c5"}, @@ -1092,6 +1122,7 @@ version = "2.2.5" description = "A simple framework for building complex web applications." 
optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "Flask-2.2.5-py3-none-any.whl", hash = "sha256:58107ed83443e86067e41eff4631b058178191a355886f8e479e347fa1285fdf"}, {file = "Flask-2.2.5.tar.gz", hash = "sha256:edee9b0a7ff26621bd5a8c10ff484ae28737a2410d99b0bb9a6850c7fb977aa0"}, @@ -1113,6 +1144,7 @@ version = "5.0.0" description = "A Flask extension adding a decorator for CORS support" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "Flask_Cors-5.0.0-py2.py3-none-any.whl", hash = "sha256:b9e307d082a9261c100d8fb0ba909eec6a228ed1b60a8315fd85f783d61910bc"}, {file = "flask_cors-5.0.0.tar.gz", hash = "sha256:5aadb4b950c4e93745034594d9f3ea6591f734bb3662e16e255ffbf5e89c88ef"}, @@ -1127,6 +1159,7 @@ version = "1.5.0" description = "A list-like structure which implements collections.abc.MutableSequence" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "frozenlist-1.5.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:5b6a66c18b5b9dd261ca98dffcb826a525334b2f29e7caa54e182255c5f6a65a"}, {file = "frozenlist-1.5.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d1b3eb7b05ea246510b43a7e53ed1653e55c2121019a97e60cad7efb881a97bb"}, @@ -1228,6 +1261,7 @@ version = "3.2.1" description = "GraphQL implementation for Python, a port of GraphQL.js, the JavaScript reference implementation for GraphQL." optional = false python-versions = ">=3.6,<4" +groups = ["main"] files = [ {file = "graphql-core-3.2.1.tar.gz", hash = "sha256:9d1bf141427b7d54be944587c8349df791ce60ade2e3cccaf9c56368c133c201"}, {file = "graphql_core-3.2.1-py3-none-any.whl", hash = "sha256:f83c658e4968998eed1923a2e3e3eddd347e005ac0315fbb7ca4d70ea9156323"}, @@ -1239,6 +1273,7 @@ version = "0.14.0" description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761"}, {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"}, @@ -1247,27 +1282,33 @@ files = [ [[package]] name = "h2" version = "4.1.0" -description = "HTTP/2 State-Machine based protocol implementation" +description = "Pure-Python HTTP/2 protocol implementation" optional = false -python-versions = ">=3.6.1" -files = [ - {file = "h2-4.1.0-py3-none-any.whl", hash = "sha256:03a46bcf682256c95b5fd9e9a99c1323584c3eec6440d379b9903d709476bc6d"}, - {file = "h2-4.1.0.tar.gz", hash = "sha256:a83aca08fbe7aacb79fec788c9c0bac936343560ed9ec18b82a13a12c28d2abb"}, -] +python-versions = ">=3.9" +groups = ["main"] +files = [] +develop = false [package.dependencies] -hpack = ">=4.0,<5" -hyperframe = ">=6.0,<7" +hpack = ">=4.1,<5" +hyperframe = ">=6.1,<7" + +[package.source] +type = "git" +url = "https://github.com/python-hyper/h2" +reference = "HEAD" +resolved_reference = "0b98b244b5fd1fe96100ac14905417a3b70a4286" [[package]] name = "hpack" -version = "4.0.0" -description = "Pure-Python HPACK header compression" +version = "4.1.0" +description = "Pure-Python HPACK header encoding" optional = false -python-versions = ">=3.6.1" +python-versions = ">=3.9" +groups = ["main"] files = [ - {file = "hpack-4.0.0-py3-none-any.whl", hash = "sha256:84a076fad3dc9a9f8063ccb8041ef100867b1878b25ef0ee63847a5d53818a6c"}, - {file = "hpack-4.0.0.tar.gz", hash = "sha256:fc41de0c63e687ebffde81187a948221294896f6bdc0ae2312708df339430095"}, + {file = "hpack-4.1.0-py3-none-any.whl", hash 
= "sha256:157ac792668d995c657d93111f46b4535ed114f0c9c8d672271bbec7eae1b496"}, + {file = "hpack-4.1.0.tar.gz", hash = "sha256:ec5eca154f7056aa06f196a557655c5b009b382873ac8d1e66e79e87535f1dca"}, ] [[package]] @@ -1276,6 +1317,7 @@ version = "1.0.3" description = "A minimal low-level HTTP client." optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "httpcore-1.0.3-py3-none-any.whl", hash = "sha256:9a6a501c3099307d9fd76ac244e08503427679b1e81ceb1d922485e2f2462ad2"}, {file = "httpcore-1.0.3.tar.gz", hash = "sha256:5c0f9546ad17dac4d0772b0808856eb616eb8b48ce94f49ed819fd6982a8a544"}, @@ -1297,6 +1339,7 @@ version = "0.26.0" description = "The next generation HTTP client." optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "httpx-0.26.0-py3-none-any.whl", hash = "sha256:8915f5a3627c4d47b73e8202457cb28f1266982d1159bd5779d86a80c0eab1cd"}, {file = "httpx-0.26.0.tar.gz", hash = "sha256:451b55c30d5185ea6b23c2c793abf9bb237d2a7dfb901ced6ff69ad37ec1dfaf"}, @@ -1318,13 +1361,14 @@ socks = ["socksio (==1.*)"] [[package]] name = "hyperframe" -version = "6.0.1" -description = "HTTP/2 framing layer for Python" +version = "6.1.0" +description = "Pure-Python HTTP/2 framing" optional = false -python-versions = ">=3.6.1" +python-versions = ">=3.9" +groups = ["main"] files = [ - {file = "hyperframe-6.0.1-py3-none-any.whl", hash = "sha256:0ec6bafd80d8ad2195c4f03aacba3a8265e57bc4cff261e802bf39970ed02a15"}, - {file = "hyperframe-6.0.1.tar.gz", hash = "sha256:ae510046231dc8e9ecb1a6586f63d2347bf4c8905914aa84ba585ae85f28a914"}, + {file = "hyperframe-6.1.0-py3-none-any.whl", hash = "sha256:b03380493a519fce58ea5af42e4a42317bf9bd425596f7a0835ffce80f1a42e5"}, + {file = "hyperframe-6.1.0.tar.gz", hash = "sha256:f630908a00854a7adeabd6382b43923a4c4cd4b821fcb527e6ab9e15382a3b08"}, ] [[package]] @@ -1333,6 +1377,7 @@ version = "3.7" description = "Internationalized Domain Names in Applications (IDNA)" optional = false python-versions = ">=3.5" +groups = ["main"] files = [ {file = "idna-3.7-py3-none-any.whl", hash = "sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0"}, {file = "idna-3.7.tar.gz", hash = "sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc"}, @@ -1344,6 +1389,7 @@ version = "1.1.1" description = "iniconfig: brain-dead simple config-ini parsing" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "iniconfig-1.1.1-py2.py3-none-any.whl", hash = "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3"}, {file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"}, @@ -1355,6 +1401,7 @@ version = "2.1.2" description = "Safely pass data to untrusted environments and back." optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "itsdangerous-2.1.2-py3-none-any.whl", hash = "sha256:2c2349112351b88699d8d4b6b075022c0808887cb7ad10069318a8b0bc88db44"}, {file = "itsdangerous-2.1.2.tar.gz", hash = "sha256:5dbbc68b317e5e42f327f9021763545dc3fc3bfe22e6deb96aaf1fc38874156a"}, @@ -1366,6 +1413,7 @@ version = "3.1.5" description = "A very fast and expressive template engine." 
optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "jinja2-3.1.5-py3-none-any.whl", hash = "sha256:aba0f4dc9ed8013c424088f68a5c226f7d6097ed89b246d7749c2ec4175c6adb"}, {file = "jinja2-3.1.5.tar.gz", hash = "sha256:8fefff8dc3034e27bb80d67c671eb8a9bc424c0ef4c0826edbff304cceff43bb"}, @@ -1383,6 +1431,7 @@ version = "1.0.1" description = "JSON Matching Expressions" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "jmespath-1.0.1-py3-none-any.whl", hash = "sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980"}, {file = "jmespath-1.0.1.tar.gz", hash = "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe"}, @@ -1394,6 +1443,7 @@ version = "0.9.0" description = "The ultimate Python library for JOSE RFCs, including JWS, JWE, JWK, JWA, JWT" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "joserfc-0.9.0-py3-none-any.whl", hash = "sha256:4026bdbe2c196cd40574e916fa1e28874d99649412edaab0e373dec3077153fb"}, {file = "joserfc-0.9.0.tar.gz", hash = "sha256:eebca7f587b1761ce43a98ffd5327f2b600b9aa5bb0a77b947687f503ad43bc0"}, @@ -1411,6 +1461,7 @@ version = "1.2.3" description = "Generate source code for Python classes from a JSON schema." optional = false python-versions = ">= 2.7" +groups = ["main"] files = [ {file = "jschema_to_python-1.2.3-py3-none-any.whl", hash = "sha256:8a703ca7604d42d74b2815eecf99a33359a8dccbb80806cce386d5e2dd992b05"}, {file = "jschema_to_python-1.2.3.tar.gz", hash = "sha256:76ff14fe5d304708ccad1284e4b11f96a658949a31ee7faed9e0995279549b91"}, @@ -1427,6 +1478,7 @@ version = "2.0.0" description = "Diff JSON and JSON-like structures in Python" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "jsondiff-2.0.0-py3-none-any.whl", hash = "sha256:689841d66273fc88fc79f7d33f4c074774f4f214b6466e3aff0e5adaf889d1e0"}, {file = "jsondiff-2.0.0.tar.gz", hash = "sha256:2795844ef075ec8a2b8d385c4d59f5ea48b08e7180fce3cb2787be0db00b1fb4"}, @@ -1438,6 +1490,8 @@ version = "0.20.0" description = "Python bindings for Jsonnet - The data templating language" optional = false python-versions = "*" +groups = ["main"] +markers = "python_version < \"3.13\"" files = [ {file = "jsonnet-0.20.0.tar.gz", hash = "sha256:7e770c7bf3a366b97b650a39430450f77612e74406731eb75c5bd59f3f104d4f"}, ] @@ -1448,6 +1502,7 @@ version = "1.32" description = "Apply JSON-Patches (RFC 6902)" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +groups = ["main"] files = [ {file = "jsonpatch-1.32-py2.py3-none-any.whl", hash = "sha256:26ac385719ac9f54df8a2f0827bb8253aa3ea8ab7b3368457bcdb8c14595a397"}, {file = "jsonpatch-1.32.tar.gz", hash = "sha256:b6ddfe6c3db30d81a96aaeceb6baf916094ffa23d7dd5fa2c13e13f8b6e600c2"}, @@ -1462,6 +1517,7 @@ version = "1.6.1" description = "A final implementation of JSONPath for Python that aims to be standard compliant, including arithmetic and binary comparison operators and providing clear AST for metaprogramming." 
optional = false python-versions = "*" +groups = ["main"] files = [ {file = "jsonpath-ng-1.6.1.tar.gz", hash = "sha256:086c37ba4917304850bd837aeab806670224d3f038fe2833ff593a672ef0a5fa"}, {file = "jsonpath_ng-1.6.1-py3-none-any.whl", hash = "sha256:8f22cd8273d7772eea9aaa84d922e0841aa36fdb8a2c6b7f6c3791a16a9bc0be"}, @@ -1476,6 +1532,7 @@ version = "2.2.0" description = "Python library for serializing any arbitrary object graph into JSON" optional = false python-versions = ">=2.7" +groups = ["main"] files = [ {file = "jsonpickle-2.2.0-py2.py3-none-any.whl", hash = "sha256:de7f2613818aa4f234138ca11243d6359ff83ae528b2185efdd474f62bcf9ae1"}, {file = "jsonpickle-2.2.0.tar.gz", hash = "sha256:7b272918b0554182e53dc340ddd62d9b7f902fec7e7b05620c04f3ccef479a0e"}, @@ -1492,6 +1549,7 @@ version = "2.3" description = "Identify specific nodes in a JSON document (RFC 6901)" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +groups = ["main"] files = [ {file = "jsonpointer-2.3-py2.py3-none-any.whl", hash = "sha256:51801e558539b4e9cd268638c078c6c5746c9ac96bc38152d443400e4f3793e9"}, {file = "jsonpointer-2.3.tar.gz", hash = "sha256:97cba51526c829282218feb99dab1b1e6bdf8efd1c43dc9d57be093c0d69c99a"}, @@ -1503,6 +1561,7 @@ version = "4.17.3" description = "An implementation of JSON Schema validation for Python" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "jsonschema-4.17.3-py3-none-any.whl", hash = "sha256:a870ad254da1a8ca84b6a2905cac29d265f805acc57af304784962a2aa6508f6"}, {file = "jsonschema-4.17.3.tar.gz", hash = "sha256:0f864437ab8b6076ba6707453ef8f98a6a0d512a80e93f8abdb676f737ecb60d"}, @@ -1522,6 +1581,7 @@ version = "0.1.6" description = "JSONSchema Spec with object-oriented paths" optional = false python-versions = ">=3.7.0,<4.0.0" +groups = ["main"] files = [ {file = "jsonschema_spec-0.1.6-py3-none-any.whl", hash = "sha256:f2206d18c89d1824c1f775ba14ed039743b41a9167bd2c5bdb774b66b3ca0bbf"}, {file = "jsonschema_spec-0.1.6.tar.gz", hash = "sha256:90215863b56e212086641956b20127ccbf6d8a3a38343dad01d6a74d19482f76"}, @@ -1539,6 +1599,7 @@ version = "1.9" description = "Creates JUnit XML test result documents that can be read by tools such as Jenkins" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "junit-xml-1.9.tar.gz", hash = "sha256:de16a051990d4e25a3982b2dd9e89d671067548718866416faec14d9de56db9f"}, {file = "junit_xml-1.9-py2.py3-none-any.whl", hash = "sha256:ec5ca1a55aefdd76d28fcc0b135251d156c7106fa979686a4b48d62b761b4732"}, @@ -1553,6 +1614,7 @@ version = "1.5.6" description = "Implementation of JOSE Web standards" optional = false python-versions = ">= 3.8" +groups = ["main"] files = [ {file = "jwcrypto-1.5.6-py3-none-any.whl", hash = "sha256:150d2b0ebbdb8f40b77f543fb44ffd2baeff48788be71f67f03566692fd55789"}, {file = "jwcrypto-1.5.6.tar.gz", hash = "sha256:771a87762a0c081ae6166958a954f80848820b2ab066937dc8b8379d65b1b039"}, @@ -1568,6 +1630,7 @@ version = "2.0.2" description = "Pure Python client for Apache Kafka" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "kafka-python-2.0.2.tar.gz", hash = "sha256:04dfe7fea2b63726cd6f3e79a2d86e709d608d74406638c5da33a01d45a9d7e3"}, {file = "kafka_python-2.0.2-py2.py3-none-any.whl", hash = "sha256:2d92418c7cb1c298fa6c7f0fb3519b520d0d7526ac6cb7ae2a4fc65a51a94b6e"}, @@ -1582,6 +1645,7 @@ version = "1.10.0" description = "A fast and thorough lazy object proxy." 
optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "lazy-object-proxy-1.10.0.tar.gz", hash = "sha256:78247b6d45f43a52ef35c25b5581459e85117225408a4128a3daf8bf9648ac69"}, {file = "lazy_object_proxy-1.10.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:855e068b0358ab916454464a884779c7ffa312b8925c6f7401e952dcf3b89977"}, @@ -1628,6 +1692,7 @@ version = "4.3.3" description = "LZ4 Bindings for Python" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "lz4-4.3.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b891880c187e96339474af2a3b2bfb11a8e4732ff5034be919aa9029484cd201"}, {file = "lz4-4.3.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:222a7e35137d7539c9c33bb53fcbb26510c5748779364014235afc62b0ec797f"}, @@ -1678,6 +1743,7 @@ version = "2.1.1" description = "Safely add untrusted strings to HTML/XML markup." optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "MarkupSafe-2.1.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:86b1f75c4e7c2ac2ccdaec2b9022845dbb81880ca318bb7a0a01fbf7813e3812"}, {file = "MarkupSafe-2.1.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:f121a1420d4e173a5d96e47e9a0c0dcff965afdf1626d28de1460815f7c4ee7a"}, @@ -1727,6 +1793,7 @@ version = "5.0.6" description = "" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "moto-5.0.6-py2.py3-none-any.whl", hash = "sha256:ca1e22831a741733b581ff2ef4d6ae2e1c6db1eab97af1b78b86ca2c6e88c609"}, {file = "moto-5.0.6.tar.gz", hash = "sha256:ad8b23f2b555ad694da8b2432a42b6d96beaaf67a4e7d932196a72193a2eee2c"}, @@ -1786,6 +1853,7 @@ version = "1.3.0" description = "Python library for arbitrary-precision floating-point arithmetic" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c"}, {file = "mpmath-1.3.0.tar.gz", hash = "sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f"}, @@ -1803,6 +1871,7 @@ version = "6.0.5" description = "multidict implementation" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "multidict-6.0.5-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:228b644ae063c10e7f324ab1ab6b548bdf6f8b47f3ec234fef1093bc2735e5f9"}, {file = "multidict-6.0.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:896ebdcf62683551312c30e20614305f53125750803b614e9e6ce74a96232604"}, @@ -1902,6 +1971,7 @@ version = "1.13.0" description = "Optional static typing for Python" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "mypy-1.13.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6607e0f1dd1fb7f0aca14d936d13fd19eba5e17e1cd2a14f808fa5f8f6d8f60a"}, {file = "mypy-1.13.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8a21be69bd26fa81b1f80a61ee7ab05b076c674d9b18fb56239d72e21d9f4c80"}, @@ -1954,6 +2024,7 @@ version = "1.26.0.post1" description = "Type annotations for boto3.S3 1.26.0 service generated with mypy-boto3-builder 7.11.10" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "mypy-boto3-s3-1.26.0.post1.tar.gz", hash = "sha256:6d7079f8c739dc993cbedad0736299c413b297814b73795a3855a79169ecc938"}, {file = "mypy_boto3_s3-1.26.0.post1-py3-none-any.whl", hash = "sha256:7de2792ff0cc541b84cd46ff3a6aa2b6e5f267217f2203f27f6e4016bddc644d"}, @@ -1968,6 +2039,7 @@ version = "1.0.0" description = "Type system extensions for programs checked with the mypy type checker." 
optional = false python-versions = ">=3.5" +groups = ["dev"] files = [ {file = "mypy_extensions-1.0.0-py3-none-any.whl", hash = "sha256:4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d"}, {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"}, @@ -1979,6 +2051,7 @@ version = "2.8.5" description = "Python package for creating and manipulating graphs and networks" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "networkx-2.8.5-py3-none-any.whl", hash = "sha256:a762f4b385692d9c3a6f2912d058d76d29a827deaedf9e63ed14d397b8030687"}, {file = "networkx-2.8.5.tar.gz", hash = "sha256:15a7b81a360791c458c55a417418ea136c13378cfdc06a2dcdc12bd2f9cf09c1"}, @@ -1997,6 +2070,7 @@ version = "0.4.4" description = "OpenAPI schema validation for Python" optional = false python-versions = ">=3.7.0,<4.0.0" +groups = ["main"] files = [ {file = "openapi_schema_validator-0.4.4-py3-none-any.whl", hash = "sha256:79f37f38ef9fd5206b924ed7a6f382cea7b649b3b56383c47f1906082b7b9015"}, {file = "openapi_schema_validator-0.4.4.tar.gz", hash = "sha256:c573e2be2c783abae56c5a1486ab716ca96e09d1c3eab56020d1dc680aa57bf8"}, @@ -2015,6 +2089,7 @@ version = "0.5.7" description = "OpenAPI 2.0 (aka Swagger) and OpenAPI 3 spec validator" optional = false python-versions = ">=3.7.0,<4.0.0" +groups = ["main"] files = [ {file = "openapi_spec_validator-0.5.7-py3-none-any.whl", hash = "sha256:8712d2879db7692974ef89c47a3ebfc79436442921ec3a826ac0ce80cde8c549"}, {file = "openapi_spec_validator-0.5.7.tar.gz", hash = "sha256:6c2d42180045a80fd6314de848b94310bdb0fa4949f4b099578b69f79d9fa5ac"}, @@ -2032,6 +2107,7 @@ version = "24.2" description = "Core utilities for Python packages" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "packaging-24.2-py3-none-any.whl", hash = "sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759"}, {file = "packaging-24.2.tar.gz", hash = "sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f"}, @@ -2043,6 +2119,7 @@ version = "0.4.3" description = "Object-oriented paths" optional = false python-versions = ">=3.7.0,<4.0.0" +groups = ["main"] files = [ {file = "pathable-0.4.3-py3-none-any.whl", hash = "sha256:cdd7b1f9d7d5c8b8d3315dbf5a86b2596053ae845f056f57d97c0eefff84da14"}, {file = "pathable-0.4.3.tar.gz", hash = "sha256:5c869d315be50776cc8a993f3af43e0c60dc01506b399643f919034ebf4cdcab"}, @@ -2054,6 +2131,7 @@ version = "5.9.0" description = "Python Build Reasonableness" optional = false python-versions = ">=2.6" +groups = ["main"] files = [ {file = "pbr-5.9.0-py2.py3-none-any.whl", hash = "sha256:e547125940bcc052856ded43be8e101f63828c2d94239ffbe2b327ba3d5ccf0a"}, {file = "pbr-5.9.0.tar.gz", hash = "sha256:e8dca2f4b43560edef58813969f52a56cef023146cbb8931626db80e6c1c4308"}, @@ -2065,6 +2143,7 @@ version = "1.0.0" description = "plugin and hook calling mechanisms for python" optional = false python-versions = ">=3.6" +groups = ["main"] files = [ {file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"}, {file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"}, @@ -2080,6 +2159,7 @@ version = "3.11" description = "Python Lex & Yacc" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "ply-3.11-py2.py3-none-any.whl", hash = "sha256:096f9b8350b65ebd2fd1346b12452efe5b9607f7482813ffca50c22722a807ce"}, 
{file = "ply-3.11.tar.gz", hash = "sha256:00c7c1aaa88358b9c765b6d3000c6eec0ba42abca5351b095321aef446081da3"}, @@ -2091,6 +2171,7 @@ version = "0.14.1" description = "Python client for the Prometheus monitoring system." optional = false python-versions = ">=3.6" +groups = ["main"] files = [ {file = "prometheus_client-0.14.1-py3-none-any.whl", hash = "sha256:522fded625282822a89e2773452f42df14b5a8e84a86433e3f8a189c1d54dc01"}, {file = "prometheus_client-0.14.1.tar.gz", hash = "sha256:5459c427624961076277fdc6dc50540e2bacb98eebde99886e59ec55ed92093a"}, @@ -2105,6 +2186,7 @@ version = "0.2.0" description = "Accelerated property cache" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "propcache-0.2.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:c5869b8fd70b81835a6f187c5fdbe67917a04d7e52b6e7cc4e5fe39d55c39d58"}, {file = "propcache-0.2.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:952e0d9d07609d9c5be361f33b0d6d650cd2bae393aabb11d9b719364521984b"}, @@ -2212,6 +2294,7 @@ version = "5.9.4" description = "Cross-platform lib for process and system monitoring in Python." optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +groups = ["main"] files = [ {file = "psutil-5.9.4-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:c1ca331af862803a42677c120aff8a814a804e09832f166f226bfd22b56feee8"}, {file = "psutil-5.9.4-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:68908971daf802203f3d37e78d3f8831b6d1014864d7a85937941bb35f09aefe"}, @@ -2238,6 +2321,7 @@ version = "2.9.10" description = "psycopg2 - Python-PostgreSQL Database Adapter" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "psycopg2-binary-2.9.10.tar.gz", hash = "sha256:4b3df0e6990aa98acda57d983942eff13d824135fe2250e6522edaa782a06de2"}, {file = "psycopg2_binary-2.9.10-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:0ea8e3d0ae83564f2fc554955d327fa081d065c8ca5cc6d2abb643e2c9c1200f"}, @@ -2286,6 +2370,7 @@ files = [ {file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:bb89f0a835bcfc1d42ccd5f41f04870c1b936d8507c6df12b7737febc40f0909"}, {file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:f0c2d907a1e102526dd2986df638343388b94c33860ff3bbe1384130828714b1"}, {file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f8157bed2f51db683f31306aa497311b560f2265998122abe1dce6428bd86567"}, + {file = "psycopg2_binary-2.9.10-cp313-cp313-win_amd64.whl", hash = "sha256:27422aa5f11fbcd9b18da48373eb67081243662f9b46e6fd07c3eb46e4535142"}, {file = "psycopg2_binary-2.9.10-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:eb09aa7f9cecb45027683bb55aebaaf45a0df8bf6de68801a6afdc7947bb09d4"}, {file = "psycopg2_binary-2.9.10-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b73d6d7f0ccdad7bc43e6d34273f70d587ef62f824d7261c4ae9b8b1b6af90e8"}, {file = "psycopg2_binary-2.9.10-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ce5ab4bf46a211a8e924d307c1b1fcda82368586a19d0a24f8ae166f5c784864"}, @@ -2314,6 +2399,7 @@ version = "0.5.4" description = "Pure Python PartiQL Parser" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "py_partiql_parser-0.5.4-py2.py3-none-any.whl", hash = "sha256:3dc4295a47da9587681a96b35c6e151886fdbd0a4acbe0d97c4c68e5f689d315"}, {file = "py_partiql_parser-0.5.4.tar.gz", hash = "sha256:72e043919538fa63edae72fb59afc7e3fd93adbde656718a7d2b4666f23dd114"}, @@ -2328,6 +2414,7 @@ version = "2.21" 
description = "C parser in Python" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +groups = ["main"] files = [ {file = "pycparser-2.21-py2.py3-none-any.whl", hash = "sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9"}, {file = "pycparser-2.21.tar.gz", hash = "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"}, @@ -2339,6 +2426,7 @@ version = "2.10.4" description = "Data validation using Python type hints" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "pydantic-2.10.4-py3-none-any.whl", hash = "sha256:597e135ea68be3a37552fb524bc7d0d66dcf93d395acd93a00682f1efcb8ee3d"}, {file = "pydantic-2.10.4.tar.gz", hash = "sha256:82f12e9723da6de4fe2ba888b5971157b3be7ad914267dea8f05f82b28254f06"}, @@ -2359,6 +2447,7 @@ version = "2.27.2" description = "Core functionality for Pydantic validation and serialization" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "pydantic_core-2.27.2-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:2d367ca20b2f14095a8f4fa1210f5a7b78b8a20009ecced6b12818f455b1e9fa"}, {file = "pydantic_core-2.27.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:491a2b73db93fab69731eaee494f320faa4e093dbed776be1a829c2eb222c34c"}, @@ -2471,6 +2560,7 @@ version = "2.4.0" description = "JSON Web Token implementation in Python" optional = false python-versions = ">=3.6" +groups = ["main"] files = [ {file = "PyJWT-2.4.0-py3-none-any.whl", hash = "sha256:72d1d253f32dbd4f5c88eaf1fdc62f3a19f676ccbadb9dbc5d07e951b2b26daf"}, {file = "PyJWT-2.4.0.tar.gz", hash = "sha256:d42908208c699b3b973cbeb01a969ba6a96c821eefb1c5bfe4c390c01d67abba"}, @@ -2491,6 +2581,7 @@ version = "3.0.9" description = "pyparsing module - Classes and methods to define and execute parsing grammars" optional = false python-versions = ">=3.6.8" +groups = ["main"] files = [ {file = "pyparsing-3.0.9-py3-none-any.whl", hash = "sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc"}, {file = "pyparsing-3.0.9.tar.gz", hash = "sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb"}, @@ -2505,6 +2596,7 @@ version = "0.18.1" description = "Persistent/Functional/Immutable data structures" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "pyrsistent-0.18.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:df46c854f490f81210870e509818b729db4488e1f30f2a1ce1698b2295a878d1"}, {file = "pyrsistent-0.18.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5d45866ececf4a5fff8742c25722da6d4c9e180daa7b405dc0a2a2790d668c26"}, @@ -2535,6 +2627,7 @@ version = "7.4.4" description = "pytest: simple powerful testing with Python" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "pytest-7.4.4-py3-none-any.whl", hash = "sha256:b090cdf5ed60bf4c45261be03239c2c1c22df034fbffe691abe93cd80cea01d8"}, {file = "pytest-7.4.4.tar.gz", hash = "sha256:2cf0005922c6ace4a3e2ec8b4080eb0d9753fdc93107415332f50ce9e7994280"}, @@ -2555,6 +2648,7 @@ version = "0.21.0" description = "Pytest support for asyncio" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "pytest-asyncio-0.21.0.tar.gz", hash = "sha256:2b38a496aef56f56b0e87557ec313e11e1ab9276fc3863f6a7be0f1d0e415e1b"}, {file = "pytest_asyncio-0.21.0-py3-none-any.whl", hash = "sha256:f2b3366b7cd501a4056858bd39349d5af19742aed2d81660b7998b6341c7eb9c"}, @@ -2573,6 +2667,7 @@ version = "1.0.8" description = "pytest-httpserver is a 
httpserver for pytest" optional = false python-versions = ">=3.8,<4.0" +groups = ["main"] files = [ {file = "pytest_httpserver-1.0.8-py3-none-any.whl", hash = "sha256:24cd3d9f6a0b927c7bfc400d0b3fda7442721b8267ce29942bf307b190f0bb09"}, {file = "pytest_httpserver-1.0.8.tar.gz", hash = "sha256:e052f69bc8a9073db02484681e8e47004dd1fb3763b0ae833bd899e5895c559a"}, @@ -2587,6 +2682,7 @@ version = "0.6.3" description = "It helps to use fixtures in pytest.mark.parametrize" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "pytest-lazy-fixture-0.6.3.tar.gz", hash = "sha256:0e7d0c7f74ba33e6e80905e9bfd81f9d15ef9a790de97993e34213deb5ad10ac"}, {file = "pytest_lazy_fixture-0.6.3-py3-none-any.whl", hash = "sha256:e0b379f38299ff27a653f03eaa69b08a6fd4484e46fd1c9907d984b9f9daeda6"}, @@ -2601,6 +2697,7 @@ version = "1.1.0" description = "pytest plugin to run your tests in a specific order" optional = false python-versions = ">=3.6" +groups = ["main"] files = [ {file = "pytest-order-1.1.0.tar.gz", hash = "sha256:139d25b30826b78eebb42722f747eab14c44b88059d7a71d4f79d14a057269a5"}, {file = "pytest_order-1.1.0-py3-none-any.whl", hash = "sha256:3b3730969c97900fa5cd31ecff80847680ed56b2490954565c14949ba60d9371"}, @@ -2615,6 +2712,7 @@ version = "0.9.3" description = "pytest plugin for repeating tests" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "pytest_repeat-0.9.3-py3-none-any.whl", hash = "sha256:26ab2df18226af9d5ce441c858f273121e92ff55f5bb311d25755b8d7abdd8ed"}, {file = "pytest_repeat-0.9.3.tar.gz", hash = "sha256:ffd3836dfcd67bb270bec648b330e20be37d2966448c4148c4092d1e8aba8185"}, @@ -2629,6 +2727,7 @@ version = "15.0" description = "pytest plugin to re-run tests to eliminate flaky failures" optional = false python-versions = ">=3.9" +groups = ["main"] files = [ {file = "pytest-rerunfailures-15.0.tar.gz", hash = "sha256:2d9ac7baf59f4c13ac730b47f6fa80e755d1ba0581da45ce30b72fb3542b4474"}, {file = "pytest_rerunfailures-15.0-py3-none-any.whl", hash = "sha256:dd150c4795c229ef44320adc9a0c0532c51b78bb7a6843a8c53556b9a611df1a"}, @@ -2644,6 +2743,7 @@ version = "0.8.1" description = "Pytest plugin which splits the test suite to equally sized sub suites based on test execution time." 
optional = false python-versions = ">=3.7.1,<4.0" +groups = ["main"] files = [ {file = "pytest_split-0.8.1-py3-none-any.whl", hash = "sha256:74b110ea091bd147cc1c5f9665a59506e5cedfa66f96a89fb03e4ab447c2c168"}, {file = "pytest_split-0.8.1.tar.gz", hash = "sha256:2d88bd3dc528689a7a3f58fc12ea165c3aa62e90795e420dfad920afe5612d6d"}, @@ -2658,6 +2758,7 @@ version = "2.1.0" description = "pytest plugin to abort hanging tests" optional = false python-versions = ">=3.6" +groups = ["main"] files = [ {file = "pytest-timeout-2.1.0.tar.gz", hash = "sha256:c07ca07404c612f8abbe22294b23c368e2e5104b521c1790195561f37e1ac3d9"}, {file = "pytest_timeout-2.1.0-py3-none-any.whl", hash = "sha256:f6f50101443ce70ad325ceb4473c4255e9d74e3c7cd0ef827309dfa4c0d975c6"}, @@ -2672,6 +2773,7 @@ version = "3.3.1" description = "pytest xdist plugin for distributed testing, most importantly across multiple CPUs" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "pytest-xdist-3.3.1.tar.gz", hash = "sha256:d5ee0520eb1b7bcca50a60a518ab7a7707992812c578198f8b44fdfac78e8c93"}, {file = "pytest_xdist-3.3.1-py3-none-any.whl", hash = "sha256:ff9daa7793569e6a68544850fd3927cd257cc03a7ef76c95e86915355e82b5f2"}, @@ -2692,6 +2794,7 @@ version = "2.8.2" description = "Extensions to the standard Python datetime module" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" +groups = ["main"] files = [ {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"}, {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"}, @@ -2706,6 +2809,7 @@ version = "1.0.1" description = "Read key-value pairs from a .env file and set them as environment variables" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "python-dotenv-1.0.1.tar.gz", hash = "sha256:e324ee90a023d808f1959c46bcbc04446a10ced277783dc6ee09987c37ec10ca"}, {file = "python_dotenv-1.0.1-py3-none-any.whl", hash = "sha256:f7b63ef50f1b690dddf550d03497b66d609393b40b564ed0d674909a68ebf16a"}, @@ -2720,6 +2824,7 @@ version = "2024.1" description = "World timezone definitions, modern and historical" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "pytz-2024.1-py2.py3-none-any.whl", hash = "sha256:328171f4e3623139da4983451950b28e95ac706e13f3f2630a879749e7a8b319"}, {file = "pytz-2024.1.tar.gz", hash = "sha256:2a29735ea9c18baf14b448846bde5a48030ed267578472d8955cd0e7443a9812"}, @@ -2731,6 +2836,8 @@ version = "308" description = "Python for Window Extensions" optional = false python-versions = "*" +groups = ["main"] +markers = "sys_platform == \"win32\"" files = [ {file = "pywin32-308-cp310-cp310-win32.whl", hash = "sha256:796ff4426437896550d2981b9c2ac0ffd75238ad9ea2d3bfa67a1abd546d262e"}, {file = "pywin32-308-cp310-cp310-win_amd64.whl", hash = "sha256:4fc888c59b3c0bef905ce7eb7e2106a07712015ea1c8234b703a088d46110e8e"}, @@ -2758,6 +2865,7 @@ version = "6.0.2" description = "YAML parser and emitter for Python" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "PyYAML-6.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0a9a2848a5b7feac301353437eb7d5957887edbf81d56e903999a75a3d743086"}, {file = "PyYAML-6.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:29717114e51c84ddfba879543fb232a6ed60086602313ca38cce623c1d62cfbf"}, @@ -2820,6 +2928,7 @@ version = "2024.4.28" description = "Alternative regular expression module, to replace re." 
optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "regex-2024.4.28-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:cd196d056b40af073d95a2879678585f0b74ad35190fac04ca67954c582c6b61"}, {file = "regex-2024.4.28-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8bb381f777351bd534462f63e1c6afb10a7caa9fa2a421ae22c26e796fe31b1f"}, @@ -2908,6 +3017,7 @@ version = "2.32.3" description = "Python HTTP for Humans." optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6"}, {file = "requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760"}, @@ -2929,6 +3039,7 @@ version = "0.25.3" description = "A utility library for mocking out the `requests` Python library." optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "responses-0.25.3-py3-none-any.whl", hash = "sha256:521efcbc82081ab8daa588e08f7e8a64ce79b91c39f6e62199b19159bea7dbcb"}, {file = "responses-0.25.3.tar.gz", hash = "sha256:617b9247abd9ae28313d57a75880422d55ec63c29d33d629697590a034358dba"}, @@ -2948,6 +3059,7 @@ version = "0.1.4" description = "A pure python RFC3339 validator" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +groups = ["main"] files = [ {file = "rfc3339_validator-0.1.4-py2.py3-none-any.whl", hash = "sha256:24f6ec1eda14ef823da9e36ec7113124b39c04d50a4d3d3a3c2859577e7791fa"}, {file = "rfc3339_validator-0.1.4.tar.gz", hash = "sha256:138a2abdf93304ad60530167e51d2dfb9549521a836871b88d7f4695d0022f6b"}, @@ -2962,6 +3074,7 @@ version = "0.7.0" description = "An extremely fast Python linter and code formatter, written in Rust." optional = false python-versions = ">=3.7" +groups = ["dev"] files = [ {file = "ruff-0.7.0-py3-none-linux_armv6l.whl", hash = "sha256:0cdf20c2b6ff98e37df47b2b0bd3a34aaa155f59a11182c1303cce79be715628"}, {file = "ruff-0.7.0-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:496494d350c7fdeb36ca4ef1c9f21d80d182423718782222c29b3e72b3512737"}, @@ -2989,6 +3102,7 @@ version = "0.10.0" description = "An Amazon S3 Transfer Manager" optional = false python-versions = ">= 3.8" +groups = ["main"] files = [ {file = "s3transfer-0.10.0-py3-none-any.whl", hash = "sha256:3cdb40f5cfa6966e812209d0994f2a4709b561c88e90cf00c2696d2df4e56b2e"}, {file = "s3transfer-0.10.0.tar.gz", hash = "sha256:d0c8bbf672d5eebbe4e57945e23b972d963f07d82f661cabf678a5c88831595b"}, @@ -3006,6 +3120,7 @@ version = "1.0.4" description = "Classes implementing the SARIF 2.1.0 object model." 
optional = false python-versions = ">= 2.7" +groups = ["main"] files = [ {file = "sarif_om-1.0.4-py3-none-any.whl", hash = "sha256:539ef47a662329b1c8502388ad92457425e95dc0aaaf995fe46f4984c4771911"}, {file = "sarif_om-1.0.4.tar.gz", hash = "sha256:cd5f416b3083e00d402a92e449a7ff67af46f11241073eea0461802a3b5aef98"}, @@ -3021,6 +3136,7 @@ version = "70.0.0" description = "Easily download, build, install, upgrade, and uninstall Python packages" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "setuptools-70.0.0-py3-none-any.whl", hash = "sha256:54faa7f2e8d2d11bcd2c07bed282eef1046b5c080d1c32add737d7b5817b1ad4"}, {file = "setuptools-70.0.0.tar.gz", hash = "sha256:f211a66637b8fa059bb28183da127d4e86396c991a942b028c6650d4319c3fd0"}, @@ -3036,6 +3152,7 @@ version = "1.16.0" description = "Python 2 and 3 compatibility utilities" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" +groups = ["main"] files = [ {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, @@ -3047,6 +3164,7 @@ version = "1.3.0" description = "Sniff out which async library your code is running under" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "sniffio-1.3.0-py3-none-any.whl", hash = "sha256:eecefdce1e5bbfb7ad2eeaabf7c1eeb404d7757c379bd1f7e5cce9d8bf425384"}, {file = "sniffio-1.3.0.tar.gz", hash = "sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101"}, @@ -3058,6 +3176,7 @@ version = "1.12" description = "Computer algebra system (CAS) in Python" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "sympy-1.12-py3-none-any.whl", hash = "sha256:c3588cd4295d0c0f603d0f2ae780587e64e2efeedb3521e46b9bb1d08d184fa5"}, {file = "sympy-1.12.tar.gz", hash = "sha256:ebf595c8dac3e0fdc4152c51878b498396ec7f30e7a914d6071e674d49420fb8"}, @@ -3072,6 +3191,7 @@ version = "4.9.0" description = "Python library for throwaway instances of anything that can run in a Docker container" optional = false python-versions = "<4.0,>=3.9" +groups = ["main"] files = [ {file = "testcontainers-4.9.0-py3-none-any.whl", hash = "sha256:c6fee929990972c40bf6b91b7072c94064ff3649b405a14fde0274c8b2479d32"}, {file = "testcontainers-4.9.0.tar.gz", hash = "sha256:2cd6af070109ff68c1ab5389dc89c86c2dc3ab30a21ca734b2cb8f0f80ad479e"}, @@ -3125,6 +3245,7 @@ version = "0.10.2" description = "Python Library for Tom's Obvious, Minimal Language" optional = false python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" +groups = ["main"] files = [ {file = "toml-0.10.2-py2.py3-none-any.whl", hash = "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b"}, {file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"}, @@ -3136,6 +3257,7 @@ version = "1.5.0.20240925" description = "Typing stubs for jwcrypto" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "types-jwcrypto-1.5.0.20240925.tar.gz", hash = "sha256:50e17b790378c96239344476c7bd13b52d0c7eeb6d16c2d53723e48cc6bbf4fe"}, {file = "types_jwcrypto-1.5.0.20240925-py3-none-any.whl", hash = "sha256:2d12a2d528240d326075e896aafec7056b9136bf3207fa6ccf3fcb8fbf9e11a1"}, @@ -3150,6 +3272,7 @@ version = "5.9.5.12" description = "Typing stubs for psutil" optional = false python-versions = "*" +groups = ["main"] files = [ {file = 
"types-psutil-5.9.5.12.tar.gz", hash = "sha256:61a91679d3fe737250013b624dca09375e7cc3ad77dcc734553746c429c02aca"}, {file = "types_psutil-5.9.5.12-py3-none-any.whl", hash = "sha256:e9a147b8561235c6afcce5aa1adb973fad9ab2c50cf89820697687f53510358f"}, @@ -3161,6 +3284,7 @@ version = "2.9.21.20241019" description = "Typing stubs for psycopg2" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "types-psycopg2-2.9.21.20241019.tar.gz", hash = "sha256:bca89b988d2ebd19bcd08b177d22a877ea8b841decb10ed130afcf39404612fa"}, {file = "types_psycopg2-2.9.21.20241019-py3-none-any.whl", hash = "sha256:44d091e67732d16a941baae48cd7b53bf91911bc36888652447cf1ef0c1fb3f6"}, @@ -3172,6 +3296,7 @@ version = "0.6.3.3" description = "Typing stubs for pytest-lazy-fixture" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "types-pytest-lazy-fixture-0.6.3.3.tar.gz", hash = "sha256:2ef79d66bcde0e50acdac8dc55074b9ae0d4cfaeabdd638f5522f4cac7c8a2c7"}, {file = "types_pytest_lazy_fixture-0.6.3.3-py3-none-any.whl", hash = "sha256:a56a55649147ff960ff79d4b2c781a4f769351abc1876873f3116d0bd0c96353"}, @@ -3183,6 +3308,7 @@ version = "6.0.12.20240917" description = "Typing stubs for PyYAML" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "types-PyYAML-6.0.12.20240917.tar.gz", hash = "sha256:d1405a86f9576682234ef83bcb4e6fff7c9305c8b1fbad5e0bcd4f7dbdc9c587"}, {file = "types_PyYAML-6.0.12.20240917-py3-none-any.whl", hash = "sha256:392b267f1c0fe6022952462bf5d6523f31e37f6cea49b14cee7ad634b6301570"}, @@ -3194,6 +3320,7 @@ version = "2.31.0.0" description = "Typing stubs for requests" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "types-requests-2.31.0.0.tar.gz", hash = "sha256:c1c29d20ab8d84dff468d7febfe8e0cb0b4664543221b386605e14672b44ea25"}, {file = "types_requests-2.31.0.0-py3-none-any.whl", hash = "sha256:7c5cea7940f8e92ec560bbc468f65bf684aa3dcf0554a6f8c4710f5f708dc598"}, @@ -3208,6 +3335,7 @@ version = "0.6.0.post3" description = "Type annotations and code completion for s3transfer" optional = false python-versions = ">=3.7,<4.0" +groups = ["main"] files = [ {file = "types-s3transfer-0.6.0.post3.tar.gz", hash = "sha256:92c3704e5d041202bfb5ddb79d083fd1a02de2c5dfec6a91576823e6b5c93993"}, {file = "types_s3transfer-0.6.0.post3-py3-none-any.whl", hash = "sha256:eedc5117275565b3c83662c0ccc81662a34da5dda8bd502b89d296b6d5cb091d"}, @@ -3219,6 +3347,7 @@ version = "0.10.8.6" description = "Typing stubs for toml" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "types-toml-0.10.8.6.tar.gz", hash = "sha256:6d3ac79e36c9ee593c5d4fb33a50cca0e3adceb6ef5cff8b8e5aef67b4c4aaf2"}, {file = "types_toml-0.10.8.6-py3-none-any.whl", hash = "sha256:de7b2bb1831d6f7a4b554671ffe5875e729753496961b3e9b202745e4955dafa"}, @@ -3230,6 +3359,7 @@ version = "1.26.17" description = "Typing stubs for urllib3" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "types-urllib3-1.26.17.tar.gz", hash = "sha256:73fd274524c3fc7cd8cd9ceb0cb67ed99b45f9cb2831013e46d50c1451044800"}, {file = "types_urllib3-1.26.17-py3-none-any.whl", hash = "sha256:0d027fcd27dbb3cb532453b4d977e05bc1e13aefd70519866af211b3003d895d"}, @@ -3241,6 +3371,7 @@ version = "4.12.2" description = "Backported and Experimental Type Hints for Python 3.8+" optional = false python-versions = ">=3.8" +groups = ["main", "dev"] files = [ {file = "typing_extensions-4.12.2-py3-none-any.whl", hash = 
"sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d"}, {file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"}, @@ -3252,6 +3383,7 @@ version = "1.26.19" description = "HTTP library with thread-safe connection pooling, file post, and more." optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" +groups = ["main"] files = [ {file = "urllib3-1.26.19-py2.py3-none-any.whl", hash = "sha256:37a0344459b199fce0e80b0d3569837ec6b6937435c5244e7fd73fa6006830f3"}, {file = "urllib3-1.26.19.tar.gz", hash = "sha256:3e3d753a8618b86d7de333b4223005f68720bcd6a7d2bcb9fbd2229ec7c1e429"}, @@ -3268,6 +3400,7 @@ version = "12.0" description = "An implementation of the WebSocket Protocol (RFC 6455 & 7692)" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "websockets-12.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:d554236b2a2006e0ce16315c16eaa0d628dab009c33b63ea03f41c6107958374"}, {file = "websockets-12.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2d225bb6886591b1746b17c0573e29804619c8f755b5598d875bb4235ea639be"}, @@ -3349,6 +3482,7 @@ version = "3.0.6" description = "The comprehensive WSGI web application library." optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "werkzeug-3.0.6-py3-none-any.whl", hash = "sha256:1bc0c2310d2fbb07b1dd1105eba2f7af72f322e1e455f2f93c993bee8c8a5f17"}, {file = "werkzeug-3.0.6.tar.gz", hash = "sha256:a8dd59d4de28ca70471a34cba79bed5f7ef2e036a76b3ab0835474246eb41f8d"}, @@ -3366,6 +3500,7 @@ version = "1.14.1" description = "Module for decorators, wrappers and monkey patching." optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7" +groups = ["main"] files = [ {file = "wrapt-1.14.1-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:1b376b3f4896e7930f1f772ac4b064ac12598d1c38d04907e696cc4d794b43d3"}, {file = "wrapt-1.14.1-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:903500616422a40a98a5a3c4ff4ed9d0066f3b4c951fa286018ecdf0750194ef"}, @@ -3386,6 +3521,16 @@ files = [ {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"}, {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"}, {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"}, + {file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"}, + {file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"}, + {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"}, + {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"}, + {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"}, + {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"}, + 
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"}, + {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"}, + {file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"}, + {file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"}, @@ -3439,6 +3584,7 @@ version = "0.13.0" description = "Makes working with XML feel like you are working with JSON" optional = false python-versions = ">=3.4" +groups = ["main"] files = [ {file = "xmltodict-0.13.0-py2.py3-none-any.whl", hash = "sha256:aa89e8fd76320154a40d19a0df04a4695fb9dc5ba977cbb68ab3e4eb225e7852"}, {file = "xmltodict-0.13.0.tar.gz", hash = "sha256:341595a488e3e01a85a9d8911d8912fd922ede5fecc4dce437eb4b6c8d037e56"}, @@ -3450,6 +3596,7 @@ version = "1.17.2" description = "Yet another URL library" optional = false python-versions = ">=3.9" +groups = ["main"] files = [ {file = "yarl-1.17.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:93771146ef048b34201bfa382c2bf74c524980870bb278e6df515efaf93699ff"}, {file = "yarl-1.17.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8281db240a1616af2f9c5f71d355057e73a1409c4648c8949901396dc0a3c151"}, @@ -3546,6 +3693,7 @@ version = "0.23.0" description = "Zstandard bindings for Python" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "zstandard-0.23.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:bf0a05b6059c0528477fba9054d09179beb63744355cab9f38059548fedd46a9"}, {file = "zstandard-0.23.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:fc9ca1c9718cb3b06634c7c8dec57d24e9438b2aa9a0f02b8bb36bf478538880"}, @@ -3653,6 +3801,6 @@ cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\ cffi = ["cffi (>=1.11)"] [metadata] -lock-version = "2.0" +lock-version = "2.1" python-versions = "^3.11" -content-hash = "e6904aca09abc6c805604b21a5702a97e0056406f9ec7469b091d35ee10a6b16" +content-hash = "4dc3165fe22c0e0f7a030ea0f8a680ae2ff74561d8658c393abbe9112caaf5d7" diff --git a/pyproject.toml b/pyproject.toml index 735d12d756..e299c421e9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,7 +43,7 @@ websockets = "^12.0" clickhouse-connect = "^0.7.16" kafka-python = "^2.0.2" jwcrypto = "^1.5.6" -h2 = "^4.1.0" +h2 = {git = "https://github.com/python-hyper/h2"} types-jwcrypto = "^1.5.0.20240925" pyyaml = "^6.0.2" types-pyyaml = "^6.0.12.20240917" @@ -94,6 +94,7 @@ target-version = "py311" extend-exclude = [ "vendor/", "target/", + "test_runner/stubs/", # Autogenerated by mypy's stubgen ] line-length = 100 # this setting is rather guidance, it won't fail if it can't make the shorter diff --git a/test_runner/stubs/h2/__init__.pyi b/test_runner/stubs/h2/__init__.pyi index e69de29bb2..bda5b5a7f4 100644 --- a/test_runner/stubs/h2/__init__.pyi +++ b/test_runner/stubs/h2/__init__.pyi @@ -0,0 +1 @@ +__version__: str diff --git 
a/test_runner/stubs/h2/config.pyi b/test_runner/stubs/h2/config.pyi index 710005db69..422344b981 100644 --- a/test_runner/stubs/h2/config.pyi +++ b/test_runner/stubs/h2/config.pyi @@ -1,11 +1,12 @@ from _typeshed import Incomplete +from typing import Any class _BooleanConfigOption: name: Incomplete attr_name: Incomplete - def __init__(self, name) -> None: ... - def __get__(self, instance, owner): ... - def __set__(self, instance, value) -> None: ... + def __init__(self, name: str) -> None: ... + def __get__(self, instance: Any, owner: Any) -> bool: ... + def __set__(self, instance: Any, value: bool) -> None: ... class DummyLogger: def __init__(self, *vargs) -> None: ... @@ -15,7 +16,7 @@ class DummyLogger: class OutputLogger: file: Incomplete trace_level: Incomplete - def __init__(self, file: Incomplete | None = ..., trace_level: bool = ...) -> None: ... + def __init__(self, file: Incomplete | None = None, trace_level: bool = False) -> None: ... def debug(self, fmtstr, *args) -> None: ... def trace(self, fmtstr, *args) -> None: ... @@ -23,20 +24,12 @@ class H2Configuration: client_side: Incomplete validate_outbound_headers: Incomplete normalize_outbound_headers: Incomplete + split_outbound_cookies: Incomplete validate_inbound_headers: Incomplete normalize_inbound_headers: Incomplete logger: Incomplete - def __init__( - self, - client_side: bool = ..., - header_encoding: Incomplete | None = ..., - validate_outbound_headers: bool = ..., - normalize_outbound_headers: bool = ..., - validate_inbound_headers: bool = ..., - normalize_inbound_headers: bool = ..., - logger: Incomplete | None = ..., - ) -> None: ... + def __init__(self, client_side: bool = True, header_encoding: bool | str | None = None, validate_outbound_headers: bool = True, normalize_outbound_headers: bool = True, split_outbound_cookies: bool = False, validate_inbound_headers: bool = True, normalize_inbound_headers: bool = True, logger: DummyLogger | OutputLogger | None = None) -> None: ... @property - def header_encoding(self): ... + def header_encoding(self) -> bool | str | None: ... @header_encoding.setter - def header_encoding(self, value) -> None: ... + def header_encoding(self, value: bool | str | None) -> None: ... 
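The regenerated `config.pyi` stub above gives `H2Configuration` concrete types, including the new `split_outbound_cookies` keyword and a `bool | str | None` typed `header_encoding`. A minimal illustrative sketch only, assuming the git-pinned h2 revision from the pyproject change above (not part of the patch itself):

```python
# Sketch of the H2Configuration surface described by config.pyi above.
from h2.config import H2Configuration

config = H2Configuration(
    client_side=True,
    header_encoding="utf-8",        # bool | str | None per the stub
    split_outbound_cookies=False,   # keyword surfaced by the regenerated stub
)
print(config.header_encoding)       # -> "utf-8"
```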
diff --git a/test_runner/stubs/h2/connection.pyi b/test_runner/stubs/h2/connection.pyi index 04be18ca74..f7ec78a997 100644 --- a/test_runner/stubs/h2/connection.pyi +++ b/test_runner/stubs/h2/connection.pyi @@ -1,72 +1,55 @@ -from enum import Enum, IntEnum - -from _typeshed import Incomplete - from .config import H2Configuration as H2Configuration from .errors import ErrorCodes as ErrorCodes -from .events import AlternativeServiceAvailable as AlternativeServiceAvailable -from .events import ConnectionTerminated as ConnectionTerminated -from .events import PingAckReceived as PingAckReceived -from .events import PingReceived as PingReceived -from .events import PriorityUpdated as PriorityUpdated -from .events import RemoteSettingsChanged as RemoteSettingsChanged -from .events import SettingsAcknowledged as SettingsAcknowledged -from .events import UnknownFrameReceived as UnknownFrameReceived -from .events import WindowUpdated as WindowUpdated -from .exceptions import DenialOfServiceError as DenialOfServiceError -from .exceptions import FlowControlError as FlowControlError -from .exceptions import FrameTooLargeError as FrameTooLargeError -from .exceptions import NoAvailableStreamIDError as NoAvailableStreamIDError -from .exceptions import NoSuchStreamError as NoSuchStreamError -from .exceptions import ProtocolError as ProtocolError -from .exceptions import RFC1122Error as RFC1122Error -from .exceptions import StreamClosedError as StreamClosedError -from .exceptions import StreamIDTooLowError as StreamIDTooLowError -from .exceptions import TooManyStreamsError as TooManyStreamsError +from .events import AlternativeServiceAvailable as AlternativeServiceAvailable, ConnectionTerminated as ConnectionTerminated, Event as Event, InformationalResponseReceived as InformationalResponseReceived, PingAckReceived as PingAckReceived, PingReceived as PingReceived, PriorityUpdated as PriorityUpdated, RemoteSettingsChanged as RemoteSettingsChanged, RequestReceived as RequestReceived, ResponseReceived as ResponseReceived, SettingsAcknowledged as SettingsAcknowledged, TrailersReceived as TrailersReceived, UnknownFrameReceived as UnknownFrameReceived, WindowUpdated as WindowUpdated +from .exceptions import DenialOfServiceError as DenialOfServiceError, FlowControlError as FlowControlError, FrameTooLargeError as FrameTooLargeError, NoAvailableStreamIDError as NoAvailableStreamIDError, NoSuchStreamError as NoSuchStreamError, ProtocolError as ProtocolError, RFC1122Error as RFC1122Error, StreamClosedError as StreamClosedError, StreamIDTooLowError as StreamIDTooLowError, TooManyStreamsError as TooManyStreamsError from .frame_buffer import FrameBuffer as FrameBuffer -from .settings import SettingCodes as SettingCodes -from .settings import Settings as Settings -from .stream import H2Stream as H2Stream -from .stream import StreamClosedBy as StreamClosedBy -from .utilities import guard_increment_window as guard_increment_window +from .settings import ChangedSetting as ChangedSetting, SettingCodes as SettingCodes, Settings as Settings +from .stream import H2Stream as H2Stream, StreamClosedBy as StreamClosedBy +from .utilities import SizeLimitDict as SizeLimitDict, guard_increment_window as guard_increment_window from .windows import WindowManager as WindowManager +from _typeshed import Incomplete +from collections.abc import Iterable +from enum import Enum, IntEnum +from hpack.struct import Header as Header, HeaderWeaklyTyped as HeaderWeaklyTyped +from hyperframe.frame import Frame as Frame +from typing import Any class 
ConnectionState(Enum): - IDLE: int - CLIENT_OPEN: int - SERVER_OPEN: int - CLOSED: int + IDLE = 0 + CLIENT_OPEN = 1 + SERVER_OPEN = 2 + CLOSED = 3 class ConnectionInputs(Enum): - SEND_HEADERS: int - SEND_PUSH_PROMISE: int - SEND_DATA: int - SEND_GOAWAY: int - SEND_WINDOW_UPDATE: int - SEND_PING: int - SEND_SETTINGS: int - SEND_RST_STREAM: int - SEND_PRIORITY: int - RECV_HEADERS: int - RECV_PUSH_PROMISE: int - RECV_DATA: int - RECV_GOAWAY: int - RECV_WINDOW_UPDATE: int - RECV_PING: int - RECV_SETTINGS: int - RECV_RST_STREAM: int - RECV_PRIORITY: int - SEND_ALTERNATIVE_SERVICE: int - RECV_ALTERNATIVE_SERVICE: int + SEND_HEADERS = 0 + SEND_PUSH_PROMISE = 1 + SEND_DATA = 2 + SEND_GOAWAY = 3 + SEND_WINDOW_UPDATE = 4 + SEND_PING = 5 + SEND_SETTINGS = 6 + SEND_RST_STREAM = 7 + SEND_PRIORITY = 8 + RECV_HEADERS = 9 + RECV_PUSH_PROMISE = 10 + RECV_DATA = 11 + RECV_GOAWAY = 12 + RECV_WINDOW_UPDATE = 13 + RECV_PING = 14 + RECV_SETTINGS = 15 + RECV_RST_STREAM = 16 + RECV_PRIORITY = 17 + SEND_ALTERNATIVE_SERVICE = 18 + RECV_ALTERNATIVE_SERVICE = 19 class AllowedStreamIDs(IntEnum): - EVEN: int - ODD: int + EVEN = 0 + ODD = 1 class H2ConnectionStateMachine: state: Incomplete def __init__(self) -> None: ... - def process_input(self, input_): ... + def process_input(self, input_: ConnectionInputs) -> list[Event]: ... class H2Connection: DEFAULT_MAX_OUTBOUND_FRAME_SIZE: int @@ -88,55 +71,30 @@ class H2Connection: max_outbound_frame_size: Incomplete max_inbound_frame_size: Incomplete incoming_buffer: Incomplete - def __init__(self, config: Incomplete | None = ...) -> None: ... + def __init__(self, config: H2Configuration | None = None) -> None: ... @property - def open_outbound_streams(self): ... + def open_outbound_streams(self) -> int: ... @property - def open_inbound_streams(self): ... + def open_inbound_streams(self) -> int: ... @property - def inbound_flow_control_window(self): ... + def inbound_flow_control_window(self) -> int: ... def initiate_connection(self) -> None: ... - def initiate_upgrade_connection(self, settings_header: Incomplete | None = ...): ... - def get_next_available_stream_id(self): ... - def send_headers( - self, - stream_id, - headers, - end_stream: bool = ..., - priority_weight: Incomplete | None = ..., - priority_depends_on: Incomplete | None = ..., - priority_exclusive: Incomplete | None = ..., - ) -> None: ... - def send_data( - self, stream_id, data, end_stream: bool = ..., pad_length: Incomplete | None = ... - ) -> None: ... - def end_stream(self, stream_id) -> None: ... - def increment_flow_control_window( - self, increment, stream_id: Incomplete | None = ... - ) -> None: ... - def push_stream(self, stream_id, promised_stream_id, request_headers) -> None: ... - def ping(self, opaque_data) -> None: ... - def reset_stream(self, stream_id, error_code: int = ...) -> None: ... - def close_connection( - self, - error_code: int = ..., - additional_data: Incomplete | None = ..., - last_stream_id: Incomplete | None = ..., - ) -> None: ... - def update_settings(self, new_settings) -> None: ... - def advertise_alternative_service( - self, field_value, origin: Incomplete | None = ..., stream_id: Incomplete | None = ... - ) -> None: ... - def prioritize( - self, - stream_id, - weight: Incomplete | None = ..., - depends_on: Incomplete | None = ..., - exclusive: Incomplete | None = ..., - ) -> None: ... - def local_flow_control_window(self, stream_id): ... - def remote_flow_control_window(self, stream_id): ... - def acknowledge_received_data(self, acknowledged_size, stream_id) -> None: ... 
- def data_to_send(self, amount: Incomplete | None = ...): ... + def initiate_upgrade_connection(self, settings_header: bytes | None = None) -> bytes | None: ... + def get_next_available_stream_id(self) -> int: ... + def send_headers(self, stream_id: int, headers: Iterable[HeaderWeaklyTyped], end_stream: bool = False, priority_weight: int | None = None, priority_depends_on: int | None = None, priority_exclusive: bool | None = None) -> None: ... + def send_data(self, stream_id: int, data: bytes | memoryview, end_stream: bool = False, pad_length: Any = None) -> None: ... + def end_stream(self, stream_id: int) -> None: ... + def increment_flow_control_window(self, increment: int, stream_id: int | None = None) -> None: ... + def push_stream(self, stream_id: int, promised_stream_id: int, request_headers: Iterable[HeaderWeaklyTyped]) -> None: ... + def ping(self, opaque_data: bytes | str) -> None: ... + def reset_stream(self, stream_id: int, error_code: ErrorCodes | int = 0) -> None: ... + def close_connection(self, error_code: ErrorCodes | int = 0, additional_data: bytes | None = None, last_stream_id: int | None = None) -> None: ... + def update_settings(self, new_settings: dict[SettingCodes | int, int]) -> None: ... + def advertise_alternative_service(self, field_value: bytes | str, origin: bytes | None = None, stream_id: int | None = None) -> None: ... + def prioritize(self, stream_id: int, weight: int | None = None, depends_on: int | None = None, exclusive: bool | None = None) -> None: ... + def local_flow_control_window(self, stream_id: int) -> int: ... + def remote_flow_control_window(self, stream_id: int) -> int: ... + def acknowledge_received_data(self, acknowledged_size: int, stream_id: int) -> None: ... + def data_to_send(self, amount: int | None = None) -> bytes: ... def clear_outbound_data_buffer(self) -> None: ... - def receive_data(self, data): ... + def receive_data(self, data: bytes) -> list[Event]: ... diff --git a/test_runner/stubs/h2/errors.pyi b/test_runner/stubs/h2/errors.pyi index b70c632f8c..7cf77bd833 100644 --- a/test_runner/stubs/h2/errors.pyi +++ b/test_runner/stubs/h2/errors.pyi @@ -1,17 +1,19 @@ import enum +__all__ = ['ErrorCodes'] + class ErrorCodes(enum.IntEnum): - NO_ERROR: int - PROTOCOL_ERROR: int - INTERNAL_ERROR: int - FLOW_CONTROL_ERROR: int - SETTINGS_TIMEOUT: int - STREAM_CLOSED: int - FRAME_SIZE_ERROR: int - REFUSED_STREAM: int - CANCEL: int - COMPRESSION_ERROR: int - CONNECT_ERROR: int - ENHANCE_YOUR_CALM: int - INADEQUATE_SECURITY: int - HTTP_1_1_REQUIRED: int + NO_ERROR = 0 + PROTOCOL_ERROR = 1 + INTERNAL_ERROR = 2 + FLOW_CONTROL_ERROR = 3 + SETTINGS_TIMEOUT = 4 + STREAM_CLOSED = 5 + FRAME_SIZE_ERROR = 6 + REFUSED_STREAM = 7 + CANCEL = 8 + COMPRESSION_ERROR = 9 + CONNECT_ERROR = 10 + ENHANCE_YOUR_CALM = 11 + INADEQUATE_SECURITY = 12 + HTTP_1_1_REQUIRED = 13 diff --git a/test_runner/stubs/h2/events.pyi b/test_runner/stubs/h2/events.pyi index 75d0a9e53b..a086db38b3 100644 --- a/test_runner/stubs/h2/events.pyi +++ b/test_runner/stubs/h2/events.pyi @@ -1,6 +1,8 @@ +from .errors import ErrorCodes as ErrorCodes +from .settings import ChangedSetting as ChangedSetting, SettingCodes as SettingCodes, Settings as Settings from _typeshed import Incomplete - -from .settings import ChangedSetting as ChangedSetting +from hpack import HeaderTuple as HeaderTuple +from hyperframe.frame import Frame as Frame class Event: ... @@ -53,7 +55,7 @@ class RemoteSettingsChanged(Event): changed_settings: Incomplete def __init__(self) -> None: ... 
@classmethod - def from_settings(cls, old_settings, new_settings): ... + def from_settings(cls, old_settings: Settings | dict[int, int], new_settings: dict[int, int]) -> RemoteSettingsChanged: ... class PingReceived(Event): ping_data: Incomplete diff --git a/test_runner/stubs/h2/exceptions.pyi b/test_runner/stubs/h2/exceptions.pyi index 82019d5ec1..7149b46521 100644 --- a/test_runner/stubs/h2/exceptions.pyi +++ b/test_runner/stubs/h2/exceptions.pyi @@ -1,3 +1,4 @@ +from .errors import ErrorCodes as ErrorCodes from _typeshed import Incomplete class H2Error(Exception): ... @@ -19,27 +20,27 @@ class FlowControlError(ProtocolError): class StreamIDTooLowError(ProtocolError): stream_id: Incomplete max_stream_id: Incomplete - def __init__(self, stream_id, max_stream_id) -> None: ... + def __init__(self, stream_id: int, max_stream_id: int) -> None: ... class NoAvailableStreamIDError(ProtocolError): ... class NoSuchStreamError(ProtocolError): stream_id: Incomplete - def __init__(self, stream_id) -> None: ... + def __init__(self, stream_id: int) -> None: ... class StreamClosedError(NoSuchStreamError): stream_id: Incomplete error_code: Incomplete - def __init__(self, stream_id) -> None: ... + def __init__(self, stream_id: int) -> None: ... class InvalidSettingsValueError(ProtocolError, ValueError): error_code: Incomplete - def __init__(self, msg, error_code) -> None: ... + def __init__(self, msg: str, error_code: ErrorCodes) -> None: ... class InvalidBodyLengthError(ProtocolError): expected_length: Incomplete actual_length: Incomplete - def __init__(self, expected, actual) -> None: ... + def __init__(self, expected: int, actual: int) -> None: ... class UnsupportedFrameError(ProtocolError): ... class RFC1122Error(H2Error): ... diff --git a/test_runner/stubs/h2/frame_buffer.pyi b/test_runner/stubs/h2/frame_buffer.pyi index f47adab704..90746f63c1 100644 --- a/test_runner/stubs/h2/frame_buffer.pyi +++ b/test_runner/stubs/h2/frame_buffer.pyi @@ -1,19 +1,12 @@ -from .exceptions import ( - FrameDataMissingError as FrameDataMissingError, -) -from .exceptions import ( - FrameTooLargeError as FrameTooLargeError, -) -from .exceptions import ( - ProtocolError as ProtocolError, -) +from .exceptions import FrameDataMissingError as FrameDataMissingError, FrameTooLargeError as FrameTooLargeError, ProtocolError as ProtocolError +from hyperframe.frame import Frame CONTINUATION_BACKLOG: int class FrameBuffer: data: bytes max_frame_size: int - def __init__(self, server: bool = ...) -> None: ... - def add_data(self, data) -> None: ... - def __iter__(self): ... - def __next__(self): ... + def __init__(self, server: bool = False) -> None: ... + def add_data(self, data: bytes) -> None: ... + def __iter__(self) -> FrameBuffer: ... + def __next__(self) -> Frame: ... 
diff --git a/test_runner/stubs/h2/settings.pyi b/test_runner/stubs/h2/settings.pyi index a352abe53e..c3920f9969 100644 --- a/test_runner/stubs/h2/settings.pyi +++ b/test_runner/stubs/h2/settings.pyi @@ -1,61 +1,59 @@ import enum -from collections.abc import MutableMapping -from typing import Any - +from .errors import ErrorCodes as ErrorCodes +from .exceptions import InvalidSettingsValueError as InvalidSettingsValueError from _typeshed import Incomplete -from h2.errors import ErrorCodes as ErrorCodes -from h2.exceptions import InvalidSettingsValueError as InvalidSettingsValueError +from collections.abc import Iterator, MutableMapping class SettingCodes(enum.IntEnum): - HEADER_TABLE_SIZE: Incomplete - ENABLE_PUSH: Incomplete - MAX_CONCURRENT_STREAMS: Incomplete - INITIAL_WINDOW_SIZE: Incomplete - MAX_FRAME_SIZE: Incomplete - MAX_HEADER_LIST_SIZE: Incomplete - ENABLE_CONNECT_PROTOCOL: Incomplete + HEADER_TABLE_SIZE = ... + ENABLE_PUSH = ... + MAX_CONCURRENT_STREAMS = ... + INITIAL_WINDOW_SIZE = ... + MAX_FRAME_SIZE = ... + MAX_HEADER_LIST_SIZE = ... + ENABLE_CONNECT_PROTOCOL = ... class ChangedSetting: setting: Incomplete original_value: Incomplete new_value: Incomplete - def __init__(self, setting, original_value, new_value) -> None: ... + def __init__(self, setting: SettingCodes | int, original_value: int | None, new_value: int) -> None: ... -class Settings(MutableMapping[str, Any]): - def __init__(self, client: bool = ..., initial_values: Incomplete | None = ...) -> None: ... - def acknowledge(self): ... +class Settings(MutableMapping[SettingCodes | int, int]): + def __init__(self, client: bool = True, initial_values: dict[SettingCodes, int] | None = None) -> None: ... + def acknowledge(self) -> dict[SettingCodes | int, ChangedSetting]: ... @property - def header_table_size(self): ... + def header_table_size(self) -> int: ... @header_table_size.setter - def header_table_size(self, value) -> None: ... + def header_table_size(self, value: int) -> None: ... @property - def enable_push(self): ... + def enable_push(self) -> int: ... @enable_push.setter - def enable_push(self, value) -> None: ... + def enable_push(self, value: int) -> None: ... @property - def initial_window_size(self): ... + def initial_window_size(self) -> int: ... @initial_window_size.setter - def initial_window_size(self, value) -> None: ... + def initial_window_size(self, value: int) -> None: ... @property - def max_frame_size(self): ... + def max_frame_size(self) -> int: ... @max_frame_size.setter - def max_frame_size(self, value) -> None: ... + def max_frame_size(self, value: int) -> None: ... @property - def max_concurrent_streams(self): ... + def max_concurrent_streams(self) -> int: ... @max_concurrent_streams.setter - def max_concurrent_streams(self, value) -> None: ... + def max_concurrent_streams(self, value: int) -> None: ... @property - def max_header_list_size(self): ... + def max_header_list_size(self) -> int | None: ... @max_header_list_size.setter - def max_header_list_size(self, value) -> None: ... + def max_header_list_size(self, value: int) -> None: ... @property - def enable_connect_protocol(self): ... + def enable_connect_protocol(self) -> int: ... @enable_connect_protocol.setter - def enable_connect_protocol(self, value) -> None: ... - def __getitem__(self, key): ... - def __setitem__(self, key, value) -> None: ... - def __delitem__(self, key) -> None: ... - def __iter__(self): ... + def enable_connect_protocol(self, value: int) -> None: ... + def __getitem__(self, key: SettingCodes | int) -> int: ... 
+ def __setitem__(self, key: SettingCodes | int, value: int) -> None: ... + def __delitem__(self, key: SettingCodes | int) -> None: ... + def __iter__(self) -> Iterator[SettingCodes | int]: ... def __len__(self) -> int: ... - def __eq__(self, other): ... - def __ne__(self, other): ... + def __eq__(self, other: object) -> bool: ... + def __ne__(self, other: object) -> bool: ... diff --git a/test_runner/stubs/h2/stream.pyi b/test_runner/stubs/h2/stream.pyi index d52ab8e72b..89171da981 100644 --- a/test_runner/stubs/h2/stream.pyi +++ b/test_runner/stubs/h2/stream.pyi @@ -1,114 +1,52 @@ -from enum import Enum, IntEnum - -from _typeshed import Incomplete - +from .config import H2Configuration as H2Configuration from .errors import ErrorCodes as ErrorCodes -from .events import ( - AlternativeServiceAvailable as AlternativeServiceAvailable, -) -from .events import ( - DataReceived as DataReceived, -) -from .events import ( - InformationalResponseReceived as InformationalResponseReceived, -) -from .events import ( - PushedStreamReceived as PushedStreamReceived, -) -from .events import ( - RequestReceived as RequestReceived, -) -from .events import ( - ResponseReceived as ResponseReceived, -) -from .events import ( - StreamEnded as StreamEnded, -) -from .events import ( - StreamReset as StreamReset, -) -from .events import ( - TrailersReceived as TrailersReceived, -) -from .events import ( - WindowUpdated as WindowUpdated, -) -from .exceptions import ( - FlowControlError as FlowControlError, -) -from .exceptions import ( - InvalidBodyLengthError as InvalidBodyLengthError, -) -from .exceptions import ( - ProtocolError as ProtocolError, -) -from .exceptions import ( - StreamClosedError as StreamClosedError, -) -from .utilities import ( - HeaderValidationFlags as HeaderValidationFlags, -) -from .utilities import ( - authority_from_headers as authority_from_headers, -) -from .utilities import ( - extract_method_header as extract_method_header, -) -from .utilities import ( - guard_increment_window as guard_increment_window, -) -from .utilities import ( - is_informational_response as is_informational_response, -) -from .utilities import ( - normalize_inbound_headers as normalize_inbound_headers, -) -from .utilities import ( - normalize_outbound_headers as normalize_outbound_headers, -) -from .utilities import ( - validate_headers as validate_headers, -) -from .utilities import ( - validate_outbound_headers as validate_outbound_headers, -) +from .events import AlternativeServiceAvailable as AlternativeServiceAvailable, DataReceived as DataReceived, Event as Event, InformationalResponseReceived as InformationalResponseReceived, PushedStreamReceived as PushedStreamReceived, RequestReceived as RequestReceived, ResponseReceived as ResponseReceived, StreamEnded as StreamEnded, StreamReset as StreamReset, TrailersReceived as TrailersReceived, WindowUpdated as WindowUpdated +from .exceptions import FlowControlError as FlowControlError, InvalidBodyLengthError as InvalidBodyLengthError, ProtocolError as ProtocolError, StreamClosedError as StreamClosedError +from .utilities import HeaderValidationFlags as HeaderValidationFlags, authority_from_headers as authority_from_headers, extract_method_header as extract_method_header, guard_increment_window as guard_increment_window, is_informational_response as is_informational_response, normalize_inbound_headers as normalize_inbound_headers, normalize_outbound_headers as normalize_outbound_headers, utf8_encode_headers as utf8_encode_headers, validate_headers as 
validate_headers, validate_outbound_headers as validate_outbound_headers from .windows import WindowManager as WindowManager +from _typeshed import Incomplete +from collections.abc import Iterable +from enum import Enum, IntEnum +from hpack.hpack import Encoder as Encoder +from hpack.struct import Header as Header, HeaderWeaklyTyped as HeaderWeaklyTyped +from hyperframe.frame import AltSvcFrame, ContinuationFrame, Frame as Frame, HeadersFrame, PushPromiseFrame, RstStreamFrame +from typing import Any class StreamState(IntEnum): - IDLE: int - RESERVED_REMOTE: int - RESERVED_LOCAL: int - OPEN: int - HALF_CLOSED_REMOTE: int - HALF_CLOSED_LOCAL: int - CLOSED: int + IDLE = 0 + RESERVED_REMOTE = 1 + RESERVED_LOCAL = 2 + OPEN = 3 + HALF_CLOSED_REMOTE = 4 + HALF_CLOSED_LOCAL = 5 + CLOSED = 6 class StreamInputs(Enum): - SEND_HEADERS: int - SEND_PUSH_PROMISE: int - SEND_RST_STREAM: int - SEND_DATA: int - SEND_WINDOW_UPDATE: int - SEND_END_STREAM: int - RECV_HEADERS: int - RECV_PUSH_PROMISE: int - RECV_RST_STREAM: int - RECV_DATA: int - RECV_WINDOW_UPDATE: int - RECV_END_STREAM: int - RECV_CONTINUATION: int - SEND_INFORMATIONAL_HEADERS: int - RECV_INFORMATIONAL_HEADERS: int - SEND_ALTERNATIVE_SERVICE: int - RECV_ALTERNATIVE_SERVICE: int - UPGRADE_CLIENT: int - UPGRADE_SERVER: int + SEND_HEADERS = 0 + SEND_PUSH_PROMISE = 1 + SEND_RST_STREAM = 2 + SEND_DATA = 3 + SEND_WINDOW_UPDATE = 4 + SEND_END_STREAM = 5 + RECV_HEADERS = 6 + RECV_PUSH_PROMISE = 7 + RECV_RST_STREAM = 8 + RECV_DATA = 9 + RECV_WINDOW_UPDATE = 10 + RECV_END_STREAM = 11 + RECV_CONTINUATION = 12 + SEND_INFORMATIONAL_HEADERS = 13 + RECV_INFORMATIONAL_HEADERS = 14 + SEND_ALTERNATIVE_SERVICE = 15 + RECV_ALTERNATIVE_SERVICE = 16 + UPGRADE_CLIENT = 17 + UPGRADE_SERVER = 18 class StreamClosedBy(Enum): - SEND_END_STREAM: int - RECV_END_STREAM: int - SEND_RST_STREAM: int - RECV_RST_STREAM: int + SEND_END_STREAM = 0 + RECV_END_STREAM = 1 + SEND_RST_STREAM = 2 + RECV_RST_STREAM = 3 STREAM_OPEN: Incomplete @@ -121,32 +59,32 @@ class H2StreamStateMachine: headers_received: Incomplete trailers_received: Incomplete stream_closed_by: Incomplete - def __init__(self, stream_id) -> None: ... - def process_input(self, input_): ... - def request_sent(self, previous_state): ... - def response_sent(self, previous_state): ... - def request_received(self, previous_state): ... - def response_received(self, previous_state): ... - def data_received(self, previous_state): ... - def window_updated(self, previous_state): ... - def stream_half_closed(self, previous_state): ... - def stream_ended(self, previous_state): ... - def stream_reset(self, previous_state): ... - def send_new_pushed_stream(self, previous_state): ... - def recv_new_pushed_stream(self, previous_state): ... - def send_push_promise(self, previous_state): ... - def recv_push_promise(self, previous_state): ... - def send_end_stream(self, previous_state) -> None: ... - def send_reset_stream(self, previous_state) -> None: ... - def reset_stream_on_error(self, previous_state) -> None: ... - def recv_on_closed_stream(self, previous_state) -> None: ... - def send_on_closed_stream(self, previous_state) -> None: ... - def recv_push_on_closed_stream(self, previous_state) -> None: ... - def send_push_on_closed_stream(self, previous_state) -> None: ... - def send_informational_response(self, previous_state): ... - def recv_informational_response(self, previous_state): ... - def recv_alt_svc(self, previous_state): ... - def send_alt_svc(self, previous_state) -> None: ... 
+ def __init__(self, stream_id: int) -> None: ... + def process_input(self, input_: StreamInputs) -> Any: ... + def request_sent(self, previous_state: StreamState) -> list[Event]: ... + def response_sent(self, previous_state: StreamState) -> list[Event]: ... + def request_received(self, previous_state: StreamState) -> list[Event]: ... + def response_received(self, previous_state: StreamState) -> list[Event]: ... + def data_received(self, previous_state: StreamState) -> list[Event]: ... + def window_updated(self, previous_state: StreamState) -> list[Event]: ... + def stream_half_closed(self, previous_state: StreamState) -> list[Event]: ... + def stream_ended(self, previous_state: StreamState) -> list[Event]: ... + def stream_reset(self, previous_state: StreamState) -> list[Event]: ... + def send_new_pushed_stream(self, previous_state: StreamState) -> list[Event]: ... + def recv_new_pushed_stream(self, previous_state: StreamState) -> list[Event]: ... + def send_push_promise(self, previous_state: StreamState) -> list[Event]: ... + def recv_push_promise(self, previous_state: StreamState) -> list[Event]: ... + def send_end_stream(self, previous_state: StreamState) -> None: ... + def send_reset_stream(self, previous_state: StreamState) -> None: ... + def reset_stream_on_error(self, previous_state: StreamState) -> None: ... + def recv_on_closed_stream(self, previous_state: StreamState) -> None: ... + def send_on_closed_stream(self, previous_state: StreamState) -> None: ... + def recv_push_on_closed_stream(self, previous_state: StreamState) -> None: ... + def send_push_on_closed_stream(self, previous_state: StreamState) -> None: ... + def send_informational_response(self, previous_state: StreamState) -> list[Event]: ... + def recv_informational_response(self, previous_state: StreamState) -> list[Event]: ... + def recv_alt_svc(self, previous_state: StreamState) -> list[Event]: ... + def send_alt_svc(self, previous_state: StreamState) -> None: ... class H2Stream: state_machine: Incomplete @@ -155,30 +93,30 @@ class H2Stream: request_method: Incomplete outbound_flow_control_window: Incomplete config: Incomplete - def __init__(self, stream_id, config, inbound_window_size, outbound_window_size) -> None: ... + def __init__(self, stream_id: int, config: H2Configuration, inbound_window_size: int, outbound_window_size: int) -> None: ... @property - def inbound_flow_control_window(self): ... + def inbound_flow_control_window(self) -> int: ... @property - def open(self): ... + def open(self) -> bool: ... @property - def closed(self): ... + def closed(self) -> bool: ... @property - def closed_by(self): ... - def upgrade(self, client_side) -> None: ... - def send_headers(self, headers, encoder, end_stream: bool = ...): ... - def push_stream_in_band(self, related_stream_id, headers, encoder): ... - def locally_pushed(self): ... - def send_data(self, data, end_stream: bool = ..., pad_length: Incomplete | None = ...): ... - def end_stream(self): ... - def advertise_alternative_service(self, field_value): ... - def increase_flow_control_window(self, increment): ... - def receive_push_promise_in_band(self, promised_stream_id, headers, header_encoding): ... - def remotely_pushed(self, pushed_headers): ... - def receive_headers(self, headers, end_stream, header_encoding): ... - def receive_data(self, data, end_stream, flow_control_len): ... - def receive_window_update(self, increment): ... + def closed_by(self) -> StreamClosedBy | None: ... + def upgrade(self, client_side: bool) -> None: ... 
+ def send_headers(self, headers: Iterable[HeaderWeaklyTyped], encoder: Encoder, end_stream: bool = False) -> list[HeadersFrame | ContinuationFrame | PushPromiseFrame]: ... + def push_stream_in_band(self, related_stream_id: int, headers: Iterable[HeaderWeaklyTyped], encoder: Encoder) -> list[HeadersFrame | ContinuationFrame | PushPromiseFrame]: ... + def locally_pushed(self) -> list[Frame]: ... + def send_data(self, data: bytes | memoryview, end_stream: bool = False, pad_length: int | None = None) -> list[Frame]: ... + def end_stream(self) -> list[Frame]: ... + def advertise_alternative_service(self, field_value: bytes) -> list[Frame]: ... + def increase_flow_control_window(self, increment: int) -> list[Frame]: ... + def receive_push_promise_in_band(self, promised_stream_id: int, headers: Iterable[Header], header_encoding: bool | str | None) -> tuple[list[Frame], list[Event]]: ... + def remotely_pushed(self, pushed_headers: Iterable[Header]) -> tuple[list[Frame], list[Event]]: ... + def receive_headers(self, headers: Iterable[Header], end_stream: bool, header_encoding: bool | str | None) -> tuple[list[Frame], list[Event]]: ... + def receive_data(self, data: bytes, end_stream: bool, flow_control_len: int) -> tuple[list[Frame], list[Event]]: ... + def receive_window_update(self, increment: int) -> tuple[list[Frame], list[Event]]: ... def receive_continuation(self) -> None: ... - def receive_alt_svc(self, frame): ... - def reset_stream(self, error_code: int = ...): ... - def stream_reset(self, frame): ... - def acknowledge_received_data(self, acknowledged_size): ... + def receive_alt_svc(self, frame: AltSvcFrame) -> tuple[list[Frame], list[Event]]: ... + def reset_stream(self, error_code: ErrorCodes | int = 0) -> list[Frame]: ... + def stream_reset(self, frame: RstStreamFrame) -> tuple[list[Frame], list[Event]]: ... + def acknowledge_received_data(self, acknowledged_size: int) -> list[Frame]: ... diff --git a/test_runner/stubs/h2/utilities.pyi b/test_runner/stubs/h2/utilities.pyi index e0a8d55d1d..8802087e4c 100644 --- a/test_runner/stubs/h2/utilities.pyi +++ b/test_runner/stubs/h2/utilities.pyi @@ -1,25 +1,32 @@ -from typing import NamedTuple - +import collections +from .exceptions import FlowControlError as FlowControlError, ProtocolError as ProtocolError from _typeshed import Incomplete - -from .exceptions import FlowControlError as FlowControlError -from .exceptions import ProtocolError as ProtocolError +from collections.abc import Generator, Iterable +from hpack.struct import Header as Header, HeaderWeaklyTyped as HeaderWeaklyTyped +from typing import Any, NamedTuple UPPER_RE: Incomplete +SIGIL: Incomplete +INFORMATIONAL_START: Incomplete CONNECTION_HEADERS: Incomplete -def extract_method_header(headers): ... -def is_informational_response(headers): ... -def guard_increment_window(current, increment): ... -def authority_from_headers(headers): ... +def extract_method_header(headers: Iterable[Header]) -> bytes | None: ... +def is_informational_response(headers: Iterable[Header]) -> bool: ... +def guard_increment_window(current: int, increment: int) -> int: ... +def authority_from_headers(headers: Iterable[Header]) -> bytes | None: ... class HeaderValidationFlags(NamedTuple): - is_client: Incomplete - is_trailer: Incomplete - is_response_header: Incomplete - is_push_promise: Incomplete + is_client: bool + is_trailer: bool + is_response_header: bool + is_push_promise: bool -def validate_headers(headers, hdr_validation_flags): ... 
-def normalize_outbound_headers(headers, hdr_validation_flags): ... -def normalize_inbound_headers(headers, hdr_validation_flags): ... -def validate_outbound_headers(headers, hdr_validation_flags): ... +def validate_headers(headers: Iterable[Header], hdr_validation_flags: HeaderValidationFlags) -> Iterable[Header]: ... +def utf8_encode_headers(headers: Iterable[HeaderWeaklyTyped]) -> list[Header]: ... +def normalize_outbound_headers(headers: Iterable[Header], hdr_validation_flags: HeaderValidationFlags | None, should_split_outbound_cookies: bool = False) -> Generator[Header, None, None]: ... +def normalize_inbound_headers(headers: Iterable[Header], hdr_validation_flags: HeaderValidationFlags) -> Generator[Header, None, None]: ... +def validate_outbound_headers(headers: Iterable[Header], hdr_validation_flags: HeaderValidationFlags) -> Generator[Header, None, None]: ... + +class SizeLimitDict(collections.OrderedDict[int, Any]): + def __init__(self, *args: dict[int, int], **kwargs: Any) -> None: ... + def __setitem__(self, key: int, value: Any | int) -> None: ... diff --git a/test_runner/stubs/h2/windows.pyi b/test_runner/stubs/h2/windows.pyi index 7dc78e431c..b132ee610c 100644 --- a/test_runner/stubs/h2/windows.pyi +++ b/test_runner/stubs/h2/windows.pyi @@ -1,13 +1,12 @@ -from _typeshed import Incomplete - from .exceptions import FlowControlError as FlowControlError +from _typeshed import Incomplete LARGEST_FLOW_CONTROL_WINDOW: Incomplete class WindowManager: max_window_size: Incomplete current_window_size: Incomplete - def __init__(self, max_window_size) -> None: ... - def window_consumed(self, size) -> None: ... - def window_opened(self, size) -> None: ... - def process_bytes(self, size): ... + def __init__(self, max_window_size: int) -> None: ... + def window_consumed(self, size: int) -> None: ... + def window_opened(self, size: int) -> None: ... + def process_bytes(self, size: int) -> int | None: ... From 9f1408fdf3c4cc79ad3e1810c4e4c76a80a695a9 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 24 Jan 2025 16:57:32 +0200 Subject: [PATCH 261/583] Do not assign max(lsn) to maxLastWrittenLsn in SetLastWrittenLSNForBlockv (#10474) ## Problem See https://github.com/neondatabase/neon/issues/10281 `SetLastWrittenLSNForBlockv` is assigning max(lsn) to `maxLastWrittenLsn`, while it should contain only the max LSN of blocks not present in the LwLSN cache. This causes unnecessary waits in the pageserver (PS). ## Summary of changes Restore the status quo for pg17.
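For intuition, here is a minimal sketch of the rule described above. It is written in Rust purely for illustration: the real fix lives in the C code of the Postgres fork (see the PR link below), and the `LwLsnCache` type, its fields, and the method name are hypothetical. Blocks that already have their own entry in the last-written-LSN cache keep the new LSN in that entry; only blocks without an entry may raise the global `maxLastWrittenLsn` fallback.

```rust
use std::collections::HashMap;

/// Hypothetical, simplified stand-in for the last-written-LSN (LwLSN) cache.
struct LwLsnCache {
    /// Per-block last-written LSNs tracked by the cache.
    per_block: HashMap<u32, u64>,
    /// Fallback returned for blocks that have no cache entry.
    max_last_written_lsn: u64,
}

impl LwLsnCache {
    /// Record `lsn` as the last-written LSN for a vector of blocks.
    fn set_last_written_lsn_for_blocks(&mut self, blocks: &[u32], lsn: u64) {
        for &blkno in blocks {
            match self.per_block.get_mut(&blkno) {
                // Tracked block: its own cache entry carries the LSN.
                Some(entry) => *entry = (*entry).max(lsn),
                // Untracked block: only these may raise the global fallback.
                // Raising it for tracked blocks too (the old behaviour) makes
                // reads of unrelated pages wait for a needlessly recent LSN.
                None => {
                    self.max_last_written_lsn = self.max_last_written_lsn.max(lsn);
                }
            }
        }
    }
}
```

The sketch mirrors only the decision the commit message describes; eviction and sizing details of the real cache are left out.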
Related Postgres PR: https://github.com/neondatabase/postgres/pull/563 --------- Co-authored-by: Konstantin Knizhnik --- vendor/postgres-v17 | 2 +- vendor/revisions.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index 4276717f6e..46f9b96555 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit 4276717f6e91023e504de355f4f21d4824074de8 +Subproject commit 46f9b96555e084c35dd975da9485996db9e86181 diff --git a/vendor/revisions.json b/vendor/revisions.json index dba0e67fb4..3aa42d22c5 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,7 +1,7 @@ { "v17": [ "17.2", - "4276717f6e91023e504de355f4f21d4824074de8" + "46f9b96555e084c35dd975da9485996db9e86181" ], "v16": [ "16.6", From be718ed12194421b9867f1d8357fd60b6cec2da6 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Sat, 25 Jan 2025 17:51:54 +0100 Subject: [PATCH 262/583] pageserver: disable L0 flush stalls, tune delay threshold (#10507) ## Problem In ingest benchmarks, we see L0 compaction delays of over 10 minutes due to image compaction. We can't stall L0 flushes for that long. ## Summary of changes Disable L0 flush stalls, and bump the default L0 flush delay threshold from 20 to 30 L0 layers. --- pageserver/src/tenant/timeline.rs | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index ee43512501..07689b5e76 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2168,8 +2168,8 @@ impl Timeline { } fn get_l0_flush_delay_threshold(&self) -> Option { - // Default to delay L0 flushes at 2x compaction threshold. - const DEFAULT_L0_FLUSH_DELAY_FACTOR: usize = 2; + // Default to delay L0 flushes at 3x compaction threshold. + const DEFAULT_L0_FLUSH_DELAY_FACTOR: usize = 3; // If compaction is disabled, don't delay. if self.get_compaction_period() == Duration::ZERO { @@ -2197,8 +2197,10 @@ impl Timeline { } fn get_l0_flush_stall_threshold(&self) -> Option { - // Default to stall L0 flushes at 4x compaction threshold. - const DEFAULT_L0_FLUSH_STALL_FACTOR: usize = 4; + // Default to stall L0 flushes at 5x compaction threshold. + // TODO: stalls are temporarily disabled by default, see below. + #[allow(unused)] + const DEFAULT_L0_FLUSH_STALL_FACTOR: usize = 5; // If compaction is disabled, don't stall. if self.get_compaction_period() == Duration::ZERO { @@ -2230,8 +2232,13 @@ impl Timeline { return None; } - let l0_flush_stall_threshold = l0_flush_stall_threshold - .unwrap_or(DEFAULT_L0_FLUSH_STALL_FACTOR * compaction_threshold); + // Disable stalls by default. In ingest benchmarks, we see image compaction take >10 + // minutes, blocking L0 compaction, and we can't stall L0 flushes for that long. + // + // TODO: fix this. + // let l0_flush_stall_threshold = l0_flush_stall_threshold + // .unwrap_or(DEFAULT_L0_FLUSH_STALL_FACTOR * compaction_threshold); + let l0_flush_stall_threshold = l0_flush_stall_threshold?; // 0 disables backpressure. 
if l0_flush_stall_threshold == 0 { From 4dd4096f11b4dd214b143c1a4bda739a37a9abdb Mon Sep 17 00:00:00 2001 From: Mikhail Kot Date: Mon, 27 Jan 2025 14:09:21 +0000 Subject: [PATCH 263/583] Pgbouncer exporter in compute image (#10503) https://github.com/neondatabase/cloud/issues/19081 Include pgbouncer_exporter in compute image and run it at port 9127 --- compute/compute-node.Dockerfile | 4 +++- compute/etc/pgbouncer.ini | 2 ++ compute/vm-image-spec-bookworm.yaml | 4 ++++ compute/vm-image-spec-bullseye.yaml | 4 ++++ 4 files changed, 13 insertions(+), 1 deletion(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index a80c701b45..539135470e 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1266,11 +1266,12 @@ RUN set -e \ ######################################################################################### # -# Layers "postgres-exporter" and "sql-exporter" +# Layers "postgres-exporter", "pgbouncer-exporter", and "sql-exporter" # ######################################################################################### FROM quay.io/prometheuscommunity/postgres-exporter:v0.16.0 AS postgres-exporter +FROM quay.io/prometheuscommunity/pgbouncer-exporter:v0.10.2 AS pgbouncer-exporter # Keep the version the same as in build-tools.Dockerfile and # test_runner/regress/test_compute_metrics.py. @@ -1402,6 +1403,7 @@ RUN mkdir -p /etc/local_proxy && chown postgres:postgres /etc/local_proxy # Metrics exporter binaries and configuration files COPY --from=postgres-exporter /bin/postgres_exporter /bin/postgres_exporter +COPY --from=pgbouncer-exporter /bin/pgbouncer_exporter /bin/pgbouncer_exporter COPY --from=sql-exporter /bin/sql_exporter /bin/sql_exporter COPY --chown=postgres compute/etc/postgres_exporter.yml /etc/postgres_exporter.yml diff --git a/compute/etc/pgbouncer.ini b/compute/etc/pgbouncer.ini index 604b4e41ea..9d68cbb8d5 100644 --- a/compute/etc/pgbouncer.ini +++ b/compute/etc/pgbouncer.ini @@ -19,6 +19,8 @@ max_prepared_statements=0 admin_users=postgres unix_socket_dir=/tmp/ unix_socket_mode=0777 +; required for pgbouncer_exporter +ignore_startup_parameters=extra_float_digits ;; Disable connection logging. It produces a lot of logs that no one looks at, ;; and we can get similar log entries from the proxy too. 
We had incidents in diff --git a/compute/vm-image-spec-bookworm.yaml b/compute/vm-image-spec-bookworm.yaml index ac9f5c6904..005143fff3 100644 --- a/compute/vm-image-spec-bookworm.yaml +++ b/compute/vm-image-spec-bookworm.yaml @@ -27,6 +27,10 @@ commands: user: nobody sysvInitAction: respawn shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter --config.file=/etc/postgres_exporter.yml' + - name: pgbouncer-exporter + user: postgres + sysvInitAction: respawn + shell: '/bin/pgbouncer_exporter --pgBouncer.connectionString="postgres:///pgbouncer?host=/tmp&port=6432&dbname=pgbouncer&user=pgbouncer"' - name: sql-exporter user: nobody sysvInitAction: respawn diff --git a/compute/vm-image-spec-bullseye.yaml b/compute/vm-image-spec-bullseye.yaml index 0d178e1c24..2fe50c3a45 100644 --- a/compute/vm-image-spec-bullseye.yaml +++ b/compute/vm-image-spec-bullseye.yaml @@ -27,6 +27,10 @@ commands: user: nobody sysvInitAction: respawn shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter --config.file=/etc/postgres_exporter.yml' + - name: pgbouncer-exporter + user: postgres + sysvInitAction: respawn + shell: '/bin/pgbouncer_exporter --pgBouncer.connectionString="postgres:///pgbouncer?host=/tmp&port=6432&dbname=pgbouncer&user=pgbouncer"' - name: sql-exporter user: nobody sysvInitAction: respawn From b0b4b7dd8f0dc6a6e780b7ed880b26f70d9016a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 27 Jan 2025 15:25:11 +0100 Subject: [PATCH 264/583] storcon: switch to diesel-async and tokio-postgres (#10280) Switches the storcon away from using diesel's synchronous APIs in favour of `diesel-async`. Advantages: * less C dependencies, especially no openssl, which might be behind the bug: https://github.com/neondatabase/cloud/issues/21010 * Better to only have async than mix of async plus `spawn_blocking` We had to turn off usage of the connection pool for migrations, as diesel migrations don't support async APIs. Thus we still use `spawn_blocking` in that one place. But this is explicitly done in one of the `diesel-async` examples. 
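For reference, the general shape of the new query path looks roughly like the sketch below. This is a hedged illustration rather than the controller's actual code: the pool size, the database URL, and the `SELECT 1` query are placeholders, and only APIs that the diff itself already uses (`AsyncDieselConnectionManager`, the bb8 `Pool` builder, `diesel_async::RunQueryDsl`) appear here.

```rust
use diesel_async::pooled_connection::bb8::Pool;
use diesel_async::pooled_connection::AsyncDieselConnectionManager;
use diesel_async::{AsyncPgConnection, RunQueryDsl};

#[tokio::main]
async fn main() {
    // Placeholder URL; the storage controller gets its database URL from config.
    let database_url = "postgres://localhost/storage_controller";

    // A bb8 pool of pure-Rust async connections (tokio-postgres underneath),
    // so no libpq/openssl C dependencies and no spawn_blocking on the query path.
    let manager = AsyncDieselConnectionManager::<AsyncPgConnection>::new(database_url);
    let pool = Pool::builder()
        .max_size(4)
        .build(manager)
        .await
        .expect("could not build connection pool");

    // Checking out a connection and running a query are both plain awaits.
    let mut conn = pool.get().await.expect("could not get a pooled connection");
    diesel::sql_query("SELECT 1")
        .execute(&mut conn)
        .await
        .expect("query failed");
}
```

The one place that still hops to a blocking thread is `migration_run`: `diesel_migrations` only offers a synchronous harness, so the diff below wraps a dedicated connection in `AsyncConnectionWrapper` and drives it under `tokio::task::spawn_blocking`.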
--- Cargo.lock | 159 ++++-- Makefile | 2 - storage_controller/Cargo.toml | 5 +- storage_controller/src/main.rs | 2 +- storage_controller/src/persistence.rs | 783 +++++++++++++++----------- 5 files changed, 561 insertions(+), 390 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1f090a27e4..a201a6abae 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -942,6 +942,18 @@ version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b" +[[package]] +name = "bb8" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d89aabfae550a5c44b43ab941844ffcd2e993cb6900b342debf59e9ea74acdb8" +dependencies = [ + "async-trait", + "futures-util", + "parking_lot 0.12.1", + "tokio", +] + [[package]] name = "bcder" version = "0.7.4" @@ -1301,7 +1313,7 @@ dependencies = [ "tar", "thiserror", "tokio", - "tokio-postgres", + "tokio-postgres 0.7.7", "tokio-stream", "tokio-util", "tower 0.5.2", @@ -1410,7 +1422,7 @@ dependencies = [ "storage_broker", "thiserror", "tokio", - "tokio-postgres", + "tokio-postgres 0.7.7", "tokio-util", "toml", "toml_edit", @@ -1786,11 +1798,24 @@ dependencies = [ "chrono", "diesel_derives", "itoa", - "pq-sys", - "r2d2", "serde_json", ] +[[package]] +name = "diesel-async" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51a307ac00f7c23f526a04a77761a0519b9f0eb2838ebf5b905a58580095bdcb" +dependencies = [ + "async-trait", + "bb8", + "diesel", + "futures-util", + "scoped-futures", + "tokio", + "tokio-postgres 0.7.12", +] + [[package]] name = "diesel_derives" version = "2.2.1" @@ -4042,8 +4067,8 @@ dependencies = [ "pageserver_compaction", "pin-project-lite", "postgres", - "postgres-protocol", - "postgres-types", + "postgres-protocol 0.6.4", + "postgres-types 0.2.4", "postgres_backend", "postgres_connection", "postgres_ffi", @@ -4074,7 +4099,7 @@ dependencies = [ "tokio", "tokio-epoll-uring", "tokio-io-timeout", - "tokio-postgres", + "tokio-postgres 0.7.7", "tokio-stream", "tokio-tar", "tokio-util", @@ -4132,7 +4157,7 @@ dependencies = [ "serde", "thiserror", "tokio", - "tokio-postgres", + "tokio-postgres 0.7.7", "tokio-stream", "tokio-util", "utils", @@ -4438,7 +4463,7 @@ dependencies = [ "futures-util", "log", "tokio", - "tokio-postgres", + "tokio-postgres 0.7.7", ] [[package]] @@ -4459,6 +4484,24 @@ dependencies = [ "stringprep", ] +[[package]] +name = "postgres-protocol" +version = "0.6.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "acda0ebdebc28befa84bee35e651e4c5f09073d668c7aed4cf7e23c3cda84b23" +dependencies = [ + "base64 0.22.1", + "byteorder", + "bytes", + "fallible-iterator", + "hmac", + "md-5", + "memchr", + "rand 0.8.5", + "sha2", + "stringprep", +] + [[package]] name = "postgres-protocol2" version = "0.1.0" @@ -4482,7 +4525,18 @@ source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#511f dependencies = [ "bytes", "fallible-iterator", - "postgres-protocol", + "postgres-protocol 0.6.4", +] + +[[package]] +name = "postgres-types" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f66ea23a2d0e5734297357705193335e0a957696f34bed2f2faefacb2fec336f" +dependencies = [ + "bytes", + "fallible-iterator", + "postgres-protocol 0.6.7", ] [[package]] @@ -4507,7 +4561,7 @@ dependencies = [ "serde", "thiserror", "tokio", - "tokio-postgres", + "tokio-postgres 0.7.7", "tokio-postgres-rustls", "tokio-rustls 0.26.0", 
"tokio-util", @@ -4522,7 +4576,7 @@ dependencies = [ "itertools 0.10.5", "once_cell", "postgres", - "tokio-postgres", + "tokio-postgres 0.7.7", "url", ] @@ -4609,15 +4663,6 @@ version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" -[[package]] -name = "pq-sys" -version = "0.6.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6cc05d7ea95200187117196eee9edd0644424911821aeb28a18ce60ea0b8793" -dependencies = [ - "vcpkg", -] - [[package]] name = "pq_proto" version = "0.1.0" @@ -4625,7 +4670,7 @@ dependencies = [ "byteorder", "bytes", "itertools 0.10.5", - "postgres-protocol", + "postgres-protocol 0.6.4", "rand 0.8.5", "serde", "thiserror", @@ -4873,7 +4918,7 @@ dependencies = [ "tikv-jemalloc-ctl", "tikv-jemallocator", "tokio", - "tokio-postgres", + "tokio-postgres 0.7.7", "tokio-postgres2", "tokio-rustls 0.26.0", "tokio-tungstenite 0.21.0", @@ -4930,17 +4975,6 @@ dependencies = [ "proc-macro2", ] -[[package]] -name = "r2d2" -version = "0.8.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51de85fb3fb6524929c8a2eb85e6b6d363de4e8c48f9e2c2eac4944abc181c93" -dependencies = [ - "log", - "parking_lot 0.12.1", - "scheduled-thread-pool", -] - [[package]] name = "rand" version = "0.7.3" @@ -5672,7 +5706,7 @@ dependencies = [ "pageserver_api", "parking_lot 0.12.1", "postgres", - "postgres-protocol", + "postgres-protocol 0.6.4", "postgres_backend", "postgres_ffi", "pprof", @@ -5696,7 +5730,7 @@ dependencies = [ "tikv-jemallocator", "tokio", "tokio-io-timeout", - "tokio-postgres", + "tokio-postgres 0.7.7", "tokio-stream", "tokio-tar", "tokio-util", @@ -5755,12 +5789,12 @@ dependencies = [ ] [[package]] -name = "scheduled-thread-pool" -version = "0.2.7" +name = "scoped-futures" +version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3cbc66816425a074528352f5789333ecff06ca41b36b0b0efdfbb29edc391a19" +checksum = "1b24aae2d0636530f359e9d5ef0c04669d11c5e756699b27a6a6d845d8329091" dependencies = [ - "parking_lot 0.12.1", + "pin-project-lite", ] [[package]] @@ -6295,6 +6329,7 @@ dependencies = [ "clap", "control_plane", "diesel", + "diesel-async", "diesel_migrations", "fail", "futures", @@ -6309,10 +6344,10 @@ dependencies = [ "pageserver_api", "pageserver_client", "postgres_connection", - "r2d2", "rand 0.8.5", "reqwest", "routerify", + "scoped-futures", "scopeguard", "serde", "serde_json", @@ -6365,7 +6400,7 @@ dependencies = [ "serde_json", "storage_controller_client", "tokio", - "tokio-postgres", + "tokio-postgres 0.7.7", "tokio-postgres-rustls", "tokio-stream", "tokio-util", @@ -6824,13 +6859,39 @@ dependencies = [ "percent-encoding", "phf", "pin-project-lite", - "postgres-protocol", - "postgres-types", + "postgres-protocol 0.6.4", + "postgres-types 0.2.4", "socket2", "tokio", "tokio-util", ] +[[package]] +name = "tokio-postgres" +version = "0.7.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b5d3742945bc7d7f210693b0c58ae542c6fd47b17adbbda0885f3dcb34a6bdb" +dependencies = [ + "async-trait", + "byteorder", + "bytes", + "fallible-iterator", + "futures-channel", + "futures-util", + "log", + "parking_lot 0.12.1", + "percent-encoding", + "phf", + "pin-project-lite", + "postgres-protocol 0.6.7", + "postgres-types 0.2.8", + "rand 0.8.5", + "socket2", + "tokio", + "tokio-util", + "whoami", +] + [[package]] name = "tokio-postgres-rustls" version = "0.12.0" @@ -6840,7 
+6901,7 @@ dependencies = [ "ring", "rustls 0.23.18", "tokio", - "tokio-postgres", + "tokio-postgres 0.7.7", "tokio-rustls 0.26.0", "x509-certificate", ] @@ -7498,12 +7559,6 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" -[[package]] -name = "vcpkg" -version = "0.2.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" - [[package]] name = "version_check" version = "0.9.4" @@ -7523,7 +7578,7 @@ dependencies = [ "serde_json", "sysinfo", "tokio", - "tokio-postgres", + "tokio-postgres 0.7.7", "tokio-util", "tracing", "tracing-subscriber", diff --git a/Makefile b/Makefile index 22ebfea7d5..d1238caebf 100644 --- a/Makefile +++ b/Makefile @@ -64,8 +64,6 @@ CARGO_BUILD_FLAGS += $(filter -j1,$(MAKEFLAGS)) CARGO_CMD_PREFIX += $(if $(filter n,$(MAKEFLAGS)),,+) # Force cargo not to print progress bar CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1 -# Set PQ_LIB_DIR to make sure `storage_controller` get linked with bundled libpq (through diesel) -CARGO_CMD_PREFIX += PQ_LIB_DIR=$(POSTGRES_INSTALL_DIR)/v16/lib CACHEDIR_TAG_CONTENTS := "Signature: 8a477f597d28d172789f06886806bc55" diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index caaa22d0a5..9860bd5d0e 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -45,12 +45,11 @@ strum_macros.workspace = true diesel = { version = "2.2.6", features = [ "serde_json", - "postgres", - "r2d2", "chrono", ] } +diesel-async = { version = "0.5.2", features = ["postgres", "bb8", "async-connection-wrapper"] } diesel_migrations = { version = "2.2.0" } -r2d2 = { version = "0.8.10" } +scoped-futures = "0.1.4" utils = { path = "../libs/utils/" } metrics = { path = "../libs/metrics/" } diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 801409d612..659c088d51 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -308,7 +308,7 @@ async fn async_main() -> anyhow::Result<()> { // Validate that we can connect to the database Persistence::await_connection(&secrets.database_url, args.db_connect_timeout.into()).await?; - let persistence = Arc::new(Persistence::new(secrets.database_url)); + let persistence = Arc::new(Persistence::new(secrets.database_url).await); let service = Service::spawn(config, persistence.clone()).await?; diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 37bfaf1139..35eb15b297 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -5,9 +5,12 @@ use std::time::Duration; use std::time::Instant; use self::split_state::SplitState; -use diesel::pg::PgConnection; use diesel::prelude::*; -use diesel::Connection; +use diesel_async::async_connection_wrapper::AsyncConnectionWrapper; +use diesel_async::pooled_connection::bb8::Pool; +use diesel_async::pooled_connection::AsyncDieselConnectionManager; +use diesel_async::RunQueryDsl; +use diesel_async::{AsyncConnection, AsyncPgConnection}; use itertools::Itertools; use pageserver_api::controller_api::AvailabilityZone; use pageserver_api::controller_api::MetadataHealthRecord; @@ -20,6 +23,7 @@ use pageserver_api::shard::ShardConfigError; use pageserver_api::shard::ShardIdentity; use pageserver_api::shard::ShardStripeSize; use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId}; +use 
scoped_futures::ScopedBoxFuture; use serde::{Deserialize, Serialize}; use utils::generation::Generation; use utils::id::{NodeId, TenantId}; @@ -60,7 +64,7 @@ const MIGRATIONS: EmbeddedMigrations = embed_migrations!("./migrations"); /// updated, and reads of nodes are always from memory, not the database. We only require that /// we can UPDATE a node's scheduling mode reasonably quickly to mark a bad node offline. pub struct Persistence { - connection_pool: diesel::r2d2::Pool>, + connection_pool: Pool, } /// Legacy format, for use in JSON compat objects in test environment @@ -76,7 +80,7 @@ pub(crate) enum DatabaseError { #[error(transparent)] Connection(#[from] diesel::result::ConnectionError), #[error(transparent)] - ConnectionPool(#[from] r2d2::Error), + ConnectionPool(#[from] diesel_async::pooled_connection::bb8::RunError), #[error("Logical error: {0}")] Logical(String), #[error("Migration error: {0}")] @@ -124,6 +128,7 @@ pub(crate) enum AbortShardSplitStatus { pub(crate) type DatabaseResult = Result; /// Some methods can operate on either a whole tenant or a single shard +#[derive(Clone)] pub(crate) enum TenantFilter { Tenant(TenantId), Shard(TenantShardId), @@ -136,6 +141,11 @@ pub(crate) struct ShardGenerationState { pub(crate) generation_pageserver: Option, } +// A generous allowance for how many times we may retry serializable transactions +// before giving up. This is not expected to be hit: it is a defensive measure in case we +// somehow engineer a situation where duelling transactions might otherwise live-lock. +const MAX_RETRIES: usize = 128; + impl Persistence { // The default postgres connection limit is 100. We use up to 99, to leave one free for a human admin under // normal circumstances. This assumes we have exclusive use of the database cluster to which we connect. @@ -145,12 +155,12 @@ impl Persistence { const IDLE_CONNECTION_TIMEOUT: Duration = Duration::from_secs(10); const MAX_CONNECTION_LIFETIME: Duration = Duration::from_secs(60); - pub fn new(database_url: String) -> Self { - let manager = diesel::r2d2::ConnectionManager::::new(database_url); + pub async fn new(database_url: String) -> Self { + let manager = AsyncDieselConnectionManager::::new(database_url); // We will use a connection pool: this is primarily to _limit_ our connection count, rather than to optimize time // to execute queries (database queries are not generally on latency-sensitive paths). 
- let connection_pool = diesel::r2d2::Pool::builder() + let connection_pool = Pool::builder() .max_size(Self::MAX_CONNECTIONS) .max_lifetime(Some(Self::MAX_CONNECTION_LIFETIME)) .idle_timeout(Some(Self::IDLE_CONNECTION_TIMEOUT)) @@ -158,6 +168,7 @@ impl Persistence { .min_idle(Some(1)) .test_on_check_out(true) .build(manager) + .await .expect("Could not build connection pool"); Self { connection_pool } @@ -171,7 +182,7 @@ impl Persistence { ) -> Result<(), diesel::ConnectionError> { let started_at = Instant::now(); loop { - match PgConnection::establish(database_url) { + match AsyncPgConnection::establish(database_url).await { Ok(_) => { tracing::info!("Connected to database."); return Ok(()); @@ -192,57 +203,22 @@ impl Persistence { pub(crate) async fn migration_run(&self) -> DatabaseResult<()> { use diesel_migrations::{HarnessWithOutput, MigrationHarness}; - self.with_conn(move |conn| -> DatabaseResult<()> { - HarnessWithOutput::write_to_stdout(conn) - .run_pending_migrations(MIGRATIONS) - .map(|_| ()) - .map_err(|e| DatabaseError::Migration(e.to_string())) - }) - .await - } - - /// Wraps `with_conn` in order to collect latency and error metrics - async fn with_measured_conn(&self, op: DatabaseOperation, func: F) -> DatabaseResult - where - F: Fn(&mut PgConnection) -> DatabaseResult + Send + 'static, - R: Send + 'static, - { - let latency = &METRICS_REGISTRY - .metrics_group - .storage_controller_database_query_latency; - let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup { operation: op }); - - let res = self.with_conn(func).await; - - if let Err(err) = &res { - let error_counter = &METRICS_REGISTRY - .metrics_group - .storage_controller_database_query_error; - error_counter.inc(DatabaseQueryErrorLabelGroup { - error_type: err.error_label(), - operation: op, - }) - } - - res - } - - /// Call the provided function in a tokio blocking thread, with a Diesel database connection. - async fn with_conn(&self, func: F) -> DatabaseResult - where - F: Fn(&mut PgConnection) -> DatabaseResult + Send + 'static, - R: Send + 'static, - { - // A generous allowance for how many times we may retry serializable transactions - // before giving up. This is not expected to be hit: it is a defensive measure in case we - // somehow engineer a situation where duelling transactions might otherwise live-lock. - const MAX_RETRIES: usize = 128; - - let mut conn = self.connection_pool.get()?; - tokio::task::spawn_blocking(move || -> DatabaseResult { + // Can't use self.with_conn here as we do spawn_blocking which requires static. 
+ let conn = self + .connection_pool + .dedicated_connection() + .await + .map_err(|e| DatabaseError::Migration(e.to_string()))?; + let mut async_wrapper: AsyncConnectionWrapper = + AsyncConnectionWrapper::from(conn); + tokio::task::spawn_blocking(move || { let mut retry_count = 0; loop { - match conn.build_transaction().serializable().run(|c| func(c)) { + let result = HarnessWithOutput::write_to_stdout(&mut async_wrapper) + .run_pending_migrations(MIGRATIONS) + .map(|_| ()) + .map_err(|e| DatabaseError::Migration(e.to_string())); + match result { Ok(r) => break Ok(r), Err( err @ DatabaseError::Query(diesel::result::Error::DatabaseError( @@ -271,33 +247,112 @@ impl Persistence { } }) .await - .expect("Task panic") + .map_err(|e| DatabaseError::Migration(e.to_string()))??; + Ok(()) + } + + /// Wraps `with_conn` in order to collect latency and error metrics + async fn with_measured_conn<'a, 'b, F, R>( + &self, + op: DatabaseOperation, + func: F, + ) -> DatabaseResult + where + F: for<'r> Fn(&'r mut AsyncPgConnection) -> ScopedBoxFuture<'b, 'r, DatabaseResult> + + Send + + std::marker::Sync + + 'a, + R: Send + 'b, + { + let latency = &METRICS_REGISTRY + .metrics_group + .storage_controller_database_query_latency; + let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup { operation: op }); + + let res = self.with_conn(func).await; + + if let Err(err) = &res { + let error_counter = &METRICS_REGISTRY + .metrics_group + .storage_controller_database_query_error; + error_counter.inc(DatabaseQueryErrorLabelGroup { + error_type: err.error_label(), + operation: op, + }) + } + + res + } + + /// Call the provided function with a Diesel database connection in a retry loop + async fn with_conn<'a, 'b, F, R>(&self, func: F) -> DatabaseResult + where + F: for<'r> Fn(&'r mut AsyncPgConnection) -> ScopedBoxFuture<'b, 'r, DatabaseResult> + + Send + + std::marker::Sync + + 'a, + R: Send + 'b, + { + let mut retry_count = 0; + loop { + let mut conn = self.connection_pool.get().await?; + match conn + .build_transaction() + .serializable() + .run(|c| func(c)) + .await + { + Ok(r) => break Ok(r), + Err( + err @ DatabaseError::Query(diesel::result::Error::DatabaseError( + diesel::result::DatabaseErrorKind::SerializationFailure, + _, + )), + ) => { + retry_count += 1; + if retry_count > MAX_RETRIES { + tracing::error!( + "Exceeded max retries on SerializationFailure errors: {err:?}" + ); + break Err(err); + } else { + // Retry on serialization errors: these are expected, because even though our + // transactions don't fight for the same rows, they will occasionally collide + // on index pages (e.g. 
increment_generation for unrelated shards can collide) + tracing::debug!("Retrying transaction on serialization failure {err:?}"); + continue; + } + } + Err(e) => break Err(e), + } + } } /// When a node is first registered, persist it before using it for anything pub(crate) async fn insert_node(&self, node: &Node) -> DatabaseResult<()> { - let np = node.to_persistent(); - self.with_measured_conn( - DatabaseOperation::InsertNode, - move |conn| -> DatabaseResult<()> { + let np = &node.to_persistent(); + self.with_measured_conn(DatabaseOperation::InsertNode, move |conn| { + Box::pin(async move { diesel::insert_into(crate::schema::nodes::table) - .values(&np) - .execute(conn)?; + .values(np) + .execute(conn) + .await?; Ok(()) - }, - ) + }) + }) .await } /// At startup, populate the list of nodes which our shards may be placed on pub(crate) async fn list_nodes(&self) -> DatabaseResult> { let nodes: Vec = self - .with_measured_conn( - DatabaseOperation::ListNodes, - move |conn| -> DatabaseResult<_> { - Ok(crate::schema::nodes::table.load::(conn)?) - }, - ) + .with_measured_conn(DatabaseOperation::ListNodes, move |conn| { + Box::pin(async move { + Ok(crate::schema::nodes::table + .load::(conn) + .await?) + }) + }) .await?; tracing::info!("list_nodes: loaded {} nodes", nodes.len()); @@ -313,11 +368,14 @@ impl Persistence { use crate::schema::nodes::dsl::*; let updated = self .with_measured_conn(DatabaseOperation::UpdateNode, move |conn| { - let updated = diesel::update(nodes) - .filter(node_id.eq(input_node_id.0 as i64)) - .set((scheduling_policy.eq(String::from(input_scheduling)),)) - .execute(conn)?; - Ok(updated) + Box::pin(async move { + let updated = diesel::update(nodes) + .filter(node_id.eq(input_node_id.0 as i64)) + .set((scheduling_policy.eq(String::from(input_scheduling)),)) + .execute(conn) + .await?; + Ok(updated) + }) }) .await?; @@ -339,17 +397,16 @@ impl Persistence { &self, ) -> DatabaseResult> { use crate::schema::tenant_shards::dsl::*; - self.with_measured_conn( - DatabaseOperation::ListTenantShards, - move |conn| -> DatabaseResult<_> { + self.with_measured_conn(DatabaseOperation::ListTenantShards, move |conn| { + Box::pin(async move { let query = tenant_shards.filter( placement_policy.ne(serde_json::to_string(&PlacementPolicy::Detached).unwrap()), ); - let result = query.load::(conn)?; + let result = query.load::(conn).await?; Ok(result) - }, - ) + }) + }) .await } @@ -359,15 +416,14 @@ impl Persistence { filter_tenant_id: TenantId, ) -> DatabaseResult> { use crate::schema::tenant_shards::dsl::*; - self.with_measured_conn( - DatabaseOperation::LoadTenant, - move |conn| -> DatabaseResult<_> { + self.with_measured_conn(DatabaseOperation::LoadTenant, move |conn| { + Box::pin(async move { let query = tenant_shards.filter(tenant_id.eq(filter_tenant_id.to_string())); - let result = query.load::(conn)?; + let result = query.load::(conn).await?; Ok(result) - }, - ) + }) + }) .await } @@ -393,19 +449,22 @@ impl Persistence { }) .collect::>(); - self.with_measured_conn( - DatabaseOperation::InsertTenantShards, - move |conn| -> DatabaseResult<()> { + let shards = &shards; + let metadata_health_records = &metadata_health_records; + self.with_measured_conn(DatabaseOperation::InsertTenantShards, move |conn| { + Box::pin(async move { diesel::insert_into(tenant_shards::table) - .values(&shards) - .execute(conn)?; + .values(shards) + .execute(conn) + .await?; diesel::insert_into(metadata_health::table) - .values(&metadata_health_records) - .execute(conn)?; + .values(metadata_health_records) + 
.execute(conn) + .await?; Ok(()) - }, - ) + }) + }) .await } @@ -413,31 +472,31 @@ impl Persistence { /// the tenant from memory on this server. pub(crate) async fn delete_tenant(&self, del_tenant_id: TenantId) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; - self.with_measured_conn( - DatabaseOperation::DeleteTenant, - move |conn| -> DatabaseResult<()> { + self.with_measured_conn(DatabaseOperation::DeleteTenant, move |conn| { + Box::pin(async move { // `metadata_health` status (if exists) is also deleted based on the cascade behavior. diesel::delete(tenant_shards) .filter(tenant_id.eq(del_tenant_id.to_string())) - .execute(conn)?; + .execute(conn) + .await?; Ok(()) - }, - ) + }) + }) .await } pub(crate) async fn delete_node(&self, del_node_id: NodeId) -> DatabaseResult<()> { use crate::schema::nodes::dsl::*; - self.with_measured_conn( - DatabaseOperation::DeleteNode, - move |conn| -> DatabaseResult<()> { + self.with_measured_conn(DatabaseOperation::DeleteNode, move |conn| { + Box::pin(async move { diesel::delete(nodes) .filter(node_id.eq(del_node_id.0 as i64)) - .execute(conn)?; + .execute(conn) + .await?; Ok(()) - }, - ) + }) + }) .await } @@ -454,34 +513,41 @@ impl Persistence { use crate::schema::tenant_shards::dsl::*; let updated = self .with_measured_conn(DatabaseOperation::ReAttach, move |conn| { - let rows_updated = diesel::update(tenant_shards) - .filter(generation_pageserver.eq(input_node_id.0 as i64)) - .set(generation.eq(generation + 1)) - .execute(conn)?; + Box::pin(async move { + let rows_updated = diesel::update(tenant_shards) + .filter(generation_pageserver.eq(input_node_id.0 as i64)) + .set(generation.eq(generation + 1)) + .execute(conn) + .await?; - tracing::info!("Incremented {} tenants' generations", rows_updated); + tracing::info!("Incremented {} tenants' generations", rows_updated); - // TODO: UPDATE+SELECT in one query + // TODO: UPDATE+SELECT in one query - let updated = tenant_shards - .filter(generation_pageserver.eq(input_node_id.0 as i64)) - .select(TenantShardPersistence::as_select()) - .load(conn)?; + let updated = tenant_shards + .filter(generation_pageserver.eq(input_node_id.0 as i64)) + .select(TenantShardPersistence::as_select()) + .load(conn) + .await?; - // If the node went through a drain and restart phase before re-attaching, - // then reset it's node scheduling policy to active. - diesel::update(nodes) - .filter(node_id.eq(input_node_id.0 as i64)) - .filter( - scheduling_policy - .eq(String::from(NodeSchedulingPolicy::PauseForRestart)) - .or(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Draining))) - .or(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Filling))), - ) - .set(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Active))) - .execute(conn)?; + // If the node went through a drain and restart phase before re-attaching, + // then reset it's node scheduling policy to active. 
+ diesel::update(nodes) + .filter(node_id.eq(input_node_id.0 as i64)) + .filter( + scheduling_policy + .eq(String::from(NodeSchedulingPolicy::PauseForRestart)) + .or(scheduling_policy + .eq(String::from(NodeSchedulingPolicy::Draining))) + .or(scheduling_policy + .eq(String::from(NodeSchedulingPolicy::Filling))), + ) + .set(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Active))) + .execute(conn) + .await?; - Ok(updated) + Ok(updated) + }) }) .await?; @@ -518,19 +584,22 @@ impl Persistence { use crate::schema::tenant_shards::dsl::*; let updated = self .with_measured_conn(DatabaseOperation::IncrementGeneration, move |conn| { - let updated = diesel::update(tenant_shards) - .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) - .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) - .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) - .set(( - generation.eq(generation + 1), - generation_pageserver.eq(node_id.0 as i64), - )) - // TODO: only returning() the generation column - .returning(TenantShardPersistence::as_returning()) - .get_result(conn)?; + Box::pin(async move { + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) + .set(( + generation.eq(generation + 1), + generation_pageserver.eq(node_id.0 as i64), + )) + // TODO: only returning() the generation column + .returning(TenantShardPersistence::as_returning()) + .get_result(conn) + .await?; - Ok(updated) + Ok(updated) + }) }) .await?; @@ -562,12 +631,15 @@ impl Persistence { use crate::schema::tenant_shards::dsl::*; let rows = self .with_measured_conn(DatabaseOperation::TenantGenerations, move |conn| { - let result = tenant_shards - .filter(tenant_id.eq(filter_tenant_id.to_string())) - .select(TenantShardPersistence::as_select()) - .order(shard_number) - .load(conn)?; - Ok(result) + Box::pin(async move { + let result = tenant_shards + .filter(tenant_id.eq(filter_tenant_id.to_string())) + .select(TenantShardPersistence::as_select()) + .order(shard_number) + .load(conn) + .await?; + Ok(result) + }) }) .await?; @@ -615,15 +687,18 @@ impl Persistence { break; } + let in_clause = &in_clause; let chunk_rows = self .with_measured_conn(DatabaseOperation::ShardGenerations, move |conn| { - // diesel doesn't support multi-column IN queries, so we compose raw SQL. No escaping is required because - // the inputs are strongly typed and cannot carry any user-supplied raw string content. - let result : Vec = diesel::sql_query( - format!("SELECT * from tenant_shards where (tenant_id, shard_number, shard_count) in ({in_clause});").as_str() - ).load(conn)?; + Box::pin(async move { + // diesel doesn't support multi-column IN queries, so we compose raw SQL. No escaping is required because + // the inputs are strongly typed and cannot carry any user-supplied raw string content. 
+ let result : Vec = diesel::sql_query( + format!("SELECT * from tenant_shards where (tenant_id, shard_number, shard_count) in ({in_clause});").as_str() + ).load(conn).await?; - Ok(result) + Ok(result) + }) }) .await?; rows.extend(chunk_rows.into_iter()) @@ -657,51 +732,58 @@ impl Persistence { ) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; + let tenant = &tenant; + let input_placement_policy = &input_placement_policy; + let input_config = &input_config; + let input_generation = &input_generation; + let input_scheduling_policy = &input_scheduling_policy; self.with_measured_conn(DatabaseOperation::UpdateTenantShard, move |conn| { - let query = match tenant { - TenantFilter::Shard(tenant_shard_id) => diesel::update(tenant_shards) - .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) - .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) - .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) - .into_boxed(), - TenantFilter::Tenant(input_tenant_id) => diesel::update(tenant_shards) - .filter(tenant_id.eq(input_tenant_id.to_string())) - .into_boxed(), - }; + Box::pin(async move { + let query = match tenant { + TenantFilter::Shard(tenant_shard_id) => diesel::update(tenant_shards) + .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) + .into_boxed(), + TenantFilter::Tenant(input_tenant_id) => diesel::update(tenant_shards) + .filter(tenant_id.eq(input_tenant_id.to_string())) + .into_boxed(), + }; - // Clear generation_pageserver if we are moving into a state where we won't have - // any attached pageservers. - let input_generation_pageserver = match input_placement_policy { - None | Some(PlacementPolicy::Attached(_)) => None, - Some(PlacementPolicy::Detached | PlacementPolicy::Secondary) => Some(None), - }; + // Clear generation_pageserver if we are moving into a state where we won't have + // any attached pageservers. 
+ let input_generation_pageserver = match input_placement_policy { + None | Some(PlacementPolicy::Attached(_)) => None, + Some(PlacementPolicy::Detached | PlacementPolicy::Secondary) => Some(None), + }; - #[derive(AsChangeset)] - #[diesel(table_name = crate::schema::tenant_shards)] - struct ShardUpdate { - generation: Option, - placement_policy: Option, - config: Option, - scheduling_policy: Option, - generation_pageserver: Option>, - } + #[derive(AsChangeset)] + #[diesel(table_name = crate::schema::tenant_shards)] + struct ShardUpdate { + generation: Option, + placement_policy: Option, + config: Option, + scheduling_policy: Option, + generation_pageserver: Option>, + } - let update = ShardUpdate { - generation: input_generation.map(|g| g.into().unwrap() as i32), - placement_policy: input_placement_policy - .as_ref() - .map(|p| serde_json::to_string(&p).unwrap()), - config: input_config - .as_ref() - .map(|c| serde_json::to_string(&c).unwrap()), - scheduling_policy: input_scheduling_policy - .map(|p| serde_json::to_string(&p).unwrap()), - generation_pageserver: input_generation_pageserver, - }; + let update = ShardUpdate { + generation: input_generation.map(|g| g.into().unwrap() as i32), + placement_policy: input_placement_policy + .as_ref() + .map(|p| serde_json::to_string(&p).unwrap()), + config: input_config + .as_ref() + .map(|c| serde_json::to_string(&c).unwrap()), + scheduling_policy: input_scheduling_policy + .map(|p| serde_json::to_string(&p).unwrap()), + generation_pageserver: input_generation_pageserver, + }; - query.set(update).execute(conn)?; + query.set(update).execute(conn).await?; - Ok(()) + Ok(()) + }) }) .await?; @@ -715,23 +797,27 @@ impl Persistence { ) -> DatabaseResult)>> { use crate::schema::tenant_shards::dsl::*; + let preferred_azs = preferred_azs.as_slice(); self.with_measured_conn(DatabaseOperation::SetPreferredAzs, move |conn| { - let mut shards_updated = Vec::default(); + Box::pin(async move { + let mut shards_updated = Vec::default(); - for (tenant_shard_id, preferred_az) in preferred_azs.iter() { - let updated = diesel::update(tenant_shards) - .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) - .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) - .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) - .set(preferred_az_id.eq(preferred_az.as_ref().map(|az| az.0.clone()))) - .execute(conn)?; + for (tenant_shard_id, preferred_az) in preferred_azs.iter() { + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) + .set(preferred_az_id.eq(preferred_az.as_ref().map(|az| az.0.clone()))) + .execute(conn) + .await?; - if updated == 1 { - shards_updated.push((*tenant_shard_id, preferred_az.clone())); + if updated == 1 { + shards_updated.push((*tenant_shard_id, preferred_az.clone())); + } } - } - Ok(shards_updated) + Ok(shards_updated) + }) }) .await } @@ -739,17 +825,21 @@ impl Persistence { pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> { use crate::schema::tenant_shards::dsl::*; self.with_measured_conn(DatabaseOperation::Detach, move |conn| { - let updated = diesel::update(tenant_shards) - .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) - .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) - .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) - .set(( - 
generation_pageserver.eq(Option::::None), - placement_policy.eq(serde_json::to_string(&PlacementPolicy::Detached).unwrap()), - )) - .execute(conn)?; + Box::pin(async move { + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) + .set(( + generation_pageserver.eq(Option::::None), + placement_policy + .eq(serde_json::to_string(&PlacementPolicy::Detached).unwrap()), + )) + .execute(conn) + .await?; - Ok(updated) + Ok(updated) + }) }) .await?; @@ -768,14 +858,16 @@ impl Persistence { parent_to_children: Vec<(TenantShardId, Vec)>, ) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; - self.with_measured_conn(DatabaseOperation::BeginShardSplit, move |conn| -> DatabaseResult<()> { + let parent_to_children = parent_to_children.as_slice(); + self.with_measured_conn(DatabaseOperation::BeginShardSplit, move |conn| { + Box::pin(async move { // Mark parent shards as splitting let updated = diesel::update(tenant_shards) .filter(tenant_id.eq(split_tenant_id.to_string())) .filter(shard_count.eq(old_shard_count.literal() as i32)) .set((splitting.eq(1),)) - .execute(conn)?; + .execute(conn).await?; if u8::try_from(updated) .map_err(|_| DatabaseError::Logical( format!("Overflow existing shard count {} while splitting", updated)) @@ -788,7 +880,7 @@ impl Persistence { } // FIXME: spurious clone to sidestep closure move rules - let parent_to_children = parent_to_children.clone(); + let parent_to_children = parent_to_children.to_vec(); // Insert child shards for (parent_shard_id, children) in parent_to_children { @@ -796,7 +888,7 @@ impl Persistence { .filter(tenant_id.eq(parent_shard_id.tenant_id.to_string())) .filter(shard_number.eq(parent_shard_id.shard_number.0 as i32)) .filter(shard_count.eq(parent_shard_id.shard_count.literal() as i32)) - .load::(conn)?; + .load::(conn).await?; let parent = if parent.len() != 1 { return Err(DatabaseError::Logical(format!( "Parent shard {parent_shard_id} not found" @@ -811,12 +903,13 @@ impl Persistence { debug_assert!(shard.splitting == SplitState::Splitting); diesel::insert_into(tenant_shards) .values(shard) - .execute(conn)?; + .execute(conn).await?; } } Ok(()) }) + }) .await } @@ -828,25 +921,26 @@ impl Persistence { old_shard_count: ShardCount, ) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; - self.with_measured_conn( - DatabaseOperation::CompleteShardSplit, - move |conn| -> DatabaseResult<()> { + self.with_measured_conn(DatabaseOperation::CompleteShardSplit, move |conn| { + Box::pin(async move { // Drop parent shards diesel::delete(tenant_shards) .filter(tenant_id.eq(split_tenant_id.to_string())) .filter(shard_count.eq(old_shard_count.literal() as i32)) - .execute(conn)?; + .execute(conn) + .await?; // Clear sharding flag let updated = diesel::update(tenant_shards) .filter(tenant_id.eq(split_tenant_id.to_string())) .set((splitting.eq(0),)) - .execute(conn)?; + .execute(conn) + .await?; debug_assert!(updated > 0); Ok(()) - }, - ) + }) + }) .await } @@ -858,15 +952,15 @@ impl Persistence { new_shard_count: ShardCount, ) -> DatabaseResult { use crate::schema::tenant_shards::dsl::*; - self.with_measured_conn( - DatabaseOperation::AbortShardSplit, - move |conn| -> DatabaseResult { + self.with_measured_conn(DatabaseOperation::AbortShardSplit, move |conn| { + Box::pin(async move { // Clear the splitting state on parent shards let updated = 
diesel::update(tenant_shards) .filter(tenant_id.eq(split_tenant_id.to_string())) .filter(shard_count.ne(new_shard_count.literal() as i32)) .set((splitting.eq(0),)) - .execute(conn)?; + .execute(conn) + .await?; // Parent shards are already gone: we cannot abort. if updated == 0 { @@ -886,11 +980,12 @@ impl Persistence { diesel::delete(tenant_shards) .filter(tenant_id.eq(split_tenant_id.to_string())) .filter(shard_count.eq(new_shard_count.literal() as i32)) - .execute(conn)?; + .execute(conn) + .await?; Ok(AbortShardSplitStatus::Aborted) - }, - ) + }) + }) .await } @@ -906,25 +1001,28 @@ impl Persistence { ) -> DatabaseResult<()> { use crate::schema::metadata_health::dsl::*; - self.with_measured_conn( - DatabaseOperation::UpdateMetadataHealth, - move |conn| -> DatabaseResult<_> { + let healthy_records = healthy_records.as_slice(); + let unhealthy_records = unhealthy_records.as_slice(); + self.with_measured_conn(DatabaseOperation::UpdateMetadataHealth, move |conn| { + Box::pin(async move { diesel::insert_into(metadata_health) - .values(&healthy_records) + .values(healthy_records) .on_conflict((tenant_id, shard_number, shard_count)) .do_update() .set((healthy.eq(true), last_scrubbed_at.eq(now))) - .execute(conn)?; + .execute(conn) + .await?; diesel::insert_into(metadata_health) - .values(&unhealthy_records) + .values(unhealthy_records) .on_conflict((tenant_id, shard_number, shard_count)) .do_update() .set((healthy.eq(false), last_scrubbed_at.eq(now))) - .execute(conn)?; + .execute(conn) + .await?; Ok(()) - }, - ) + }) + }) .await } @@ -933,15 +1031,13 @@ impl Persistence { pub(crate) async fn list_metadata_health_records( &self, ) -> DatabaseResult> { - self.with_measured_conn( - DatabaseOperation::ListMetadataHealth, - move |conn| -> DatabaseResult<_> { - Ok( - crate::schema::metadata_health::table - .load::(conn)?, - ) - }, - ) + self.with_measured_conn(DatabaseOperation::ListMetadataHealth, move |conn| { + Box::pin(async { + Ok(crate::schema::metadata_health::table + .load::(conn) + .await?) + }) + }) .await } @@ -953,10 +1049,15 @@ impl Persistence { use crate::schema::metadata_health::dsl::*; self.with_measured_conn( DatabaseOperation::ListMetadataHealthUnhealthy, - move |conn| -> DatabaseResult<_> { - Ok(crate::schema::metadata_health::table - .filter(healthy.eq(false)) - .load::(conn)?) + move |conn| { + Box::pin(async { + DatabaseResult::Ok( + crate::schema::metadata_health::table + .filter(healthy.eq(false)) + .load::(conn) + .await?, + ) + }) }, ) .await @@ -970,15 +1071,14 @@ impl Persistence { ) -> DatabaseResult> { use crate::schema::metadata_health::dsl::*; - self.with_measured_conn( - DatabaseOperation::ListMetadataHealthOutdated, - move |conn| -> DatabaseResult<_> { + self.with_measured_conn(DatabaseOperation::ListMetadataHealthOutdated, move |conn| { + Box::pin(async move { let query = metadata_health.filter(last_scrubbed_at.lt(earlier)); - let res = query.load::(conn)?; + let res = query.load::(conn).await?; Ok(res) - }, - ) + }) + }) .await } @@ -986,12 +1086,13 @@ impl Persistence { /// It is an error for the table to contain more than one entry. pub(crate) async fn get_leader(&self) -> DatabaseResult> { let mut leader: Vec = self - .with_measured_conn( - DatabaseOperation::GetLeader, - move |conn| -> DatabaseResult<_> { - Ok(crate::schema::controllers::table.load::(conn)?) - }, - ) + .with_measured_conn(DatabaseOperation::GetLeader, move |conn| { + Box::pin(async move { + Ok(crate::schema::controllers::table + .load::(conn) + .await?) 
+ }) + }) .await?; if leader.len() > 1 { @@ -1014,26 +1115,33 @@ impl Persistence { use crate::schema::controllers::dsl::*; let updated = self - .with_measured_conn( - DatabaseOperation::UpdateLeader, - move |conn| -> DatabaseResult { + .with_measured_conn(DatabaseOperation::UpdateLeader, move |conn| { + let prev = prev.clone(); + let new = new.clone(); + Box::pin(async move { let updated = match &prev { - Some(prev) => diesel::update(controllers) - .filter(address.eq(prev.address.clone())) - .filter(started_at.eq(prev.started_at)) - .set(( - address.eq(new.address.clone()), - started_at.eq(new.started_at), - )) - .execute(conn)?, - None => diesel::insert_into(controllers) - .values(new.clone()) - .execute(conn)?, + Some(prev) => { + diesel::update(controllers) + .filter(address.eq(prev.address.clone())) + .filter(started_at.eq(prev.started_at)) + .set(( + address.eq(new.address.clone()), + started_at.eq(new.started_at), + )) + .execute(conn) + .await? + } + None => { + diesel::insert_into(controllers) + .values(new.clone()) + .execute(conn) + .await? + } }; Ok(updated) - }, - ) + }) + }) .await?; if updated == 0 { @@ -1048,12 +1156,13 @@ impl Persistence { /// At startup, populate the list of nodes which our shards may be placed on pub(crate) async fn list_safekeepers(&self) -> DatabaseResult> { let safekeepers: Vec = self - .with_measured_conn( - DatabaseOperation::ListNodes, - move |conn| -> DatabaseResult<_> { - Ok(crate::schema::safekeepers::table.load::(conn)?) - }, - ) + .with_measured_conn(DatabaseOperation::ListNodes, move |conn| { + Box::pin(async move { + Ok(crate::schema::safekeepers::table + .load::(conn) + .await?) + }) + }) .await?; tracing::info!("list_safekeepers: loaded {} nodes", safekeepers.len()); @@ -1066,11 +1175,14 @@ impl Persistence { id: i64, ) -> Result { use crate::schema::safekeepers::dsl::{id as id_column, safekeepers}; - self.with_conn(move |conn| -> DatabaseResult { - Ok(safekeepers - .filter(id_column.eq(&id)) - .select(SafekeeperPersistence::as_select()) - .get_result(conn)?) + self.with_conn(move |conn| { + Box::pin(async move { + Ok(safekeepers + .filter(id_column.eq(&id)) + .select(SafekeeperPersistence::as_select()) + .get_result(conn) + .await?) 
+ }) }) .await } @@ -1081,26 +1193,30 @@ impl Persistence { ) -> Result<(), DatabaseError> { use crate::schema::safekeepers::dsl::*; - self.with_conn(move |conn| -> DatabaseResult<()> { - let bind = record - .as_insert_or_update() - .map_err(|e| DatabaseError::Logical(format!("{e}")))?; + self.with_conn(move |conn| { + let record = record.clone(); + Box::pin(async move { + let bind = record + .as_insert_or_update() + .map_err(|e| DatabaseError::Logical(format!("{e}")))?; - let inserted_updated = diesel::insert_into(safekeepers) - .values(&bind) - .on_conflict(id) - .do_update() - .set(&bind) - .execute(conn)?; + let inserted_updated = diesel::insert_into(safekeepers) + .values(&bind) + .on_conflict(id) + .do_update() + .set(&bind) + .execute(conn) + .await?; - if inserted_updated != 1 { - return Err(DatabaseError::Logical(format!( - "unexpected number of rows ({})", - inserted_updated - ))); - } + if inserted_updated != 1 { + return Err(DatabaseError::Logical(format!( + "unexpected number of rows ({})", + inserted_updated + ))); + } - Ok(()) + Ok(()) + }) }) .await } @@ -1112,26 +1228,29 @@ impl Persistence { ) -> Result<(), DatabaseError> { use crate::schema::safekeepers::dsl::*; - self.with_conn(move |conn| -> DatabaseResult<()> { - #[derive(Insertable, AsChangeset)] - #[diesel(table_name = crate::schema::safekeepers)] - struct UpdateSkSchedulingPolicy<'a> { - id: i64, - scheduling_policy: &'a str, - } - let scheduling_policy_ = String::from(scheduling_policy_); + self.with_conn(move |conn| { + Box::pin(async move { + #[derive(Insertable, AsChangeset)] + #[diesel(table_name = crate::schema::safekeepers)] + struct UpdateSkSchedulingPolicy<'a> { + id: i64, + scheduling_policy: &'a str, + } + let scheduling_policy_ = String::from(scheduling_policy_); - let rows_affected = diesel::update(safekeepers.filter(id.eq(id_))) - .set(scheduling_policy.eq(scheduling_policy_)) - .execute(conn)?; + let rows_affected = diesel::update(safekeepers.filter(id.eq(id_))) + .set(scheduling_policy.eq(scheduling_policy_)) + .execute(conn) + .await?; - if rows_affected != 1 { - return Err(DatabaseError::Logical(format!( - "unexpected number of rows ({rows_affected})", - ))); - } + if rows_affected != 1 { + return Err(DatabaseError::Logical(format!( + "unexpected number of rows ({rows_affected})", + ))); + } - Ok(()) + Ok(()) + }) }) .await } From aec92bfc34a2e0bf86a47b5664746dc67a82dcf3 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 27 Jan 2025 17:03:32 +0000 Subject: [PATCH 265/583] pageserver: decrease utilization MAX_SHARDS (#10489) ## Problem The intent of this parameter is to have pageservers consider themselves "full" if they've got lots of shards, even if they have plenty of capacity. It works, but because we typically successfully oversubscribe capacity up to 200%, the MAX_SHARDS limit is effectively doubled, so this 20,000 value ends up meaning 40,000, whereas the original intent was to limit nodes to ~10000 shards. ## Summary of changes - Change MAX_SHARDS to 5000, so that a node with 5000 will get a 100% utilization, which is equivalent in practice to being considered "half full" by the storage controller in capacity terms. This is all a bit subtle and indiret. Originally the limit was baked into the pageserver with the idea that the pageserver knows better what its own resources tolerate than the storage controller does, but in practice it would be probably be easier to understand all this if we just did it controller-side. So there's scope to refactor here in future. 
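To make the arithmetic concrete, here is a minimal, hypothetical sketch of how a shard count maps to a utilization percentage under the new constant. It is illustrative only — the real calculation in `pageserver/src/utilization.rs` is not reproduced here — but it shows why 5000 shards now reads as 100%, and why, with the controller oversubscribing up to 200%, the practical ceiling lands near the originally intended ~10000 shards.

```rust
// Illustrative sketch only; not the actual PageserverUtilization code.
const MAX_SHARDS: u32 = 5000;

/// Shard-count contribution to utilization, as a percentage.
fn shard_utilization_pct(shard_count: u32) -> u32 {
    (shard_count * 100) / MAX_SHARDS
}

fn main() {
    // A node holding MAX_SHARDS shards reports 100% utilization.
    assert_eq!(shard_utilization_pct(5_000), 100);
    // With up to 200% oversubscription in practice, scheduling stops around
    // twice that, i.e. roughly the intended ~10000-shard limit.
    assert_eq!(shard_utilization_pct(10_000), 200);
}
```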
--- pageserver/src/utilization.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/utilization.rs b/pageserver/src/utilization.rs index a0223f3bce..093a944777 100644 --- a/pageserver/src/utilization.rs +++ b/pageserver/src/utilization.rs @@ -49,7 +49,7 @@ pub(crate) fn regenerate( }; // Express a static value for how many shards we may schedule on one node - const MAX_SHARDS: u32 = 20000; + const MAX_SHARDS: u32 = 5000; let mut doc = PageserverUtilization { disk_usage_bytes: used, From aabf455dfbeb3efa00673c069249aaa90004f793 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 27 Jan 2025 17:24:42 +0000 Subject: [PATCH 266/583] README: clarify that neon_local is a dev/test tool (#10512) ## Problem From time to time, folks discover our `control_plane/` folder and make the (reasonable) mistake of thinking it's a tool for running full-sized Neon systems, whereas in reality it is a tool for dev/test. ## Summary of changes - Change control_plane's readme title to "Local Development Control Plane (`neon_local`)` - Change "Running local installation" to "Running a local development environment" in the main readme --- README.md | 6 ++++-- control_plane/README.md | 8 ++++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 1417d6b9e7..4453904346 100644 --- a/README.md +++ b/README.md @@ -21,8 +21,10 @@ The Neon storage engine consists of two major components: See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more information. -## Running local installation +## Running a local development environment +Neon can be run on a workstation for small experiments and to test code changes, by +following these instructions. #### Installing dependencies on Linux 1. Install build dependencies and other applicable packages @@ -238,7 +240,7 @@ postgres=# select * from t; > cargo neon stop ``` -More advanced usages can be found at [Control Plane and Neon Local](./control_plane/README.md). +More advanced usages can be found at [Local Development Control Plane (`neon_local`))](./control_plane/README.md). #### Handling build failures diff --git a/control_plane/README.md b/control_plane/README.md index 827aba5c1f..aa6f935e27 100644 --- a/control_plane/README.md +++ b/control_plane/README.md @@ -1,6 +1,10 @@ -# Control Plane and Neon Local +# Local Development Control Plane (`neon_local`) -This crate contains tools to start a Neon development environment locally. This utility can be used with the `cargo neon` command. +This crate contains tools to start a Neon development environment locally. This utility can be used with the `cargo neon` command. This is a convenience to invoke +the `neon_local` binary. + +**Note**: this is a dev/test tool -- a minimal control plane suitable for testing +code changes locally, but not suitable for running production systems. ## Example: Start with Postgres 16 From ebf44210ba2fdd66de25f2689182dcb1c4a6d3f7 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 27 Jan 2025 17:44:18 +0000 Subject: [PATCH 267/583] remote_storage: less sensitive timeout logging in ABS listings (#10518) ## Problem We were logging a warning after a single request timeout, while listing objects. Closes: https://github.com/neondatabase/neon/issues/10166 ## Summary of changes - These timeouts are a pretty normal part of life, so back it off to only log a warning after two in a row. 
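As a quick illustration of why the comparison moves from `>= 2` to `> 2`: the counter is reset to 1 after each successfully fetched page (visible in the hunk below), so — assuming the retry path increments it once per timed-out attempt, which is outside this hunk — a value of 2 means a single timeout and 3 means two in a row. A rough sketch of that counting scheme, not the actual `azure_blob.rs` code:

```rust
// Rough sketch of the counting scheme only.
// Assumption: timeout_try_cnt is bumped once per timed-out attempt and reset
// to 1 after each successfully fetched page.
fn should_warn(timeout_try_cnt: u32) -> bool {
    // 1 = succeeded first try, 2 = one timeout before success, 3 = two timeouts, ...
    timeout_try_cnt > 2
}

fn main() {
    assert!(!should_warn(1)); // clean success: no warning
    assert!(!should_warn(2)); // a single timeout: previously warned, now stays quiet
    assert!(should_warn(3)); // two consecutive timeouts before success: warn
}
```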
--- libs/remote_storage/src/azure_blob.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index c89f50ef2b..9027a8bf55 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -377,7 +377,8 @@ impl RemoteStorage for AzureBlobStorage { let next_item = next_item?; - if timeout_try_cnt >= 2 { + // Log a warning if we saw two timeouts in a row before a successful request + if timeout_try_cnt > 2 { tracing::warn!("Azure Blob Storage list timed out and succeeded after {} tries", timeout_try_cnt); } timeout_try_cnt = 1; From 3d36dfe5337504d108ecb475ab4f983ccbde77cc Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 27 Jan 2025 20:19:55 +0100 Subject: [PATCH 268/583] fix: noisy `broker subscription failed` error during storage broker deploys (#10521) During broker deploys, pageservers log this noisy WARN en masse. I can trivially reproduce the WARN message in neon_local by SIGKILLing broker during e.g. `pgbench -i`. I don't understand why tonic is not detecting the error as `Code::Unavailable`. Until we find time to understand that / fix upstream, this PR adds the error message to the existing list of known error messages that get demoted to INFO level. Refs: - refs https://github.com/neondatabase/neon/issues/9562 --- .../src/tenant/timeline/walreceiver/connection_manager.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index 583d6309ab..d8ea32cd45 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -164,9 +164,10 @@ pub(super) async fn connection_manager_loop_step( Ok(Some(broker_update)) => connection_manager_state.register_timeline_update(broker_update), Err(status) => { match status.code() { - Code::Unknown if status.message().contains("stream closed because of a broken pipe") || status.message().contains("connection reset") => { + Code::Unknown if status.message().contains("stream closed because of a broken pipe") || status.message().contains("connection reset") || status.message().contains("error reading a body from connection") => { // tonic's error handling doesn't provide a clear code for disconnections: we get // "h2 protocol error: error reading a body from connection: stream closed because of a broken pipe" + // => https://github.com/neondatabase/neon/issues/9562 info!("broker disconnected: {status}"); }, _ => { From eb9832d8460a80d9111b20124863444866d93d14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 27 Jan 2025 20:38:18 +0100 Subject: [PATCH 269/583] Remove PQ_LIB_DIR env var (#10526) We now don't need libpq any more for the build of the storage controller, as we use `diesel-async` since #10280. Therefore, we remove the env var that gave cargo/rustc the location for libpq. 
Follow-up of #10280 --- .github/workflows/_build-and-test-locally.yml | 4 ---- .github/workflows/build-macos.yml | 2 +- .github/workflows/neon_extra_builds.yml | 2 +- Dockerfile | 2 +- 4 files changed, 3 insertions(+), 7 deletions(-) diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 2daed90386..f97402a90b 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -158,8 +158,6 @@ jobs: - name: Run cargo build run: | - PQ_LIB_DIR=$(pwd)/pg_install/v16/lib - export PQ_LIB_DIR ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests # Do install *before* running rust tests because they might recompile the @@ -217,8 +215,6 @@ jobs: env: NEXTEST_RETRIES: 3 run: | - PQ_LIB_DIR=$(pwd)/pg_install/v16/lib - export PQ_LIB_DIR LD_LIBRARY_PATH=$(pwd)/pg_install/v17/lib export LD_LIBRARY_PATH diff --git a/.github/workflows/build-macos.yml b/.github/workflows/build-macos.yml index 01d82a1ed2..347a511e98 100644 --- a/.github/workflows/build-macos.yml +++ b/.github/workflows/build-macos.yml @@ -235,7 +235,7 @@ jobs: echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV - name: Run cargo build (only for v17) - run: PQ_LIB_DIR=$(pwd)/pg_install/v17/lib cargo build --all --release -j$(sysctl -n hw.ncpu) + run: cargo build --all --release -j$(sysctl -n hw.ncpu) - name: Check that no warnings are produced (only for v17) run: ./run_clippy.sh diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 5b5910badf..f077e04d1c 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -114,7 +114,7 @@ jobs: run: make walproposer-lib -j$(nproc) - name: Produce the build stats - run: PQ_LIB_DIR=$(pwd)/pg_install/v17/lib cargo build --all --release --timings -j$(nproc) + run: cargo build --all --release --timings -j$(nproc) - name: Configure AWS credentials uses: aws-actions/configure-aws-credentials@v4 diff --git a/Dockerfile b/Dockerfile index 2e4f8e5546..a8f7ae0a62 100644 --- a/Dockerfile +++ b/Dockerfile @@ -45,7 +45,7 @@ COPY --chown=nonroot . . ARG ADDITIONAL_RUSTFLAGS RUN set -e \ - && PQ_LIB_DIR=$(pwd)/pg_install/v${STABLE_PG_VERSION}/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo build \ + && RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo build \ --bin pg_sni_router \ --bin pageserver \ --bin pagectl \ From 5477d7db93f47112e0aa3d6625997c06db99d23c Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 27 Jan 2025 11:47:49 -0800 Subject: [PATCH 270/583] fast_import: fixes for Postgres v17 (#10414) Now that the tests are run on v17, they're also run in debug mode, which is slow. Increase statement_timeout in the test to work around that. 
--- pageserver/src/tenant/timeline/import_pgdata.rs | 2 +- .../import_pgdata/importbucket_client.rs | 2 +- test_runner/regress/test_import_pgdata.py | 16 ++++------------ 3 files changed, 6 insertions(+), 14 deletions(-) diff --git a/pageserver/src/tenant/timeline/import_pgdata.rs b/pageserver/src/tenant/timeline/import_pgdata.rs index de56468580..6940179ae9 100644 --- a/pageserver/src/tenant/timeline/import_pgdata.rs +++ b/pageserver/src/tenant/timeline/import_pgdata.rs @@ -113,7 +113,7 @@ pub async fn doit( match res { Ok(_) => break, Err(err) => { - info!(?err, "indefintely waiting for pgdata to finish"); + info!(?err, "indefinitely waiting for pgdata to finish"); if tokio::time::timeout(std::time::Duration::from_secs(10), cancel.cancelled()) .await .is_ok() diff --git a/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs b/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs index bc4d148a29..68937e535d 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs @@ -308,7 +308,7 @@ impl ControlFile { 202107181 => 14, 202209061 => 15, 202307071 => 16, - /* XXX pg17 */ + 202406281 => 17, catversion => { anyhow::bail!("unrecognized catalog version {catversion}") } diff --git a/test_runner/regress/test_import_pgdata.py b/test_runner/regress/test_import_pgdata.py index d02a9d19db..182f715b0e 100644 --- a/test_runner/regress/test_import_pgdata.py +++ b/test_runner/regress/test_import_pgdata.py @@ -14,10 +14,8 @@ from fixtures.pageserver.http import ( ImportPgdataIdemptencyKey, PageserverApiException, ) -from fixtures.pg_version import PgVersion from fixtures.port_distributor import PortDistributor from fixtures.remote_storage import RemoteStorageKind -from fixtures.utils import run_only_on_postgres from pytest_httpserver import HTTPServer from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response @@ -39,10 +37,6 @@ smoke_params = [ ] -@run_only_on_postgres( - [PgVersion.V14, PgVersion.V15, PgVersion.V16], - "newer control file catalog version and struct format isn't supported", -) @pytest.mark.parametrize("shard_count,stripe_size,rel_block_size", smoke_params) def test_pgdata_import_smoke( vanilla_pg: VanillaPostgres, @@ -117,13 +111,15 @@ def test_pgdata_import_smoke( # TODO: would be nicer to just compare pgdump # Enable IO concurrency for batching on large sequential scan, to avoid making - # this test unnecessarily onerous on CPU + # this test unnecessarily onerous on CPU. Especially on debug mode, it's still + # pretty onerous though, so increase statement_timeout to avoid timeouts. assert ep.safe_psql_many( [ "set effective_io_concurrency=32;", + "SET statement_timeout='300s';", "select count(*), sum(data::bigint)::bigint from t", ] - ) == [[], [(expect_nrows, expect_sum)]] + ) == [[], [], [(expect_nrows, expect_sum)]] validate_vanilla_equivalence(vanilla_pg) @@ -317,10 +313,6 @@ def test_pgdata_import_smoke( br_initdb_endpoint.safe_psql("select * from othertable") -@run_only_on_postgres( - [PgVersion.V14, PgVersion.V15, PgVersion.V16], - "newer control file catalog version and struct format isn't supported", -) def test_fast_import_binary( test_output_dir, vanilla_pg: VanillaPostgres, From d73f4a6470b022a4ca5a163adaf8efc79f6b7e15 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 27 Jan 2025 21:02:25 +0000 Subject: [PATCH 271/583] pageserver: retry wrapper on manifest upload (#10524) ## Problem On remote storage errors (e.g. 
I/O timeout) uploading tenant manifest, all of compaction could fail. This is a problem IRL because we shouldn't abort compaction on a single IO error, and in tests because it generates spurious failures. Related: https://github.com/orgs/neondatabase/projects/51/views/2?sliceBy%5Bvalue%5D=jcsp&pane=issue&itemId=93692919&issue=neondatabase%7Cneon%7C10389 ## Summary of changes - Use `backoff::retry` when uploading tenant manifest --- pageserver/src/tenant.rs | 46 +++++++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 17 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index efe89cb982..d359270be4 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -37,6 +37,8 @@ use remote_timeline_client::manifest::{ OffloadedTimelineManifest, TenantManifest, LATEST_TENANT_MANIFEST_VERSION, }; use remote_timeline_client::UploadQueueNotReadyError; +use remote_timeline_client::FAILED_REMOTE_OP_RETRIES; +use remote_timeline_client::FAILED_UPLOAD_WARN_THRESHOLD; use std::collections::BTreeMap; use std::fmt; use std::future::Future; @@ -5308,27 +5310,37 @@ impl Tenant { return Ok(()); } - upload_tenant_manifest( - &self.remote_storage, - &self.tenant_shard_id, - self.generation, - &manifest, + // Remote storage does no retries internally, so wrap it + match backoff::retry( + || async { + upload_tenant_manifest( + &self.remote_storage, + &self.tenant_shard_id, + self.generation, + &manifest, + &self.cancel, + ) + .await + }, + |_e| self.cancel.is_cancelled(), + FAILED_UPLOAD_WARN_THRESHOLD, + FAILED_REMOTE_OP_RETRIES, + "uploading tenant manifest", &self.cancel, ) .await - .map_err(|e| { - if self.cancel.is_cancelled() { - TenantManifestError::Cancelled - } else { - TenantManifestError::RemoteStorage(e) + { + None => Err(TenantManifestError::Cancelled), + Some(Err(_)) if self.cancel.is_cancelled() => Err(TenantManifestError::Cancelled), + Some(Err(e)) => Err(TenantManifestError::RemoteStorage(e)), + Some(Ok(_)) => { + // Store the successfully uploaded manifest, so that future callers can avoid + // re-uploading the same thing. + *guard = Some(manifest); + + Ok(()) } - })?; - - // Store the successfully uploaded manifest, so that future callers can avoid - // re-uploading the same thing. - *guard = Some(manifest); - - Ok(()) + } } } From c8fbbb9b65587d25b9dbd3c8f21266ce07159d02 Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Mon, 27 Jan 2025 22:06:05 +0100 Subject: [PATCH 272/583] Test ingest_benchmark with different stripe size and also PostgreSQL version 17 (#10510) We want to verify if pageserver stripe size has an impact on ingest performance. We want to verify if ingest performance has improved or regressed with postgres version 17. 
## Summary of changes - Allow to create new project with different postgres versions - allow to pre-shard new project with different stripe sizes instead of relying on storage manager to shard_split the project once a threshold is exceeded Replaces https://github.com/neondatabase/neon/pull/10509 Test run https://github.com/neondatabase/neon/actions/runs/12986410381 --- .../actions/neon-project-create/action.yml | 48 +++++++++++++++++++ .github/workflows/ingest_benchmark.yml | 33 ++++++++++--- 2 files changed, 75 insertions(+), 6 deletions(-) diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml index f4a194639f..11f46bce8e 100644 --- a/.github/actions/neon-project-create/action.yml +++ b/.github/actions/neon-project-create/action.yml @@ -17,6 +17,31 @@ inputs: compute_units: description: '[Min, Max] compute units' default: '[1, 1]' + # settings below only needed if you want the project to be sharded from the beginning + shard_split_project: + description: 'by default new projects are not shard-split, specify true to shard-split' + required: false + default: 'false' + admin_api_key: + description: 'Admin API Key needed for shard-splitting. Must be specified if shard_split_project is true' + required: false + shard_count: + description: 'Number of shards to split the project into, only applies if shard_split_project is true' + required: false + default: '8' + stripe_size: + description: 'Stripe size, optional, in 8kiB pages. e.g. set 2048 for 16MB stripes. Default is 128 MiB, only applies if shard_split_project is true' + required: false + default: '32768' + psql_path: + description: 'Path to psql binary - it is caller responsibility to provision the psql binary' + required: false + default: '/tmp/neon/pg_install/v16/bin/psql' + libpq_lib_path: + description: 'Path to directory containing libpq library - it is caller responsibility to provision the libpq library' + required: false + default: '/tmp/neon/pg_install/v16/lib' + outputs: dsn: @@ -63,6 +88,23 @@ runs: echo "project_id=${project_id}" >> $GITHUB_OUTPUT echo "Project ${project_id} has been created" + + if [ "${SHARD_SPLIT_PROJECT}" = "true" ]; then + # determine tenant ID + TENANT_ID=`${PSQL} ${dsn} -t -A -c "SHOW neon.tenant_id"` + + echo "Splitting project ${project_id} with tenant_id ${TENANT_ID} into $((SHARD_COUNT)) shards with stripe size $((STRIPE_SIZE))" + + echo "Sending PUT request to https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/storage/proxy/control/v1/tenant/${TENANT_ID}/shard_split" + echo "with body {\"new_shard_count\": $((SHARD_COUNT)), \"new_stripe_size\": $((STRIPE_SIZE))}" + + # we need an ADMIN API KEY to invoke storage controller API for shard splitting (bash -u above checks that the variable is set) + curl -X PUT \ + "https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/storage/proxy/control/v1/tenant/${TENANT_ID}/shard_split" \ + -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer ${ADMIN_API_KEY}" \ + -d "{\"new_shard_count\": $SHARD_COUNT, \"new_stripe_size\": $STRIPE_SIZE}" + fi + env: API_HOST: ${{ inputs.api_host }} API_KEY: ${{ inputs.api_key }} @@ -70,3 +112,9 @@ runs: POSTGRES_VERSION: ${{ inputs.postgres_version }} MIN_CU: ${{ fromJSON(inputs.compute_units)[0] }} MAX_CU: ${{ fromJSON(inputs.compute_units)[1] }} + SHARD_SPLIT_PROJECT: ${{ inputs.shard_split_project }} + ADMIN_API_KEY: ${{ inputs.admin_api_key }} + SHARD_COUNT: ${{ inputs.shard_count }} + STRIPE_SIZE: ${{ inputs.stripe_size }} + PSQL: 
${{ inputs.psql_path }} + LD_LIBRARY_PATH: ${{ inputs.libpq_lib_path }} diff --git a/.github/workflows/ingest_benchmark.yml b/.github/workflows/ingest_benchmark.yml index fc33c0a980..7b303fa37a 100644 --- a/.github/workflows/ingest_benchmark.yml +++ b/.github/workflows/ingest_benchmark.yml @@ -28,7 +28,24 @@ jobs: strategy: fail-fast: false # allow other variants to continue even if one fails matrix: - target_project: [new_empty_project, large_existing_project] + include: + - target_project: new_empty_project_stripe_size_2048 + stripe_size: 2048 # 16 MiB + postgres_version: 16 + - target_project: new_empty_project_stripe_size_32768 + stripe_size: 32768 # 256 MiB # note that this is different from null because using null will shard_split the project only if it reaches the threshold + # while here it is sharded from the beginning with a shard size of 256 MiB + postgres_version: 16 + - target_project: new_empty_project + stripe_size: null # run with neon defaults which will shard split only when reaching the threshold + postgres_version: 16 + - target_project: new_empty_project + stripe_size: null # run with neon defaults which will shard split only when reaching the threshold + postgres_version: 17 + - target_project: large_existing_project + stripe_size: null # cannot re-shared or choose different stripe size for existing, already sharded project + postgres_version: 16 + max-parallel: 1 # we want to run each stripe size sequentially to be able to compare the results permissions: contents: write statuses: write @@ -67,17 +84,21 @@ jobs: aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create Neon Project - if: ${{ matrix.target_project == 'new_empty_project' }} + if: ${{ startsWith(matrix.target_project, 'new_empty_project') }} id: create-neon-project-ingest-target uses: ./.github/actions/neon-project-create with: region_id: aws-us-east-2 - postgres_version: 16 + postgres_version: ${{ matrix.postgres_version }} compute_units: '[7, 7]' # we want to test large compute here to avoid compute-side bottleneck api_key: ${{ secrets.NEON_STAGING_API_KEY }} + shard_split_project: ${{ matrix.stripe_size != null && 'true' || 'false' }} + admin_api_key: ${{ secrets.NEON_STAGING_ADMIN_API_KEY }} + shard_count: 8 + stripe_size: ${{ matrix.stripe_size }} - name: Initialize Neon project - if: ${{ matrix.target_project == 'new_empty_project' }} + if: ${{ startsWith(matrix.target_project, 'new_empty_project') }} env: BENCHMARK_INGEST_TARGET_CONNSTR: ${{ steps.create-neon-project-ingest-target.outputs.dsn }} NEW_PROJECT_ID: ${{ steps.create-neon-project-ingest-target.outputs.project_id }} @@ -130,7 +151,7 @@ jobs: test_selection: performance/test_perf_ingest_using_pgcopydb.py run_in_parallel: false extra_params: -s -m remote_cluster --timeout 86400 -k test_ingest_performance_using_pgcopydb - pg_version: v16 + pg_version: v${{ matrix.postgres_version }} save_perf_report: true aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: @@ -146,7 +167,7 @@ jobs: ${PSQL} "${BENCHMARK_INGEST_TARGET_CONNSTR}" -c "\dt+" - name: Delete Neon Project - if: ${{ always() && matrix.target_project == 'new_empty_project' }} + if: ${{ always() && startsWith(matrix.target_project, 'new_empty_project') }} uses: ./.github/actions/neon-project-delete with: project_id: ${{ steps.create-neon-project-ingest-target.outputs.project_id }} From 62a717a2ca8218bacf90a36cc84ebb6aa2976711 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 28 Jan 2025 13:11:51 +0000 Subject: [PATCH 273/583] pageserver: use PS node id for SK 
appname (#10522) ## Problem This one is fairly embarrassing. Safekeeper node id was used in the pageserver application name when connecting to safekeepers. ## Summary of changes Use the right node id. Closes https://github.com/neondatabase/neon/issues/10461 --- .../tenant/timeline/walreceiver/walreceiver_connection.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 01c272633c..d69e7dbd32 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -118,7 +118,7 @@ pub(super) async fn handle_walreceiver_connection( cancellation: CancellationToken, connect_timeout: Duration, ctx: RequestContext, - node: NodeId, + safekeeper_node: NodeId, ingest_batch_size: u64, ) -> Result<(), WalReceiverError> { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -140,7 +140,7 @@ pub(super) async fn handle_walreceiver_connection( let (replication_client, connection) = { let mut config = wal_source_connconf.to_tokio_postgres_config(); - config.application_name(format!("pageserver-{}", node.0).as_str()); + config.application_name(format!("pageserver-{}", timeline.conf.id.0).as_str()); config.replication_mode(tokio_postgres::config::ReplicationMode::Physical); match time::timeout(connect_timeout, config.connect(postgres::NoTls)).await { Ok(client_and_conn) => client_and_conn?, @@ -162,7 +162,7 @@ pub(super) async fn handle_walreceiver_connection( latest_wal_update: Utc::now().naive_utc(), streaming_lsn: None, commit_lsn: None, - node, + node: safekeeper_node, }; if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status)) { warn!("Wal connection event listener dropped right after connection init, aborting the connection: {e}"); From ed942b05f7af31082e9e8c893910b757e3de76f3 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 28 Jan 2025 14:33:58 +0100 Subject: [PATCH 274/583] Revert "pageserver: revert flush backpressure" (#10402)" (#10533) This reverts commit 9e55d798036d646cbc27cd0bb2c849ed2e48ce85. We'll still need this until we can tune L0 flush backpressure and compaction. I'll add a setting to disable this separately. 
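For readers skimming the revert, here is a conceptual, self-contained sketch of what the re-introduced backpressure does. The types below are stand-ins, not pageserver APIs; the actual change in the `timeline.rs` hunk calls `RemoteTimelineClient::wait_completion()` inside the flush loop and records the wait in the new `pageserver_flush_wait_upload_seconds` gauge.

```rust
// Conceptual sketch only; stand-in types, not pageserver APIs.
use std::time::Instant;

struct UploadQueueStandIn;

impl UploadQueueStandIn {
    // Stand-in for waiting until all previously scheduled uploads finish.
    fn wait_completion(&self) {}
}

fn flush_step_with_backpressure(queue: &UploadQueueStandIn, wait_gauge_secs: &mut f64) {
    let start = Instant::now();
    // The flush loop does not proceed (so ingest is back-pressured) until the
    // preceding layer uploads have reached remote storage.
    queue.wait_completion();
    *wait_gauge_secs += start.elapsed().as_secs_f64();
}

fn main() {
    let mut waited = 0.0;
    flush_step_with_backpressure(&UploadQueueStandIn, &mut waited);
    println!("waited {waited:.3}s for uploads before continuing the flush loop");
}
```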
--- pageserver/src/metrics.rs | 25 ++++++++++- pageserver/src/tenant/timeline.rs | 38 +++++++++++++---- test_runner/fixtures/metrics.py | 1 + test_runner/regress/test_branching.py | 13 ++++-- test_runner/regress/test_remote_storage.py | 48 ++++++++++++++++++++++ 5 files changed, 112 insertions(+), 13 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 5247a4a2ac..d2c778276d 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -3,7 +3,7 @@ use metrics::{ register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec, register_int_counter, register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge, register_uint_gauge_vec, - Counter, CounterVec, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair, + Counter, CounterVec, Gauge, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair, IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec, }; use once_cell::sync::Lazy; @@ -398,6 +398,15 @@ pub(crate) static WAIT_LSN_TIME: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +static FLUSH_WAIT_UPLOAD_TIME: Lazy = Lazy::new(|| { + register_gauge_vec!( + "pageserver_flush_wait_upload_seconds", + "Time spent waiting for preceding uploads during layer flush", + &["tenant_id", "shard_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + static LAST_RECORD_LSN: Lazy = Lazy::new(|| { register_int_gauge_vec!( "pageserver_last_record_lsn", @@ -2569,6 +2578,7 @@ pub(crate) struct TimelineMetrics { timeline_id: String, pub flush_time_histo: StorageTimeMetrics, pub flush_delay_histo: StorageTimeMetrics, + pub flush_wait_upload_time_gauge: Gauge, pub compact_time_histo: StorageTimeMetrics, pub create_images_time_histo: StorageTimeMetrics, pub logical_size_histo: StorageTimeMetrics, @@ -2620,6 +2630,9 @@ impl TimelineMetrics { &shard_id, &timeline_id, ); + let flush_wait_upload_time_gauge = FLUSH_WAIT_UPLOAD_TIME + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) + .unwrap(); let compact_time_histo = StorageTimeMetrics::new( StorageTimeOperation::Compact, &tenant_id, @@ -2766,6 +2779,7 @@ impl TimelineMetrics { timeline_id, flush_time_histo, flush_delay_histo, + flush_wait_upload_time_gauge, compact_time_histo, create_images_time_histo, logical_size_histo, @@ -2815,6 +2829,14 @@ impl TimelineMetrics { self.resident_physical_size_gauge.get() } + pub(crate) fn flush_wait_upload_time_gauge_add(&self, duration: f64) { + self.flush_wait_upload_time_gauge.add(duration); + crate::metrics::FLUSH_WAIT_UPLOAD_TIME + .get_metric_with_label_values(&[&self.tenant_id, &self.shard_id, &self.timeline_id]) + .unwrap() + .add(duration); + } + pub(crate) fn shutdown(&self) { let was_shutdown = self .shutdown @@ -2832,6 +2854,7 @@ impl TimelineMetrics { let shard_id = &self.shard_id; let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = DISK_CONSISTENT_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]); + let _ = FLUSH_WAIT_UPLOAD_TIME.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = STANDBY_HORIZON.remove_label_values(&[tenant_id, shard_id, timeline_id]); { RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get()); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 07689b5e76..61981db24a 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -144,15 +144,19 
@@ use self::layer_manager::LayerManager; use self::logical_size::LogicalSize; use self::walreceiver::{WalReceiver, WalReceiverConf}; -use super::config::TenantConf; -use super::remote_timeline_client::index::IndexPart; -use super::remote_timeline_client::RemoteTimelineClient; -use super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline}; -use super::storage_layer::{LayerFringe, LayerVisibilityHint, ReadableLayer}; -use super::upload_queue::NotInitialized; -use super::GcError; use super::{ - debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf, MaybeOffloaded, + config::TenantConf, storage_layer::LayerVisibilityHint, upload_queue::NotInitialized, + MaybeOffloaded, +}; +use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf}; +use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe}; +use super::{ + remote_timeline_client::RemoteTimelineClient, remote_timeline_client::WaitCompletionError, + storage_layer::ReadableLayer, +}; +use super::{ + secondary::heatmap::{HeatMapLayer, HeatMapTimeline}, + GcError, }; #[cfg(test)] @@ -4034,6 +4038,24 @@ impl Timeline { // release lock on 'layers' }; + // Backpressure mechanism: wait with continuation of the flush loop until we have uploaded all layer files. + // This makes us refuse ingest until the new layers have been persisted to the remote + let start = Instant::now(); + self.remote_client + .wait_completion() + .await + .map_err(|e| match e { + WaitCompletionError::UploadQueueShutDownOrStopped + | WaitCompletionError::NotInitialized( + NotInitialized::ShuttingDown | NotInitialized::Stopped, + ) => FlushLayerError::Cancelled, + WaitCompletionError::NotInitialized(NotInitialized::Uninitialized) => { + FlushLayerError::Other(anyhow!(e).into()) + } + })?; + let duration = start.elapsed().as_secs_f64(); + self.metrics.flush_wait_upload_time_gauge_add(duration); + // FIXME: between create_delta_layer and the scheduling of the upload in `update_metadata_file`, // a compaction can delete the file and then it won't be available for uploads any more. // We still schedule the upload, resulting in an error, but ideally we'd somehow avoid this diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 37859901d4..fd7e193778 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -165,6 +165,7 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] 
= ( "pageserver_evictions_with_low_residence_duration_total", "pageserver_aux_file_estimated_size", "pageserver_valid_lsn_lease_count", + "pageserver_flush_wait_upload_seconds", counter("pageserver_tenant_throttling_count_accounted_start"), counter("pageserver_tenant_throttling_count_accounted_finish"), counter("pageserver_tenant_throttling_wait_usecs_sum"), diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py index a4056404f0..34e4e994cb 100644 --- a/test_runner/regress/test_branching.py +++ b/test_runner/regress/test_branching.py @@ -19,7 +19,6 @@ from fixtures.pageserver.utils import wait_until_tenant_active from fixtures.utils import query_scalar from performance.test_perf_pgbench import get_scales_matrix from requests import RequestException -from requests.exceptions import RetryError # Test branch creation @@ -177,8 +176,11 @@ def test_cannot_create_endpoint_on_non_uploaded_timeline(neon_env_builder: NeonE env.neon_cli.mappings_map_branch(initial_branch, env.initial_tenant, env.initial_timeline) - with pytest.raises(RuntimeError, match="is not active, state: Loading"): - env.endpoints.create_start(initial_branch, tenant_id=env.initial_tenant) + with pytest.raises(RuntimeError, match="ERROR: Not found: Timeline"): + env.endpoints.create_start( + initial_branch, tenant_id=env.initial_tenant, basebackup_request_tries=2 + ) + ps_http.configure_failpoints(("before-upload-index-pausable", "off")) finally: env.pageserver.stop(immediate=True) @@ -219,7 +221,10 @@ def test_cannot_branch_from_non_uploaded_branch(neon_env_builder: NeonEnvBuilder branch_id = TimelineId.generate() - with pytest.raises(RetryError, match="too many 503 error responses"): + with pytest.raises( + PageserverApiException, + match="Cannot branch off the timeline that's not present in pageserver", + ): ps_http.timeline_create( env.pg_version, env.initial_tenant, diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index f6bc6f6f41..c39c74fa2a 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -786,6 +786,54 @@ def test_empty_branch_remote_storage_upload_on_restart(neon_env_builder: NeonEnv create_thread.join() +def test_paused_upload_stalls_checkpoint( + neon_env_builder: NeonEnvBuilder, +): + """ + This test checks that checkpoints block on uploads to remote storage. + """ + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) + + env = neon_env_builder.init_start( + initial_tenant_conf={ + # Set a small compaction threshold + "compaction_threshold": "3", + # Disable GC + "gc_period": "0s", + # disable PITR + "pitr_interval": "0s", + } + ) + + env.pageserver.allowed_errors.append( + f".*PUT.* path=/v1/tenant/{env.initial_tenant}/timeline.* request was dropped before completing" + ) + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + client = env.pageserver.http_client() + layers_at_creation = client.layer_map_info(tenant_id, timeline_id) + deltas_at_creation = len(layers_at_creation.delta_layers()) + assert ( + deltas_at_creation == 1 + ), "are you fixing #5863? make sure we end up with 2 deltas at the end of endpoint lifecycle" + + # Make new layer uploads get stuck. + # Note that timeline creation waits for the initial layers to reach remote storage. + # So at this point, the `layers_at_creation` are in remote storage. 
+ client.configure_failpoints(("before-upload-layer-pausable", "pause")) + + with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: + # Build two tables with some data inside + endpoint.safe_psql("CREATE TABLE foo AS SELECT x FROM generate_series(1, 10000) g(x)") + wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) + + with pytest.raises(ReadTimeout): + client.timeline_checkpoint(tenant_id, timeline_id, timeout=5) + client.configure_failpoints(("before-upload-layer-pausable", "off")) + + def wait_upload_queue_empty( client: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId ): From 83b6bfa229e2b983ab93a1e5401f3a31b59e3c03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 28 Jan 2025 14:39:53 +0100 Subject: [PATCH 275/583] Re-download layer if its local and on-disk metadata diverge (#10529) In #10308, we noticed many warnings about the local layer having different sizes on-disk compared to the metadata. However, the layer downloader would never redownload layer files if the sizes or generation numbers change. This is obviously a bug, which we aim to fix with this PR. This change also moves the code deciding what to do about a layer to a dedicated function: before we handled the "routing" via control flow, but now it's become too complicated and it is nicer to have the different verdicts for a layer spelled out in a list/match. --- .../tenant/remote_timeline_client/index.rs | 4 + pageserver/src/tenant/secondary/downloader.rs | 155 +++++++++++------- 2 files changed, 99 insertions(+), 60 deletions(-) diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 3824bc8f11..b8b18005fd 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -222,6 +222,10 @@ impl LayerFileMetadata { shard, } } + /// Helper to get both generation and file size in a tuple + pub fn generation_file_size(&self) -> (Generation, u64) { + (self.generation, self.file_size) + } } /// Limited history of earlier ancestors. diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 395e34e404..cf524fcb25 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -559,6 +559,13 @@ impl JobGenerator { @@ -1008,69 +1015,17 @@ impl<'a> TenantDownloader<'a> { return (Err(UpdateError::Restart), touched); } - // Existing on-disk layers: just update their access time. - if let Some(on_disk) = timeline_state.on_disk_layers.get(&layer.name) { - tracing::debug!("Layer {} is already on disk", layer.name); - - if cfg!(debug_assertions) { - // Debug for https://github.com/neondatabase/neon/issues/6966: check that the files we think - // are already present on disk are really there. - match tokio::fs::metadata(&on_disk.local_path).await { - Ok(meta) => { - tracing::debug!( - "Layer {} present at {}, size {}", - layer.name, - on_disk.local_path, - meta.len(), - ); - } - Err(e) => { - tracing::warn!( - "Layer {} not found at {} ({})", - layer.name, - on_disk.local_path, - e - ); - debug_assert!(false); - } - } - } - - if on_disk.metadata != layer.metadata || on_disk.access_time != layer.access_time { - // We already have this layer on disk. Update its access time. 
- tracing::debug!( - "Access time updated for layer {}: {} -> {}", - layer.name, - strftime(&on_disk.access_time), - strftime(&layer.access_time) - ); - touched.push(layer); - } - continue; - } else { - tracing::debug!("Layer {} not present on disk yet", layer.name); - } - - // Eviction: if we evicted a layer, then do not re-download it unless it was accessed more - // recently than it was evicted. - if let Some(evicted_at) = timeline_state.evicted_at.get(&layer.name) { - if &layer.access_time > evicted_at { - tracing::info!( - "Re-downloading evicted layer {}, accessed at {}, evicted at {}", - layer.name, - strftime(&layer.access_time), - strftime(evicted_at) - ); - } else { - tracing::trace!( - "Not re-downloading evicted layer {}, accessed at {}, evicted at {}", - layer.name, - strftime(&layer.access_time), - strftime(evicted_at) - ); + match self.layer_action(&timeline_state, &layer).await { + LayerAction::Download => (), + LayerAction::NoAction => continue, + LayerAction::Skip => { self.skip_layer(layer); continue; } + LayerAction::Touch => { + touched.push(layer); + continue; + } } match self @@ -1091,6 +1046,86 @@ impl<'a> TenantDownloader<'a> { (Ok(()), touched) } + async fn layer_action( + &self, + timeline_state: &SecondaryDetailTimeline, + layer: &HeatMapLayer, + ) -> LayerAction { + // Existing on-disk layers: just update their access time. + if let Some(on_disk) = timeline_state.on_disk_layers.get(&layer.name) { + tracing::debug!("Layer {} is already on disk", layer.name); + + if cfg!(debug_assertions) { + // Debug for https://github.com/neondatabase/neon/issues/6966: check that the files we think + // are already present on disk are really there. + match tokio::fs::metadata(&on_disk.local_path).await { + Ok(meta) => { + tracing::debug!( + "Layer {} present at {}, size {}", + layer.name, + on_disk.local_path, + meta.len(), + ); + } + Err(e) => { + tracing::warn!( + "Layer {} not found at {} ({})", + layer.name, + on_disk.local_path, + e + ); + debug_assert!(false); + } + } + } + + if on_disk.metadata.generation_file_size() != on_disk.metadata.generation_file_size() { + tracing::info!( + "Re-downloading layer {} with changed size or generation: {:?}->{:?}", + layer.name, + on_disk.metadata.generation_file_size(), + on_disk.metadata.generation_file_size() + ); + return LayerAction::Download; + } + if on_disk.metadata != layer.metadata || on_disk.access_time != layer.access_time { + // We already have this layer on disk. Update its access time. + tracing::debug!( + "Access time updated for layer {}: {} -> {}", + layer.name, + strftime(&on_disk.access_time), + strftime(&layer.access_time) + ); + return LayerAction::Touch; + } + return LayerAction::NoAction; + } else { + tracing::debug!("Layer {} not present on disk yet", layer.name); + } + + // Eviction: if we evicted a layer, then do not re-download it unless it was accessed more + // recently than it was evicted. 
+ if let Some(evicted_at) = timeline_state.evicted_at.get(&layer.name) { + if &layer.access_time > evicted_at { + tracing::info!( + "Re-downloading evicted layer {}, accessed at {}, evicted at {}", + layer.name, + strftime(&layer.access_time), + strftime(evicted_at) + ); + } else { + tracing::trace!( + "Not re-downloading evicted layer {}, accessed at {}, evicted at {}", + layer.name, + strftime(&layer.access_time), + strftime(evicted_at) + ); + return LayerAction::Skip; + } + } + LayerAction::Download + } + async fn download_timeline( &self, timeline: HeatMapTimeline, From 47677ba578619cce5aa961840489515fbbfbbe3a Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 28 Jan 2025 15:51:30 +0100 Subject: [PATCH 276/583] pageserver: disable L0 backpressure by default (#10535) ## Problem We'll need further improvements to compaction before enabling L0 flush backpressure by default. See: https://neondb.slack.com/archives/C033RQ5SPDH/p1738066068960519?thread_ts=1737818888.474179&cid=C033RQ5SPDH. Touches #5415. ## Summary of changes Disable `l0_flush_delay_threshold` by default. --- libs/pageserver_api/src/config.rs | 7 +++---- pageserver/src/tenant/timeline.rs | 20 +++++++------------- 2 files changed, 10 insertions(+), 17 deletions(-) diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 5866145690..cc6e4a3699 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -260,11 +260,10 @@ pub struct TenantConfigToml { /// Level0 delta layer threshold at which to delay layer flushes for compaction backpressure, /// such that they take 2x as long, and start waiting for layer flushes during ephemeral layer /// rolls. This helps compaction keep up with WAL ingestion, and avoids read amplification - /// blowing up. Should be >compaction_threshold. If None, defaults to 2 * compaction_threshold. - /// 0 to disable. + /// blowing up. Should be >compaction_threshold. 0 to disable. Disabled by default. pub l0_flush_delay_threshold: Option, - /// Level0 delta layer threshold at which to stall layer flushes. 0 to disable. If None, - /// defaults to 4 * compaction_threshold. Must be >compaction_threshold to avoid deadlock. + /// Level0 delta layer threshold at which to stall layer flushes. Must be >compaction_threshold + /// to avoid deadlock. 0 to disable. Disabled by default. pub l0_flush_stall_threshold: Option, // Determines how much history is retained, to allow // branching and read replicas at an older point in time. diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 61981db24a..280a3baa21 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2172,8 +2172,8 @@ impl Timeline { } fn get_l0_flush_delay_threshold(&self) -> Option { - // Default to delay L0 flushes at 3x compaction threshold. - const DEFAULT_L0_FLUSH_DELAY_FACTOR: usize = 3; + // Disable L0 flushes by default. This and compaction needs further tuning. + const DEFAULT_L0_FLUSH_DELAY_FACTOR: usize = 0; // TODO: default to e.g. 3 // If compaction is disabled, don't delay. if self.get_compaction_period() == Duration::ZERO { @@ -2201,10 +2201,9 @@ impl Timeline { } fn get_l0_flush_stall_threshold(&self) -> Option { - // Default to stall L0 flushes at 5x compaction threshold. - // TODO: stalls are temporarily disabled by default, see below. - #[allow(unused)] - const DEFAULT_L0_FLUSH_STALL_FACTOR: usize = 5; + // Disable L0 stalls by default. 
In ingest benchmarks, we see image compaction take >10 + // minutes, blocking L0 compaction, and we can't stall L0 flushes for that long. + const DEFAULT_L0_FLUSH_STALL_FACTOR: usize = 0; // TODO: default to e.g. 5 // If compaction is disabled, don't stall. if self.get_compaction_period() == Duration::ZERO { @@ -2236,13 +2235,8 @@ impl Timeline { return None; } - // Disable stalls by default. In ingest benchmarks, we see image compaction take >10 - // minutes, blocking L0 compaction, and we can't stall L0 flushes for that long. - // - // TODO: fix this. - // let l0_flush_stall_threshold = l0_flush_stall_threshold - // .unwrap_or(DEFAULT_L0_FLUSH_STALL_FACTOR * compaction_threshold); - let l0_flush_stall_threshold = l0_flush_stall_threshold?; + let l0_flush_stall_threshold = l0_flush_stall_threshold + .unwrap_or(DEFAULT_L0_FLUSH_STALL_FACTOR * compaction_threshold); // 0 disables backpressure. if l0_flush_stall_threshold == 0 { From 15fecb847476bcdbd0044a85b1a0a7e1d7d50d30 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 28 Jan 2025 09:32:59 -0600 Subject: [PATCH 277/583] Update axum to 0.8.1 (#10332) Only a few things that needed updating: - async_trait was removed - Message::Text takes a Utf8Bytes object instead of a String Signed-off-by: Tristan Partin Co-authored-by: Conrad Ludgate --- Cargo.lock | 125 +++++++++++--------- Cargo.toml | 2 +- compute_tools/src/http/extract/json.rs | 6 +- compute_tools/src/http/extract/path.rs | 6 +- compute_tools/src/http/extract/query.rs | 6 +- compute_tools/src/http/server.rs | 2 +- deny.toml | 2 +- libs/vm_monitor/src/dispatcher.rs | 12 +- pageserver/compaction/src/simulator/draw.rs | 13 +- 9 files changed, 87 insertions(+), 87 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a201a6abae..3c33901247 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -179,7 +179,7 @@ dependencies = [ "nom", "num-traits", "rusticata-macros", - "thiserror", + "thiserror 1.0.69", "time", ] @@ -718,14 +718,14 @@ dependencies = [ [[package]] name = "axum" -version = "0.7.9" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "edca88bc138befd0323b20752846e6587272d3b03b0343c8ea28a6f819e6e71f" +checksum = "6d6fd624c75e18b3b4c6b9caf42b1afe24437daaee904069137d8bab077be8b8" dependencies = [ - "async-trait", "axum-core", "base64 0.22.1", "bytes", + "form_urlencoded", "futures-util", "http 1.1.0", "http-body 1.0.0", @@ -733,7 +733,7 @@ dependencies = [ "hyper 1.4.1", "hyper-util", "itoa", - "matchit 0.7.0", + "matchit", "memchr", "mime", "percent-encoding", @@ -746,7 +746,7 @@ dependencies = [ "sha1", "sync_wrapper 1.0.1", "tokio", - "tokio-tungstenite 0.24.0", + "tokio-tungstenite 0.26.1", "tower 0.5.2", "tower-layer", "tower-service", @@ -755,11 +755,10 @@ dependencies = [ [[package]] name = "axum-core" -version = "0.4.5" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09f2bd6146b97ae3359fa0cc6d6b376d9539582c7b4220f041a33ec24c226199" +checksum = "df1362f362fd16024ae199c1970ce98f9661bf5ef94b9808fee734bc3698b733" dependencies = [ - "async-trait", "bytes", "futures-util", "http 1.1.0", @@ -1130,7 +1129,7 @@ dependencies = [ "log", "nix 0.25.1", "regex", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -1311,7 +1310,7 @@ dependencies = [ "serde_with", "signal-hook", "tar", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-postgres 0.7.7", "tokio-stream", @@ -1420,7 +1419,7 @@ dependencies = [ "serde", "serde_json", "storage_broker", - "thiserror", + "thiserror 1.0.69", "tokio", 
"tokio-postgres 0.7.7", "tokio-util", @@ -2264,7 +2263,7 @@ dependencies = [ "pin-project", "rand 0.8.5", "sha1", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-util", ] @@ -3390,12 +3389,6 @@ dependencies = [ "regex-automata 0.1.10", ] -[[package]] -name = "matchit" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b87248edafb776e59e6ee64a79086f65890d3510f2c656c000bf2a7e8a0aea40" - [[package]] name = "matchit" version = "0.8.4" @@ -3786,7 +3779,7 @@ dependencies = [ "serde_json", "serde_path_to_error", "sha2", - "thiserror", + "thiserror 1.0.69", "url", ] @@ -3836,7 +3829,7 @@ dependencies = [ "futures-sink", "js-sys", "pin-project-lite", - "thiserror", + "thiserror 1.0.69", "tracing", ] @@ -3868,7 +3861,7 @@ dependencies = [ "opentelemetry_sdk", "prost", "reqwest", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -3904,7 +3897,7 @@ dependencies = [ "percent-encoding", "rand 0.8.5", "serde_json", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-stream", "tracing", @@ -4018,7 +4011,7 @@ dependencies = [ "remote_storage", "serde_json", "svg_fmt", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-util", "utils", @@ -4094,7 +4087,7 @@ dependencies = [ "strum_macros", "sysinfo", "tenant_size_model", - "thiserror", + "thiserror 1.0.69", "tikv-jemallocator", "tokio", "tokio-epoll-uring", @@ -4140,7 +4133,7 @@ dependencies = [ "storage_broker", "strum", "strum_macros", - "thiserror", + "thiserror 1.0.69", "utils", ] @@ -4155,7 +4148,7 @@ dependencies = [ "postgres", "reqwest", "serde", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-postgres 0.7.7", "tokio-stream", @@ -4559,7 +4552,7 @@ dependencies = [ "rustls 0.23.18", "rustls-pemfile 2.1.1", "serde", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-postgres 0.7.7", "tokio-postgres-rustls", @@ -4597,7 +4590,7 @@ dependencies = [ "pprof", "regex", "serde", - "thiserror", + "thiserror 1.0.69", "tracing", "utils", ] @@ -4608,7 +4601,7 @@ version = "0.1.0" dependencies = [ "anyhow", "camino", - "thiserror", + "thiserror 1.0.69", "tokio", "workspace_hack", ] @@ -4641,7 +4634,7 @@ dependencies = [ "smallvec", "symbolic-demangle", "tempfile", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -4673,7 +4666,7 @@ dependencies = [ "postgres-protocol 0.6.4", "rand 0.8.5", "serde", - "thiserror", + "thiserror 1.0.69", "tokio", ] @@ -4744,7 +4737,7 @@ dependencies = [ "memchr", "parking_lot 0.12.1", "procfs", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -4914,7 +4907,7 @@ dependencies = [ "strum", "strum_macros", "subtle", - "thiserror", + "thiserror 1.0.69", "tikv-jemalloc-ctl", "tikv-jemallocator", "tokio", @@ -5311,7 +5304,7 @@ dependencies = [ "http 1.1.0", "reqwest", "serde", - "thiserror", + "thiserror 1.0.69", "tower-service", ] @@ -5331,7 +5324,7 @@ dependencies = [ "reqwest", "reqwest-middleware", "retry-policies", - "thiserror", + "thiserror 1.0.69", "tokio", "tracing", "wasm-timer", @@ -5347,7 +5340,7 @@ dependencies = [ "async-trait", "getrandom 0.2.11", "http 1.1.0", - "matchit 0.8.4", + "matchit", "opentelemetry", "reqwest", "reqwest-middleware", @@ -5726,7 +5719,7 @@ dependencies = [ "storage_broker", "strum", "strum_macros", - "thiserror", + "thiserror 1.0.69", "tikv-jemallocator", "tokio", "tokio-io-timeout", @@ -5765,7 +5758,7 @@ dependencies = [ "reqwest", "safekeeper_api", "serde", - "thiserror", + "thiserror 1.0.69", "utils", "workspace_hack", ] @@ -5974,7 +5967,7 @@ dependencies = [ "rand 0.8.5", "serde", "serde_json", - "thiserror", + "thiserror 
1.0.69", "time", "url", "uuid", @@ -6046,7 +6039,7 @@ checksum = "c7715380eec75f029a4ef7de39a9200e0a63823176b759d055b613f5a87df6a6" dependencies = [ "percent-encoding", "serde", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -6208,7 +6201,7 @@ checksum = "adc4e5204eb1910f40f9cfa375f6f05b68c3abac4b6fd879c8ff5e7ae8a0a085" dependencies = [ "num-bigint", "num-traits", - "thiserror", + "thiserror 1.0.69", "time", ] @@ -6353,7 +6346,7 @@ dependencies = [ "serde_json", "strum", "strum_macros", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-util", "tracing", @@ -6645,7 +6638,16 @@ version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" dependencies = [ - "thiserror-impl", + "thiserror-impl 1.0.69", +] + +[[package]] +name = "thiserror" +version = "2.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d452f284b73e6d76dd36758a0c8684b1d5be31f92b89d07fd5822175732206fc" +dependencies = [ + "thiserror-impl 2.0.11", ] [[package]] @@ -6659,6 +6661,17 @@ dependencies = [ "syn 2.0.90", ] +[[package]] +name = "thiserror-impl" +version = "2.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26afc1baea8a989337eeb52b6e72a039780ce45c3edfcc9c5b9d112feeb173c2" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.90", +] + [[package]] name = "thread_local" version = "1.1.7" @@ -6815,7 +6828,7 @@ dependencies = [ "nix 0.26.4", "once_cell", "scopeguard", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-util", "tracing", @@ -6998,14 +7011,14 @@ dependencies = [ [[package]] name = "tokio-tungstenite" -version = "0.24.0" +version = "0.26.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "edc5f74e248dc973e0dbb7b74c7e0d6fcc301c694ff50049504004ef4d0cdcd9" +checksum = "be4bf6fecd69fcdede0ec680aaf474cdab988f9de6bc73d3758f0160e3b7025a" dependencies = [ "futures-util", "log", "tokio", - "tungstenite 0.24.0", + "tungstenite 0.26.1", ] [[package]] @@ -7315,16 +7328,16 @@ dependencies = [ "log", "rand 0.8.5", "sha1", - "thiserror", + "thiserror 1.0.69", "url", "utf-8", ] [[package]] name = "tungstenite" -version = "0.24.0" +version = "0.26.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "18e5b8366ee7a95b16d32197d0b2604b43a0be89dc5fac9f8e96ccafbaedda8a" +checksum = "413083a99c579593656008130e29255e54dcaae495be556cc26888f211648c24" dependencies = [ "byteorder", "bytes", @@ -7334,7 +7347,7 @@ dependencies = [ "log", "rand 0.8.5", "sha1", - "thiserror", + "thiserror 2.0.11", "utf-8", ] @@ -7529,7 +7542,7 @@ dependencies = [ "signal-hook", "strum", "strum_macros", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-stream", "tokio-tar", @@ -7629,7 +7642,7 @@ dependencies = [ "remote_storage", "serde", "serde_json", - "thiserror", + "thiserror 1.0.69", "tikv-jemallocator", "tokio", "tokio-util", @@ -8158,7 +8171,7 @@ dependencies = [ "ring", "signature 2.2.0", "spki 0.7.3", - "thiserror", + "thiserror 1.0.69", "zeroize", ] @@ -8175,7 +8188,7 @@ dependencies = [ "nom", "oid-registry", "rusticata-macros", - "thiserror", + "thiserror 1.0.69", "time", ] diff --git a/Cargo.toml b/Cargo.toml index 6e1e288895..9ccdb45f6d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -65,7 +65,7 @@ aws-smithy-types = "1.2" aws-credential-types = "1.2.0" aws-sigv4 = { version = "1.2", features = ["sign-http"] } aws-types = "1.3" -axum = { version = "0.7.9", features = ["ws"] } +axum = { version = "0.8.1", 
features = ["ws"] } base64 = "0.13.0" bincode = "1.3" bindgen = "0.70" diff --git a/compute_tools/src/http/extract/json.rs b/compute_tools/src/http/extract/json.rs index 41f13625ad..104cc25d5f 100644 --- a/compute_tools/src/http/extract/json.rs +++ b/compute_tools/src/http/extract/json.rs @@ -1,9 +1,6 @@ use std::ops::{Deref, DerefMut}; -use axum::{ - async_trait, - extract::{rejection::JsonRejection, FromRequest, Request}, -}; +use axum::extract::{rejection::JsonRejection, FromRequest, Request}; use compute_api::responses::GenericAPIError; use http::StatusCode; @@ -12,7 +9,6 @@ use http::StatusCode; #[derive(Debug, Clone, Copy, Default)] pub(crate) struct Json(pub T); -#[async_trait] impl FromRequest for Json where axum::Json: FromRequest, diff --git a/compute_tools/src/http/extract/path.rs b/compute_tools/src/http/extract/path.rs index 95edc657f2..09637a96a4 100644 --- a/compute_tools/src/http/extract/path.rs +++ b/compute_tools/src/http/extract/path.rs @@ -1,9 +1,6 @@ use std::ops::{Deref, DerefMut}; -use axum::{ - async_trait, - extract::{rejection::PathRejection, FromRequestParts}, -}; +use axum::extract::{rejection::PathRejection, FromRequestParts}; use compute_api::responses::GenericAPIError; use http::{request::Parts, StatusCode}; @@ -12,7 +9,6 @@ use http::{request::Parts, StatusCode}; #[derive(Debug, Clone, Copy, Default)] pub(crate) struct Path(pub T); -#[async_trait] impl FromRequestParts for Path where axum::extract::Path: FromRequestParts, diff --git a/compute_tools/src/http/extract/query.rs b/compute_tools/src/http/extract/query.rs index a1f1b0cef0..9dec3642cf 100644 --- a/compute_tools/src/http/extract/query.rs +++ b/compute_tools/src/http/extract/query.rs @@ -1,9 +1,6 @@ use std::ops::{Deref, DerefMut}; -use axum::{ - async_trait, - extract::{rejection::QueryRejection, FromRequestParts}, -}; +use axum::extract::{rejection::QueryRejection, FromRequestParts}; use compute_api::responses::GenericAPIError; use http::{request::Parts, StatusCode}; @@ -12,7 +9,6 @@ use http::{request::Parts, StatusCode}; #[derive(Debug, Clone, Copy, Default)] pub(crate) struct Query(pub T); -#[async_trait] impl FromRequestParts for Query where axum::extract::Query: FromRequestParts, diff --git a/compute_tools/src/http/server.rs b/compute_tools/src/http/server.rs index 40fb1f4b4d..da650585fc 100644 --- a/compute_tools/src/http/server.rs +++ b/compute_tools/src/http/server.rs @@ -55,7 +55,7 @@ async fn serve(port: u16, compute: Arc) { .route("/database_schema", get(database_schema::get_schema_dump)) .route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects)) .route( - "/extension_server/*filename", + "/extension_server/{*filename}", post(extension_server::download_extension), ) .route("/extensions", post(extensions::install_extension)) diff --git a/deny.toml b/deny.toml index ff8d71cda5..df00a34c60 100644 --- a/deny.toml +++ b/deny.toml @@ -41,8 +41,8 @@ allow = [ "MIT", "MPL-2.0", "OpenSSL", - "Unicode-DFS-2016", "Unicode-3.0", + "Zlib", ] confidence-threshold = 0.8 exceptions = [ diff --git a/libs/vm_monitor/src/dispatcher.rs b/libs/vm_monitor/src/dispatcher.rs index 6a965ace9b..c81848cb70 100644 --- a/libs/vm_monitor/src/dispatcher.rs +++ b/libs/vm_monitor/src/dispatcher.rs @@ -7,7 +7,7 @@ //! (notifying it of upscale). 
use anyhow::{bail, Context}; -use axum::extract::ws::{Message, WebSocket}; +use axum::extract::ws::{Message, Utf8Bytes, WebSocket}; use futures::{ stream::{SplitSink, SplitStream}, SinkExt, StreamExt, @@ -82,21 +82,21 @@ impl Dispatcher { let highest_shared_version = match monitor_range.highest_shared_version(&agent_range) { Ok(version) => { - sink.send(Message::Text( + sink.send(Message::Text(Utf8Bytes::from( serde_json::to_string(&ProtocolResponse::Version(version)).unwrap(), - )) + ))) .await .context("failed to notify agent of negotiated protocol version")?; version } Err(e) => { - sink.send(Message::Text( + sink.send(Message::Text(Utf8Bytes::from( serde_json::to_string(&ProtocolResponse::Error(format!( "Received protocol version range {} which does not overlap with {}", agent_range, monitor_range ))) .unwrap(), - )) + ))) .await .context("failed to notify agent of no overlap between protocol version ranges")?; Err(e).context("error determining suitable protocol version range")? @@ -126,7 +126,7 @@ impl Dispatcher { let json = serde_json::to_string(&message).context("failed to serialize message")?; self.sink - .send(Message::Text(json)) + .send(Message::Text(Utf8Bytes::from(json))) .await .context("stream error sending message") } diff --git a/pageserver/compaction/src/simulator/draw.rs b/pageserver/compaction/src/simulator/draw.rs index 997925067f..4559db09f1 100644 --- a/pageserver/compaction/src/simulator/draw.rs +++ b/pageserver/compaction/src/simulator/draw.rs @@ -160,9 +160,12 @@ pub fn draw_history(history: &[LayerTraceEvent], mut output: // Fill in and thicken rectangle if it's an // image layer so that we can see it. - let mut style = Style::default(); - style.fill = Fill::Color(rgb(0x80, 0x80, 0x80)); - style.stroke = Stroke::Color(rgb(0, 0, 0), 0.5); + let mut style = Style { + fill: Fill::Color(rgb(0x80, 0x80, 0x80)), + stroke: Stroke::Color(rgb(0, 0, 0), 0.5), + opacity: 1.0, + stroke_opacity: 1.0, + }; let y_start = lsn_max - lsn_start; let y_end = lsn_max - lsn_end; @@ -214,10 +217,6 @@ pub fn draw_history(history: &[LayerTraceEvent], mut output: files_seen.insert(f); } - let mut record_style = Style::default(); - record_style.fill = Fill::Color(rgb(0x80, 0x80, 0x80)); - record_style.stroke = Stroke::None; - writeln!(svg, "{}", EndSvg)?; let mut layer_events_str = String::new(); From ae4b2af299d555ca66ecc8cb79602163ae86d48b Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Tue, 28 Jan 2025 18:08:17 +0100 Subject: [PATCH 278/583] fix(proxy): Use correct identifier for usage metrics upload (#10538) ## Problem The request data and usage metrics S3 requests use the same identifier shown in logs, causing confusion about what type of upload failed. ## Summary of changes Use the correct identifier for usage metrics uploads. 
neondatabase/cloud#23084 --- proxy/src/context/parquet.rs | 4 ++-- proxy/src/usage_metrics.rs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index d7ffff0483..4f1dd39d92 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -423,11 +423,11 @@ async fn upload_parquet( .await .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) .and_then(|x| x) - .context("request_data_upload") + .with_context(|| format!("request_data_upload: path={path}")) .err(); if let Some(err) = maybe_err { - tracing::error!(%id, error = ?err, "failed to upload request data"); + tracing::error!(%id, %path, error = ?err, "failed to upload request data"); } Ok(buffer.writer()) diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs index 487504d709..e1cc7e87b4 100644 --- a/proxy/src/usage_metrics.rs +++ b/proxy/src/usage_metrics.rs @@ -396,13 +396,13 @@ async fn upload_backup_events( TimeoutOrCancel::caused_by_cancel, FAILED_UPLOAD_WARN_THRESHOLD, FAILED_UPLOAD_MAX_RETRIES, - "request_data_upload", + "usage_metrics_upload", cancel, ) .await .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) .and_then(|x| x) - .context("request_data_upload")?; + .with_context(|| format!("usage_metrics_upload: path={remote_path}"))?; Ok(()) } From 1010b8add432da557928d4708f21ed002893cfb7 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 28 Jan 2025 18:21:05 +0100 Subject: [PATCH 279/583] pageserver: add `l0_flush_wait_upload` setting (#10534) ## Problem We need a setting to disable the flush upload wait, to test L0 flush backpressure in staging. ## Summary of changes Add `l0_flush_wait_upload` setting. --- control_plane/src/pageserver.rs | 5 +++ libs/pageserver_api/src/config.rs | 7 ++++ libs/pageserver_api/src/models.rs | 6 +++ pageserver/src/tenant.rs | 1 + pageserver/src/tenant/config.rs | 11 +++++ pageserver/src/tenant/timeline.rs | 41 ++++++++++++------- .../regress/test_attach_tenant_config.py | 1 + 7 files changed, 57 insertions(+), 15 deletions(-) diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 967810ee06..52527ffa90 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -357,6 +357,11 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'l0_flush_delay_threshold' as an integer")?, + l0_flush_wait_upload: settings + .remove("l0_flush_wait_upload") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'l0_flush_wait_upload' as a boolean")?, l0_flush_stall_threshold: settings .remove("l0_flush_stall_threshold") .map(|x| x.parse::()) diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index cc6e4a3699..40c8837af5 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -265,6 +265,10 @@ pub struct TenantConfigToml { /// Level0 delta layer threshold at which to stall layer flushes. Must be >compaction_threshold /// to avoid deadlock. 0 to disable. Disabled by default. pub l0_flush_stall_threshold: Option, + /// If true, Level0 delta layer flushes will wait for S3 upload before flushing the next + /// layer. This is a temporary backpressure mechanism which should be removed once + /// l0_flush_{delay,stall}_threshold is fully enabled. + pub l0_flush_wait_upload: bool, // Determines how much history is retained, to allow // branching and read replicas at an older point in time. // The unit is #of bytes of WAL. 
@@ -522,6 +526,8 @@ pub mod tenant_conf_defaults { pub const DEFAULT_COMPACTION_ALGORITHM: crate::models::CompactionAlgorithm = crate::models::CompactionAlgorithm::Legacy; + pub const DEFAULT_L0_FLUSH_WAIT_UPLOAD: bool = true; + pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024; // Large DEFAULT_GC_PERIOD is fine as long as PITR_INTERVAL is larger. @@ -562,6 +568,7 @@ impl Default for TenantConfigToml { }, l0_flush_delay_threshold: None, l0_flush_stall_threshold: None, + l0_flush_wait_upload: DEFAULT_L0_FLUSH_WAIT_UPLOAD, gc_horizon: DEFAULT_GC_HORIZON, gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD) .expect("cannot parse default gc period"), diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 16473415b4..dcc233c7c4 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -466,6 +466,8 @@ pub struct TenantConfigPatch { #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub l0_flush_stall_threshold: FieldPatch, #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub l0_flush_wait_upload: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub gc_horizon: FieldPatch, #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub gc_period: FieldPatch, @@ -524,6 +526,7 @@ pub struct TenantConfig { pub compaction_algorithm: Option, pub l0_flush_delay_threshold: Option, pub l0_flush_stall_threshold: Option, + pub l0_flush_wait_upload: Option, pub gc_horizon: Option, pub gc_period: Option, pub image_creation_threshold: Option, @@ -559,6 +562,7 @@ impl TenantConfig { mut compaction_algorithm, mut l0_flush_delay_threshold, mut l0_flush_stall_threshold, + mut l0_flush_wait_upload, mut gc_horizon, mut gc_period, mut image_creation_threshold, @@ -597,6 +601,7 @@ impl TenantConfig { patch .l0_flush_stall_threshold .apply(&mut l0_flush_stall_threshold); + patch.l0_flush_wait_upload.apply(&mut l0_flush_wait_upload); patch.gc_horizon.apply(&mut gc_horizon); patch.gc_period.apply(&mut gc_period); patch @@ -651,6 +656,7 @@ impl TenantConfig { compaction_algorithm, l0_flush_delay_threshold, l0_flush_stall_threshold, + l0_flush_wait_upload, gc_horizon, gc_period, image_creation_threshold, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index d359270be4..4385fe9a9b 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -5467,6 +5467,7 @@ pub(crate) mod harness { compaction_algorithm: Some(tenant_conf.compaction_algorithm), l0_flush_delay_threshold: tenant_conf.l0_flush_delay_threshold, l0_flush_stall_threshold: tenant_conf.l0_flush_stall_threshold, + l0_flush_wait_upload: Some(tenant_conf.l0_flush_wait_upload), gc_horizon: Some(tenant_conf.gc_horizon), gc_period: Some(tenant_conf.gc_period), image_creation_threshold: Some(tenant_conf.image_creation_threshold), diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index c870ca97b8..50da998c30 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -289,6 +289,10 @@ pub struct TenantConfOpt { #[serde(default)] pub l0_flush_stall_threshold: Option, + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default)] + pub l0_flush_wait_upload: Option, + #[serde(skip_serializing_if = "Option::is_none")] #[serde(default)] pub gc_horizon: Option, @@ -408,6 +412,9 @@ impl TenantConfOpt { l0_flush_stall_threshold: self .l0_flush_stall_threshold .or(global_conf.l0_flush_stall_threshold), + l0_flush_wait_upload: self + .l0_flush_wait_upload + 
.unwrap_or(global_conf.l0_flush_wait_upload), gc_horizon: self.gc_horizon.unwrap_or(global_conf.gc_horizon), gc_period: self.gc_period.unwrap_or(global_conf.gc_period), image_creation_threshold: self @@ -474,6 +481,7 @@ impl TenantConfOpt { mut compaction_algorithm, mut l0_flush_delay_threshold, mut l0_flush_stall_threshold, + mut l0_flush_wait_upload, mut gc_horizon, mut gc_period, mut image_creation_threshold, @@ -518,6 +526,7 @@ impl TenantConfOpt { patch .l0_flush_stall_threshold .apply(&mut l0_flush_stall_threshold); + patch.l0_flush_wait_upload.apply(&mut l0_flush_wait_upload); patch.gc_horizon.apply(&mut gc_horizon); patch .gc_period @@ -590,6 +599,7 @@ impl TenantConfOpt { compaction_algorithm, l0_flush_delay_threshold, l0_flush_stall_threshold, + l0_flush_wait_upload, gc_horizon, gc_period, image_creation_threshold, @@ -649,6 +659,7 @@ impl From for models::TenantConfig { compaction_threshold: value.compaction_threshold, l0_flush_delay_threshold: value.l0_flush_delay_threshold, l0_flush_stall_threshold: value.l0_flush_stall_threshold, + l0_flush_wait_upload: value.l0_flush_wait_upload, gc_horizon: value.gc_horizon, gc_period: value.gc_period.map(humantime), image_creation_threshold: value.image_creation_threshold, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 280a3baa21..de990a9fe4 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2250,6 +2250,14 @@ impl Timeline { Some(max(l0_flush_stall_threshold, compaction_threshold)) } + fn get_l0_flush_wait_upload(&self) -> bool { + let tenant_conf = self.tenant_conf.load(); + tenant_conf + .tenant_conf + .l0_flush_wait_upload + .unwrap_or(self.conf.default_tenant_conf.l0_flush_wait_upload) + } + fn get_image_creation_threshold(&self) -> usize { let tenant_conf = self.tenant_conf.load(); tenant_conf @@ -4034,21 +4042,24 @@ impl Timeline { // Backpressure mechanism: wait with continuation of the flush loop until we have uploaded all layer files. // This makes us refuse ingest until the new layers have been persisted to the remote - let start = Instant::now(); - self.remote_client - .wait_completion() - .await - .map_err(|e| match e { - WaitCompletionError::UploadQueueShutDownOrStopped - | WaitCompletionError::NotInitialized( - NotInitialized::ShuttingDown | NotInitialized::Stopped, - ) => FlushLayerError::Cancelled, - WaitCompletionError::NotInitialized(NotInitialized::Uninitialized) => { - FlushLayerError::Other(anyhow!(e).into()) - } - })?; - let duration = start.elapsed().as_secs_f64(); - self.metrics.flush_wait_upload_time_gauge_add(duration); + // TODO: remove this, and rely on l0_flush_{delay,stall}_threshold instead. + if self.get_l0_flush_wait_upload() { + let start = Instant::now(); + self.remote_client + .wait_completion() + .await + .map_err(|e| match e { + WaitCompletionError::UploadQueueShutDownOrStopped + | WaitCompletionError::NotInitialized( + NotInitialized::ShuttingDown | NotInitialized::Stopped, + ) => FlushLayerError::Cancelled, + WaitCompletionError::NotInitialized(NotInitialized::Uninitialized) => { + FlushLayerError::Other(anyhow!(e).into()) + } + })?; + let duration = start.elapsed().as_secs_f64(); + self.metrics.flush_wait_upload_time_gauge_add(duration); + } // FIXME: between create_delta_layer and the scheduling of the upload in `update_metadata_file`, // a compaction can delete the file and then it won't be available for uploads any more. 
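Not part of the commit, just an illustration: the test hunk below only checks that the new flag round-trips through a fully custom tenant config. A test that exercises the no-wait path itself could disable the flag per tenant, roughly as sketched here; the `set_tenant_config` helper and fixture names are assumptions based on the test suite's conventions rather than anything introduced by this patch.

```python
# Hypothetical sketch: turn the upload wait off for one tenant, leaving
# backpressure to the l0_flush_{delay,stall}_threshold settings alone.
env = neon_env_builder.init_start()
env.pageserver.http_client().set_tenant_config(
    env.initial_tenant,
    {"l0_flush_wait_upload": False},
)
```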
diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 1fdba223ad..8b92e4c442 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -141,6 +141,7 @@ def test_fully_custom_config(positive_env: NeonEnv): "compaction_threshold": 13, "l0_flush_delay_threshold": 25, "l0_flush_stall_threshold": 42, + "l0_flush_wait_upload": True, "compaction_target_size": 1048576, "checkpoint_distance": 10000, "checkpoint_timeout": "13m", From c54cd9e76ab3ace8acd9ec945c7d439ac872756d Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 28 Jan 2025 17:33:07 +0000 Subject: [PATCH 280/583] storcon: signal LSN wait to pageserver during live migration (#10452) ## Problem We've seen the ingest connection manager get stuck shortly after a migration. ## Summary of changes A speculative mitigation is to use the same mechanism as get page requests for kicking LSN ingest. The connection manager monitors LSN waits and queries the broker if no updates are received for the timeline. Closes https://github.com/neondatabase/neon/issues/10351 --- libs/pageserver_api/src/models.rs | 7 ++ pageserver/client/src/mgmt_api.rs | 15 +++++ pageserver/src/http/routes.rs | 64 +++++++++++++++++++ pageserver/src/page_service.rs | 2 + pageserver/src/tenant.rs | 7 +- pageserver/src/tenant/mgr.rs | 1 + pageserver/src/tenant/timeline.rs | 28 ++++++-- .../walreceiver/connection_manager.rs | 2 +- storage_controller/src/pageserver_client.rs | 18 +++++- storage_controller/src/reconciler.rs | 61 +++++++++++++++++- 10 files changed, 193 insertions(+), 12 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index dcc233c7c4..16f89ae13b 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -1021,6 +1021,13 @@ pub struct TenantConfigPatchRequest { pub config: TenantConfigPatch, // as we have a flattened field, we should reject all unknown fields in it } +#[derive(Serialize, Deserialize, Debug)] +pub struct TenantWaitLsnRequest { + #[serde(flatten)] + pub timelines: HashMap, + pub timeout: Duration, +} + /// See [`TenantState::attachment_status`] and the OpenAPI docs for context. 
#[derive(Serialize, Deserialize, Clone)] #[serde(tag = "slug", content = "data", rename_all = "snake_case")] diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 4e9b11879d..0359bfcd0b 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -763,4 +763,19 @@ impl Client { .await .map_err(Error::ReceiveBody) } + + pub async fn wait_lsn( + &self, + tenant_shard_id: TenantShardId, + request: TenantWaitLsnRequest, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{tenant_shard_id}/wait_lsn", + self.mgmt_api_endpoint, + ); + + self.request_noerror(Method::POST, uri, request) + .await + .map(|resp| resp.status()) + } } diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 5452719bcd..0f3e9fdab6 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -10,6 +10,7 @@ use std::time::Duration; use anyhow::{anyhow, Context, Result}; use enumset::EnumSet; +use futures::future::join_all; use futures::StreamExt; use futures::TryFutureExt; use humantime::format_rfc3339; @@ -40,6 +41,7 @@ use pageserver_api::models::TenantShardSplitRequest; use pageserver_api::models::TenantShardSplitResponse; use pageserver_api::models::TenantSorting; use pageserver_api::models::TenantState; +use pageserver_api::models::TenantWaitLsnRequest; use pageserver_api::models::TimelineArchivalConfigRequest; use pageserver_api::models::TimelineCreateRequestMode; use pageserver_api::models::TimelineCreateRequestModeImportPgdata; @@ -95,6 +97,8 @@ use crate::tenant::timeline::CompactOptions; use crate::tenant::timeline::CompactRequest; use crate::tenant::timeline::CompactionError; use crate::tenant::timeline::Timeline; +use crate::tenant::timeline::WaitLsnTimeout; +use crate::tenant::timeline::WaitLsnWaiter; use crate::tenant::GetTimelineError; use crate::tenant::OffloadedTimeline; use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError}; @@ -2790,6 +2794,63 @@ async fn secondary_download_handler( json_response(status, progress) } +async fn wait_lsn_handler( + mut request: Request, + cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let wait_lsn_request: TenantWaitLsnRequest = json_request(&mut request).await?; + + let state = get_state(&request); + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + + let mut wait_futures = Vec::default(); + for timeline in tenant.list_timelines() { + let Some(lsn) = wait_lsn_request.timelines.get(&timeline.timeline_id) else { + continue; + }; + + let fut = { + let timeline = timeline.clone(); + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Error); + async move { + timeline + .wait_lsn( + *lsn, + WaitLsnWaiter::HttpEndpoint, + WaitLsnTimeout::Custom(wait_lsn_request.timeout), + &ctx, + ) + .await + } + }; + wait_futures.push(fut); + } + + if wait_futures.is_empty() { + return json_response(StatusCode::NOT_FOUND, ()); + } + + let all_done = tokio::select! 
{ + results = join_all(wait_futures) => { + results.iter().all(|res| res.is_ok()) + }, + _ = cancel.cancelled() => { + return Err(ApiError::Cancelled); + } + }; + + let status = if all_done { + StatusCode::OK + } else { + StatusCode::ACCEPTED + }; + + json_response(status, ()) +} + async fn secondary_status_handler( request: Request, _cancel: CancellationToken, @@ -3577,6 +3638,9 @@ pub fn make_router( .post("/v1/tenant/:tenant_shard_id/secondary/download", |r| { api_handler(r, secondary_download_handler) }) + .post("/v1/tenant/:tenant_shard_id/wait_lsn", |r| { + api_handler(r, wait_lsn_handler) + }) .put("/v1/tenant/:tenant_shard_id/break", |r| { testing_api_handler("set tenant state to broken", r, handle_tenant_break) }) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index e5063b7fc2..e103338c7c 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1708,6 +1708,7 @@ impl PageServerHandler { .wait_lsn( not_modified_since, crate::tenant::timeline::WaitLsnWaiter::PageService, + timeline::WaitLsnTimeout::Default, ctx, ) .await?; @@ -2044,6 +2045,7 @@ impl PageServerHandler { .wait_lsn( lsn, crate::tenant::timeline::WaitLsnWaiter::PageService, + crate::tenant::timeline::WaitLsnTimeout::Default, ctx, ) .await?; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 4385fe9a9b..4361fa3d66 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -2560,7 +2560,12 @@ impl Tenant { // sizes etc. and that would get confused if the previous page versions // are not in the repository yet. ancestor_timeline - .wait_lsn(*lsn, timeline::WaitLsnWaiter::Tenant, ctx) + .wait_lsn( + *lsn, + timeline::WaitLsnWaiter::Tenant, + timeline::WaitLsnTimeout::Default, + ctx, + ) .await .map_err(|e| match e { e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState { .. }) => { diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index e8b0d1d4dd..dfa89a765c 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -1643,6 +1643,7 @@ impl TenantManager { .wait_lsn( *target_lsn, crate::tenant::timeline::WaitLsnWaiter::Tenant, + crate::tenant::timeline::WaitLsnTimeout::Default, ctx, ) .await diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index de990a9fe4..076220df51 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -901,10 +901,17 @@ impl From for PageReconstructError { } } +pub(crate) enum WaitLsnTimeout { + Custom(Duration), + // Use the [`PageServerConf::wait_lsn_timeout`] default + Default, +} + pub(crate) enum WaitLsnWaiter<'a> { Timeline(&'a Timeline), Tenant, PageService, + HttpEndpoint, } /// Argument to [`Timeline::shutdown`]. 
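Not part of the commit, just an illustration pulling the pieces of this patch together: the endpoint added in `routes.rs` can be exercised over plain HTTP roughly as below. The URL comes from the mgmt API client and the 200/202/404 meanings from the storage controller changes later in this patch; the JSON encoding of the LSN (shown in the usual "X/Y" string form) and of the timeout are assumptions about the request struct's serde representation, and `pageserver_api`, `tenant_shard_id` and `timeline_id` are assumed to be in scope.

```python
import requests

# Hypothetical sketch: ask an attached pageserver to ingest up to a given LSN
# on one timeline. 200 = caught up, 202 = still waiting, 404 = none of the
# requested timelines exist on this tenant shard.
resp = requests.post(
    f"{pageserver_api}/v1/tenant/{tenant_shard_id}/wait_lsn",
    json={
        str(timeline_id): "0/169C3D8",        # flattened timelines map: id -> LSN
        "timeout": {"secs": 10, "nanos": 0},  # std Duration serde form
    },
)
assert resp.status_code in (200, 202, 404)
```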
@@ -1301,6 +1308,7 @@ impl Timeline { &self, lsn: Lsn, who_is_waiting: WaitLsnWaiter<'_>, + timeout: WaitLsnTimeout, ctx: &RequestContext, /* Prepare for use by cancellation */ ) -> Result<(), WaitLsnError> { let state = self.current_state(); @@ -1317,7 +1325,7 @@ impl Timeline { | TaskKind::WalReceiverConnectionPoller => { let is_myself = match who_is_waiting { WaitLsnWaiter::Timeline(waiter) => Weak::ptr_eq(&waiter.myself, &self.myself), - WaitLsnWaiter::Tenant | WaitLsnWaiter::PageService => unreachable!("tenant or page_service context are not expected to have task kind {:?}", ctx.task_kind()), + WaitLsnWaiter::Tenant | WaitLsnWaiter::PageService | WaitLsnWaiter::HttpEndpoint => unreachable!("tenant or page_service context are not expected to have task kind {:?}", ctx.task_kind()), }; if is_myself { if let Err(current) = self.last_record_lsn.would_wait_for(lsn) { @@ -1333,13 +1341,14 @@ impl Timeline { } } + let timeout = match timeout { + WaitLsnTimeout::Custom(t) => t, + WaitLsnTimeout::Default => self.conf.wait_lsn_timeout, + }; + let _timer = crate::metrics::WAIT_LSN_TIME.start_timer(); - match self - .last_record_lsn - .wait_for_timeout(lsn, self.conf.wait_lsn_timeout) - .await - { + match self.last_record_lsn.wait_for_timeout(lsn, timeout).await { Ok(()) => Ok(()), Err(e) => { use utils::seqwait::SeqWaitError::*; @@ -3590,7 +3599,12 @@ impl Timeline { } } ancestor - .wait_lsn(self.ancestor_lsn, WaitLsnWaiter::Timeline(self), ctx) + .wait_lsn( + self.ancestor_lsn, + WaitLsnWaiter::Timeline(self), + WaitLsnTimeout::Default, + ctx, + ) .await .map_err(|e| match e { e @ WaitLsnError::Timeout(_) => GetReadyAncestorError::AncestorLsnTimeout(e), diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index d8ea32cd45..65f9d39078 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -274,7 +274,7 @@ pub(super) async fn connection_manager_loop_step( }; last_discovery_ts = Some(std::time::Instant::now()); - debug!("No active connection and no candidates, sending discovery request to the broker"); + info!("No active connection and no candidates, sending discovery request to the broker"); // Cancellation safety: we want to send a message to the broker, but publish_one() // function can get cancelled by the other select! arm. 
This is absolutely fine, because diff --git a/storage_controller/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs index b19cbc4fa3..141ff6f720 100644 --- a/storage_controller/src/pageserver_client.rs +++ b/storage_controller/src/pageserver_client.rs @@ -2,8 +2,9 @@ use pageserver_api::{ models::{ detach_ancestor::AncestorDetached, LocationConfig, LocationConfigListResponse, PageserverUtilization, SecondaryProgress, TenantScanRemoteStorageResponse, - TenantShardSplitRequest, TenantShardSplitResponse, TimelineArchivalConfigRequest, - TimelineCreateRequest, TimelineInfo, TopTenantShardsRequest, TopTenantShardsResponse, + TenantShardSplitRequest, TenantShardSplitResponse, TenantWaitLsnRequest, + TimelineArchivalConfigRequest, TimelineCreateRequest, TimelineInfo, TopTenantShardsRequest, + TopTenantShardsResponse, }, shard::TenantShardId, }; @@ -299,4 +300,17 @@ impl PageserverClient { self.inner.top_tenant_shards(request).await ) } + + pub(crate) async fn wait_lsn( + &self, + tenant_shard_id: TenantShardId, + request: TenantWaitLsnRequest, + ) -> Result { + measured_request!( + "wait_lsn", + crate::metrics::Method::Post, + &self.node_id_label, + self.inner.wait_lsn(tenant_shard_id, request).await + ) + } } diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index adced3b77d..03db947263 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -3,7 +3,7 @@ use crate::persistence::Persistence; use crate::{compute_hook, service}; use pageserver_api::controller_api::{AvailabilityZone, PlacementPolicy}; use pageserver_api::models::{ - LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig, + LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig, TenantWaitLsnRequest, }; use pageserver_api::shard::{ShardIdentity, TenantShardId}; use pageserver_client::mgmt_api; @@ -348,6 +348,32 @@ impl Reconciler { Ok(()) } + async fn wait_lsn( + &self, + node: &Node, + tenant_shard_id: TenantShardId, + timelines: HashMap, + ) -> Result { + const TIMEOUT: Duration = Duration::from_secs(10); + + let client = PageserverClient::new( + node.get_id(), + node.base_url(), + self.service_config.jwt_token.as_deref(), + ); + + client + .wait_lsn( + tenant_shard_id, + TenantWaitLsnRequest { + timelines, + timeout: TIMEOUT, + }, + ) + .await + .map_err(|e| e.into()) + } + async fn get_lsns( &self, tenant_shard_id: TenantShardId, @@ -461,6 +487,39 @@ impl Reconciler { node: &Node, baseline: HashMap, ) -> anyhow::Result<()> { + // Signal to the pageserver that it should ingest up to the baseline LSNs. + loop { + match self.wait_lsn(node, tenant_shard_id, baseline.clone()).await { + Ok(StatusCode::OK) => { + // Everything is caught up + return Ok(()); + } + Ok(StatusCode::ACCEPTED) => { + // Some timelines are not caught up yet. + // They'll be polled below. + break; + } + Ok(StatusCode::NOT_FOUND) => { + // None of the timelines are present on the pageserver. + // This is correct if they've all been deleted, but + // let let the polling loop below cross check. 
+ break; + } + Ok(status_code) => { + tracing::warn!( + "Unexpected status code ({status_code}) returned by wait_lsn endpoint" + ); + break; + } + Err(e) => { + tracing::info!("🕑 Can't trigger LSN wait on {node} yet, waiting ({e})",); + tokio::time::sleep(Duration::from_millis(500)).await; + continue; + } + } + } + + // Poll the LSNs until they catch up loop { let latest = match self.get_lsns(tenant_shard_id, node).await { Ok(l) => l, From f5fdaa6dc6d68df1979c9766a4300a364a20bb4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?JC=20Gr=C3=BCnhage?= Date: Tue, 28 Jan 2025 20:13:39 +0100 Subject: [PATCH 281/583] feat(ci): generate basic release notes with links (#10511) ## Problem https://github.com/neondatabase/neon/pull/10448 removed release notes, because if their generation failed, the whole release was failing. People liked them though, and wanted some basic release notes as a fall-back instead of completely removing them. ## Summary of changes Include basic release notes that link to the release PR and to a diff to the previous release. --- .github/workflows/build_and_test.yml | 41 ++++++++++++++++++++++++++-- 1 file changed, 39 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 32b99d9c38..b23e3612d6 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -1050,6 +1050,7 @@ jobs: retries: 5 script: | const tag = "${{ needs.tag.outputs.build-tag }}"; + const branch = "${{ github.ref_name }}"; try { const existingRef = await github.rest.git.getRef({ @@ -1092,12 +1093,48 @@ jobs: } console.log(`Release for tag ${tag} does not exist. Creating it...`); + + // Find the PR number using the commit SHA + const pullRequests = await github.rest.pulls.list({ + owner: context.repo.owner, + repo: context.repo.repo, + state: 'closed', + base: branch, + }); + + const pr = pullRequests.data.find(pr => pr.merge_commit_sha === context.sha); + const prNumber = pr ? pr.number : null; + + // Find the previous release on the branch + const releases = await github.rest.repos.listReleases({ + owner: context.repo.owner, + repo: context.repo.repo, + per_page: 100, + }); + + const branchReleases = releases.data + .filter((release) => { + const regex = new RegExp(`^${branch}-\\d+$`); + return regex.test(release.tag_name) && !release.draft && !release.prerelease; + }) + .sort((a, b) => new Date(b.created_at) - new Date(a.created_at)); + + const previousTag = branchReleases.length > 0 ? branchReleases[0].tag_name : null; + + const releaseNotes = [ + prNumber + ? `Release PR https://github.com/${context.repo.owner}/${context.repo.repo}/pull/${prNumber}.` + : 'Release PR not found.', + previousTag + ? `Diff with the previous release https://github.com/${context.repo.owner}/${context.repo.repo}/compare/${previousTag}...${tag}.` + : `No previous release found on branch ${branch}.`, + ].join('\n\n'); + await github.rest.repos.createRelease({ owner: context.repo.owner, repo: context.repo.repo, tag_name: tag, - // TODO: Automate release notes properly - generate_release_notes: false, + body: releaseNotes, }); console.log(`Release for tag ${tag} created successfully.`); } From d04d924649069a5adaf6baad4ac62af9e50724b5 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Tue, 28 Jan 2025 20:24:07 +0100 Subject: [PATCH 282/583] feat(compute): Add some basic compute_ctl metrics (#10504) ## Problem There are several parts of `compute_ctl` with a very low visibility of errors: 1. 
DB migrations that run async in the background after compute start. 2. Requests made to control plane (currently only `GetSpec`). 3. Requests made to the remote extensions server. ## Summary of changes Add new counters to quickly evaluate the amount of errors among the fleet. Part of neondatabase/cloud#17590 --- compute_tools/src/extension_server.rs | 65 +++++++++++--- compute_tools/src/http/routes/metrics.rs | 7 +- compute_tools/src/installed_extensions.rs | 18 +--- compute_tools/src/lib.rs | 1 + compute_tools/src/metrics.rs | 90 +++++++++++++++++++ compute_tools/src/migration.rs | 44 +++++---- compute_tools/src/spec.rs | 41 ++++++--- .../regress/test_compute_migrations.py | 12 +++ .../regress/test_download_extensions.py | 14 ++- 9 files changed, 230 insertions(+), 62 deletions(-) create mode 100644 compute_tools/src/metrics.rs diff --git a/compute_tools/src/extension_server.rs b/compute_tools/src/extension_server.rs index f13b2308e7..fa638c74b3 100644 --- a/compute_tools/src/extension_server.rs +++ b/compute_tools/src/extension_server.rs @@ -85,6 +85,8 @@ use tracing::info; use tracing::log::warn; use zstd::stream::read::Decoder; +use crate::metrics::{REMOTE_EXT_REQUESTS_FAILED, REMOTE_EXT_REQUESTS_TOTAL, UNKNOWN_HTTP_STATUS}; + fn get_pg_config(argument: &str, pgbin: &str) -> String { // gives the result of `pg_config [argument]` // where argument is a flag like `--version` or `--sharedir` @@ -258,21 +260,60 @@ async fn download_extension_tar(ext_remote_storage: &str, ext_path: &str) -> Res info!("Download extension {:?} from uri {:?}", ext_path, uri); - let resp = reqwest::get(uri).await?; + REMOTE_EXT_REQUESTS_TOTAL.with_label_values(&[]).inc(); - match resp.status() { + match do_extension_server_request(&uri).await { + Ok(resp) => { + info!( + "Successfully downloaded remote extension data {:?}", + ext_path + ); + Ok(resp) + } + Err((msg, status)) => { + let status_str = status + .map(|s| s.to_string()) + .unwrap_or(UNKNOWN_HTTP_STATUS.to_string()); + REMOTE_EXT_REQUESTS_FAILED + .with_label_values(&[&status_str]) + .inc(); + bail!(msg); + } + } +} + +// Do a single remote extensions server request. +// Return result or (error message + status code) in case of any failures. +async fn do_extension_server_request(uri: &str) -> Result)> { + let resp = reqwest::get(uri).await.map_err(|e| { + ( + format!("could not perform remote extensions server request: {}", e), + None, + ) + })?; + let status = resp.status(); + + match status { StatusCode::OK => match resp.bytes().await { - Ok(resp) => { - info!("Download extension {:?} completed successfully", ext_path); - Ok(resp) - } - Err(e) => bail!("could not deserialize remote extension response: {}", e), + Ok(resp) => Ok(resp), + Err(e) => Err(( + format!("could not read remote extensions server response: {}", e), + // It's fine to return and report error with status as 200 OK, + // because we still failed to read the response. 
+ Some(status), + )), }, - StatusCode::SERVICE_UNAVAILABLE => bail!("remote extension is temporarily unavailable"), - _ => bail!( - "unexpected remote extension response status code: {}", - resp.status() - ), + StatusCode::SERVICE_UNAVAILABLE => Err(( + "remote extensions server is temporarily unavailable".to_string(), + Some(status), + )), + _ => Err(( + format!( + "unexpected remote extensions server response status code: {}", + status + ), + Some(status), + )), } } diff --git a/compute_tools/src/http/routes/metrics.rs b/compute_tools/src/http/routes/metrics.rs index 40d71b5de7..13150a7588 100644 --- a/compute_tools/src/http/routes/metrics.rs +++ b/compute_tools/src/http/routes/metrics.rs @@ -2,17 +2,16 @@ use axum::{body::Body, response::Response}; use http::header::CONTENT_TYPE; use http::StatusCode; use metrics::proto::MetricFamily; -use metrics::Encoder; -use metrics::TextEncoder; +use metrics::{Encoder, TextEncoder}; -use crate::{http::JsonResponse, installed_extensions}; +use crate::{http::JsonResponse, metrics::collect}; /// Expose Prometheus metrics. pub(in crate::http) async fn get_metrics() -> Response { // When we call TextEncoder::encode() below, it will immediately return an // error if a metric family has no metrics, so we need to preemptively // filter out metric families with no metrics. - let metrics = installed_extensions::collect() + let metrics = collect() .into_iter() .filter(|m| !m.get_metric().is_empty()) .collect::>(); diff --git a/compute_tools/src/installed_extensions.rs b/compute_tools/src/installed_extensions.rs index 0ab259ddf1..173dbf40b0 100644 --- a/compute_tools/src/installed_extensions.rs +++ b/compute_tools/src/installed_extensions.rs @@ -1,13 +1,10 @@ use compute_api::responses::{InstalledExtension, InstalledExtensions}; -use metrics::proto::MetricFamily; use std::collections::HashMap; use anyhow::Result; use postgres::{Client, NoTls}; -use metrics::core::Collector; -use metrics::{register_uint_gauge_vec, UIntGaugeVec}; -use once_cell::sync::Lazy; +use crate::metrics::INSTALLED_EXTENSIONS; /// We don't reuse get_existing_dbs() just for code clarity /// and to make database listing query here more explicit. 
@@ -102,16 +99,3 @@ pub fn get_installed_extensions(mut conf: postgres::config::Config) -> Result = Lazy::new(|| { - register_uint_gauge_vec!( - "compute_installed_extensions", - "Number of databases where the version of extension is installed", - &["extension_name", "version", "owned_by_superuser"] - ) - .expect("failed to define a metric") -}); - -pub fn collect() -> Vec { - INSTALLED_EXTENSIONS.collect() -} diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs index 12fea4e61a..b08df22134 100644 --- a/compute_tools/src/lib.rs +++ b/compute_tools/src/lib.rs @@ -16,6 +16,7 @@ pub mod extension_server; pub mod installed_extensions; pub mod local_proxy; pub mod lsn_lease; +pub mod metrics; mod migration; pub mod monitor; pub mod params; diff --git a/compute_tools/src/metrics.rs b/compute_tools/src/metrics.rs new file mode 100644 index 0000000000..3684338571 --- /dev/null +++ b/compute_tools/src/metrics.rs @@ -0,0 +1,90 @@ +use metrics::core::Collector; +use metrics::proto::MetricFamily; +use metrics::{register_int_counter_vec, register_uint_gauge_vec, IntCounterVec, UIntGaugeVec}; +use once_cell::sync::Lazy; + +pub(crate) static INSTALLED_EXTENSIONS: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "compute_installed_extensions", + "Number of databases where the version of extension is installed", + &["extension_name", "version", "owned_by_superuser"] + ) + .expect("failed to define a metric") +}); + +// Normally, any HTTP API request is described by METHOD (e.g. GET, POST, etc.) + PATH, +// but for all our APIs we defined a 'slug'/method/operationId in the OpenAPI spec. +// And it's fair to call it a 'RPC' (Remote Procedure Call). +pub enum CPlaneRequestRPC { + GetSpec, +} + +impl CPlaneRequestRPC { + pub fn as_str(&self) -> &str { + match self { + CPlaneRequestRPC::GetSpec => "GetSpec", + } + } +} + +pub const UNKNOWN_HTTP_STATUS: &str = "unknown"; + +pub(crate) static CPLANE_REQUESTS_TOTAL: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "compute_ctl_cplane_requests_total", + "Total number of control plane requests made by compute_ctl", + &["rpc"] + ) + .expect("failed to define a metric") +}); + +pub(crate) static CPLANE_REQUESTS_FAILED: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "compute_ctl_cplane_requests_failed_total", + "Total number of failed control plane requests made by compute_ctl", + &["rpc", "http_status"] + ) + .expect("failed to define a metric") +}); + +/// Total number of failed database migrations. Per-compute, this is actually a boolean metric, +/// either empty or with a single value (1, migration_id) because we stop at the first failure. +/// Yet, the sum over the fleet will provide the total number of failures. +pub(crate) static DB_MIGRATION_FAILED: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "compute_ctl_db_migration_failed_total", + "Total number of failed database migrations", + &["migration_id"] + ) + .expect("failed to define a metric") +}); + +pub(crate) static REMOTE_EXT_REQUESTS_TOTAL: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "compute_ctl_remote_ext_requests_total", + "Total number of requests made by compute_ctl to download extensions from S3 proxy", + // Do not use any labels like extension name yet. + // We can add them later if needed. 
+ &[] + ) + .expect("failed to define a metric") +}); + +pub(crate) static REMOTE_EXT_REQUESTS_FAILED: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "compute_ctl_remote_ext_requests_failed_total", + "Total number of failed requests to S3 proxy", + &["http_status"] + ) + .expect("failed to define a metric") +}); + +pub fn collect() -> Vec { + let mut metrics = INSTALLED_EXTENSIONS.collect(); + metrics.extend(CPLANE_REQUESTS_TOTAL.collect()); + metrics.extend(CPLANE_REQUESTS_FAILED.collect()); + metrics.extend(DB_MIGRATION_FAILED.collect()); + metrics.extend(REMOTE_EXT_REQUESTS_TOTAL.collect()); + metrics.extend(REMOTE_EXT_REQUESTS_FAILED.collect()); + metrics +} diff --git a/compute_tools/src/migration.rs b/compute_tools/src/migration.rs index 45c33172f7..aa3c6b01f0 100644 --- a/compute_tools/src/migration.rs +++ b/compute_tools/src/migration.rs @@ -1,7 +1,9 @@ use anyhow::{Context, Result}; use fail::fail_point; use postgres::{Client, Transaction}; -use tracing::info; +use tracing::{error, info}; + +use crate::metrics::DB_MIGRATION_FAILED; /// Runs a series of migrations on a target database pub(crate) struct MigrationRunner<'m> { @@ -78,24 +80,31 @@ impl<'m> MigrationRunner<'m> { Ok(()) } - /// Run an individual migration - fn run_migration(txn: &mut Transaction, migration_id: i64, migration: &str) -> Result<()> { + /// Run an individual migration in a separate transaction block. + fn run_migration(client: &mut Client, migration_id: i64, migration: &str) -> Result<()> { + let mut txn = client + .transaction() + .with_context(|| format!("begin transaction for migration {migration_id}"))?; + if migration.starts_with("-- SKIP") { info!("Skipping migration id={}", migration_id); // Even though we are skipping the migration, updating the // migration ID should help keep logic easy to understand when // trying to understand the state of a cluster. 
- Self::update_migration_id(txn, migration_id)?; + Self::update_migration_id(&mut txn, migration_id)?; } else { info!("Running migration id={}:\n{}\n", migration_id, migration); txn.simple_query(migration) .with_context(|| format!("apply migration {migration_id}"))?; - Self::update_migration_id(txn, migration_id)?; + Self::update_migration_id(&mut txn, migration_id)?; } + txn.commit() + .with_context(|| format!("commit transaction for migration {migration_id}"))?; + Ok(()) } @@ -109,19 +118,20 @@ impl<'m> MigrationRunner<'m> { // The index lags the migration ID by 1, so the current migration // ID is also the next index let migration_id = (current_migration + 1) as i64; + let migration = self.migrations[current_migration]; - let mut txn = self - .client - .transaction() - .with_context(|| format!("begin transaction for migration {migration_id}"))?; - - Self::run_migration(&mut txn, migration_id, self.migrations[current_migration]) - .with_context(|| format!("running migration {migration_id}"))?; - - txn.commit() - .with_context(|| format!("commit transaction for migration {migration_id}"))?; - - info!("Finished migration id={}", migration_id); + match Self::run_migration(self.client, migration_id, migration) { + Ok(_) => { + info!("Finished migration id={}", migration_id); + } + Err(e) => { + error!("Failed to run migration id={}: {}", migration_id, e); + DB_MIGRATION_FAILED + .with_label_values(&[migration_id.to_string().as_str()]) + .inc(); + return Err(e); + } + } current_migration += 1; } diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index c7d2deb090..01de13811f 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -6,6 +6,9 @@ use std::path::Path; use tracing::{error, info, instrument, warn}; use crate::config; +use crate::metrics::{ + CPlaneRequestRPC, CPLANE_REQUESTS_FAILED, CPLANE_REQUESTS_TOTAL, UNKNOWN_HTTP_STATUS, +}; use crate::migration::MigrationRunner; use crate::params::PG_HBA_ALL_MD5; use crate::pg_helpers::*; @@ -19,7 +22,7 @@ use compute_api::spec::ComputeSpec; fn do_control_plane_request( uri: &str, jwt: &str, -) -> Result { +) -> Result)> { let resp = reqwest::blocking::Client::new() .get(uri) .header("Authorization", format!("Bearer {}", jwt)) @@ -28,34 +31,41 @@ fn do_control_plane_request( ( true, format!("could not perform spec request to control plane: {}", e), + None, ) })?; - match resp.status() { + let status = resp.status(); + match status { StatusCode::OK => match resp.json::() { Ok(spec_resp) => Ok(spec_resp), Err(e) => Err(( true, format!("could not deserialize control plane response: {}", e), + Some(status), )), }, - StatusCode::SERVICE_UNAVAILABLE => { - Err((true, "control plane is temporarily unavailable".to_string())) - } + StatusCode::SERVICE_UNAVAILABLE => Err(( + true, + "control plane is temporarily unavailable".to_string(), + Some(status), + )), StatusCode::BAD_GATEWAY => { // We have a problem with intermittent 502 errors now // https://github.com/neondatabase/cloud/issues/2353 // It's fine to retry GET request in this case. - Err((true, "control plane request failed with 502".to_string())) + Err(( + true, + "control plane request failed with 502".to_string(), + Some(status), + )) } // Another code, likely 500 or 404, means that compute is unknown to the control plane // or some internal failure happened. Doesn't make much sense to retry in this case. 
_ => Err(( false, - format!( - "unexpected control plane response status code: {}", - resp.status() - ), + format!("unexpected control plane response status code: {}", status), + Some(status), )), } } @@ -82,6 +92,9 @@ pub fn get_spec_from_control_plane( // - no spec for compute yet (Empty state) -> return Ok(None) // - got spec -> return Ok(Some(spec)) while attempt < 4 { + CPLANE_REQUESTS_TOTAL + .with_label_values(&[CPlaneRequestRPC::GetSpec.as_str()]) + .inc(); spec = match do_control_plane_request(&cp_uri, &jwt) { Ok(spec_resp) => match spec_resp.status { ControlPlaneComputeStatus::Empty => Ok(None), @@ -93,7 +106,13 @@ pub fn get_spec_from_control_plane( } } }, - Err((retry, msg)) => { + Err((retry, msg, status)) => { + let status_str = status + .map(|s| s.to_string()) + .unwrap_or(UNKNOWN_HTTP_STATUS.to_string()); + CPLANE_REQUESTS_FAILED + .with_label_values(&[CPlaneRequestRPC::GetSpec.as_str(), &status_str]) + .inc(); if retry { Err(anyhow!(msg)) } else { diff --git a/test_runner/regress/test_compute_migrations.py b/test_runner/regress/test_compute_migrations.py index 803702a6f8..ec2e38f021 100644 --- a/test_runner/regress/test_compute_migrations.py +++ b/test_runner/regress/test_compute_migrations.py @@ -5,6 +5,7 @@ from typing import TYPE_CHECKING, cast import pytest from fixtures.compute_migrations import COMPUTE_MIGRATIONS, NUM_COMPUTE_MIGRATIONS +from fixtures.metrics import parse_metrics if TYPE_CHECKING: from fixtures.neon_fixtures import NeonEnv @@ -33,6 +34,17 @@ def test_compute_migrations_retry(neon_simple_env: NeonEnv, compute_migrations_d migration_id = cast("int", cur.fetchall()[0][0]) assert migration_id == i - 1 + # Check that migration failure is properly recorded in the metrics + client = endpoint.http_client() + raw_metrics = client.metrics() + metrics = parse_metrics(raw_metrics) + failed_migration = metrics.query_all( + "compute_ctl_db_migration_failed_total", + ) + assert len(failed_migration) == 1 + for sample in failed_migration: + assert sample.value == 1 + endpoint.stop() endpoint.start() diff --git a/test_runner/regress/test_download_extensions.py b/test_runner/regress/test_download_extensions.py index f18f4e78bd..d7e6e9de56 100644 --- a/test_runner/regress/test_download_extensions.py +++ b/test_runner/regress/test_download_extensions.py @@ -8,6 +8,7 @@ from typing import TYPE_CHECKING import pytest from fixtures.log_helper import log +from fixtures.metrics import parse_metrics from fixtures.neon_fixtures import ( NeonEnvBuilder, ) @@ -128,6 +129,17 @@ def test_remote_extensions( httpserver.check() + # Check that we properly recorded downloads in the metrics + client = endpoint.http_client() + raw_metrics = client.metrics() + metrics = parse_metrics(raw_metrics) + remote_ext_requests = metrics.query_all( + "compute_ctl_remote_ext_requests_total", + ) + assert len(remote_ext_requests) == 1 + for sample in remote_ext_requests: + assert sample.value == 1 + # TODO # 1. Test downloading remote library. @@ -137,7 +149,7 @@ def test_remote_extensions( # # 3.Test that extension is downloaded after endpoint restart, # when the library is used in the query. -# Run the test with mutliple simultaneous connections to an endpoint. +# Run the test with multiple simultaneous connections to an endpoint. # to ensure that the extension is downloaded only once. # # 4. Test that private extensions are only downloaded when they are present in the spec. 
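The counters added in this patch all follow the same Prometheus pattern: a lazily registered `IntCounterVec` with a small fixed label set, incremented at the failure site with a fallback label value when the HTTP status is unknown, and exported by merging every collector's metric families in `collect()`. Below is a minimal, self-contained sketch of that pattern against the upstream `prometheus` and `once_cell` crates; the real code goes through Neon's internal `metrics` wrapper, so the crate choice and the `record_failure` helper here are illustrative assumptions, not part of the patch.

```rust
use once_cell::sync::Lazy;
use prometheus::{register_int_counter_vec, Encoder, IntCounterVec, TextEncoder};

static CPLANE_REQUESTS_FAILED: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "compute_ctl_cplane_requests_failed_total",
        "Total number of failed control plane requests made by compute_ctl",
        &["rpc", "http_status"]
    )
    .expect("failed to define a metric")
});

fn record_failure(rpc: &str, status: Option<u16>) {
    // Fall back to a fixed label value when the HTTP status is unknown,
    // mirroring the UNKNOWN_HTTP_STATUS constant used above.
    let status = status
        .map(|s| s.to_string())
        .unwrap_or_else(|| "unknown".to_string());
    CPLANE_REQUESTS_FAILED
        .with_label_values(&[rpc, status.as_str()])
        .inc();
}

fn main() -> Result<(), Box<dyn std::error::Error>> {
    record_failure("GetSpec", Some(502));
    record_failure("GetSpec", None);

    // Export everything in the default registry, similar to what the
    // compute_ctl /metrics endpoint does with the families from collect().
    let mut buf = Vec::new();
    TextEncoder::new().encode(&prometheus::gather(), &mut buf)?;
    println!("{}", String::from_utf8(buf)?);
    Ok(())
}
```

Tests can then scrape the endpoint and assert on individual samples, as the accompanying Python test changes do with `parse_metrics`.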
From 68cf0ba4399a2b027a331b74142108b41d5fc0d0 Mon Sep 17 00:00:00 2001 From: Fedor Dikarev Date: Tue, 28 Jan 2025 22:26:38 +0100 Subject: [PATCH 283/583] run benchmark tests on small-metal runners (#10549) ## Problem Ref: https://github.com/neondatabase/cloud/issues/23314 We suspect some inconsistency in Benchmark tests runs could be due to different type of runners they are landed in. To have that aligned in both terms: failure rates and benchmark results, lets run them for now on `small-metal` servers and see the progress for the tests stability. ## Summary of changes --- .github/actionlint.yml | 1 + .github/workflows/build_and_test.yml | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/actionlint.yml b/.github/actionlint.yml index aec5b4ee75..ecff0cc70b 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -4,6 +4,7 @@ self-hosted-runner: - large - large-arm64 - small + - small-metal - small-arm64 - us-east-2 config-variables: diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index b23e3612d6..99658187a8 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -242,7 +242,7 @@ jobs: statuses: write contents: write pull-requests: write - runs-on: [ self-hosted, small ] + runs-on: [ self-hosted, small-metal ] container: image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: @@ -1101,7 +1101,7 @@ jobs: state: 'closed', base: branch, }); - + const pr = pullRequests.data.find(pr => pr.merge_commit_sha === context.sha); const prNumber = pr ? pr.number : null; From b735df6ff01472f017ffed25ac7da3ca615829ad Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Tue, 28 Jan 2025 16:29:51 -0500 Subject: [PATCH 284/583] fix(pageserver): make image layer generation atomic (#10516) ## Problem close https://github.com/neondatabase/neon/issues/8362 ## Summary of changes Use `BatchLayerWriter` to ensure we clean up image layers after failed compaction. 
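The batching idea is to keep each per-partition image layer writer in an "unfinished" state and only materialize all of them once the whole pass has succeeded; if compaction errors out or is cancelled first, the unfinished writers are simply discarded. The sketch below is not the pageserver's actual `BatchLayerWriter` API, only the rename-on-finish / delete-on-drop shape of the pattern, under the assumption that unfinished layers live in temporary files that can be removed; `UnfinishedLayer` and the path handling are hypothetical.

```rust
use std::fs;
use std::path::PathBuf;

struct UnfinishedLayer {
    /// Data is written to a temporary path first and only renamed into place
    /// once the whole batch succeeds.
    tmp_path: PathBuf,
}

struct BatchLayerWriter {
    pending: Vec<UnfinishedLayer>,
}

impl BatchLayerWriter {
    fn new() -> Self {
        Self { pending: Vec::new() }
    }

    fn add_unfinished(&mut self, layer: UnfinishedLayer) {
        self.pending.push(layer);
    }

    /// Rename every temporary file into place only after all partitions have
    /// produced their layer; on success nothing is left for Drop to clean up.
    fn finish(mut self) -> std::io::Result<Vec<PathBuf>> {
        let mut finished = Vec::new();
        for layer in &self.pending {
            let final_path = layer.tmp_path.with_extension("layer");
            fs::rename(&layer.tmp_path, &final_path)?;
            finished.push(final_path);
        }
        self.pending.clear(); // all renamed; nothing to delete on drop
        Ok(finished)
    }
}

impl Drop for BatchLayerWriter {
    fn drop(&mut self) {
        // Runs with non-empty `pending` only if finish() never completed
        // (compaction failed or was cancelled): discard the temporary files.
        for layer in &self.pending {
            let _ = fs::remove_file(&layer.tmp_path);
        }
    }
}

fn main() -> std::io::Result<()> {
    let mut batch = BatchLayerWriter::new();
    for i in 0..3 {
        let tmp_path = std::env::temp_dir().join(format!("image-layer-{i}.tmp"));
        fs::write(&tmp_path, b"layer contents")?;
        batch.add_unfinished(UnfinishedLayer { tmp_path });
    }
    // Only now do the layers become visible under their final names.
    let finished = batch.finish()?;
    println!("produced {} image layers", finished.len());
    Ok(())
}
```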
Signed-off-by: Alex Chi Z --- pageserver/src/tenant/storage_layer.rs | 1 + .../storage_layer/batch_split_writer.rs | 17 +++ pageserver/src/tenant/timeline.rs | 105 ++++++++++-------- pageserver/src/tenant/timeline/compaction.rs | 7 +- 4 files changed, 79 insertions(+), 51 deletions(-) diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index c24d037dde..c1fe67c87c 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -33,6 +33,7 @@ use utils::sync::gate::GateGuard; use utils::lsn::Lsn; +pub use batch_split_writer::{BatchLayerWriter, SplitDeltaLayerWriter, SplitImageLayerWriter}; pub use delta_layer::{DeltaLayer, DeltaLayerWriter, ValueRef}; pub use image_layer::{ImageLayer, ImageLayerWriter}; pub use inmemory_layer::InMemoryLayer; diff --git a/pageserver/src/tenant/storage_layer/batch_split_writer.rs b/pageserver/src/tenant/storage_layer/batch_split_writer.rs index 8a397ceb7a..22d8b81bcc 100644 --- a/pageserver/src/tenant/storage_layer/batch_split_writer.rs +++ b/pageserver/src/tenant/storage_layer/batch_split_writer.rs @@ -87,6 +87,23 @@ impl BatchLayerWriter { )); } + pub(crate) async fn finish( + self, + tline: &Arc, + ctx: &RequestContext, + ) -> anyhow::Result> { + let res = self + .finish_with_discard_fn(tline, ctx, |_| async { false }) + .await?; + let mut output = Vec::new(); + for r in res { + if let BatchWriterResult::Produced(layer) = r { + output.push(layer); + } + } + Ok(output) + } + pub(crate) async fn finish_with_discard_fn( self, tline: &Arc, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 076220df51..2033ebcdeb 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -70,6 +70,7 @@ use std::sync::{Arc, Mutex, OnceLock, RwLock, Weak}; use std::time::{Duration, Instant, SystemTime}; use crate::l0_flush::{self, L0FlushGlobalState}; +use crate::tenant::storage_layer::ImageLayerName; use crate::{ aux_file::AuxFileSizeEstimator, page_service::TenantManagerTypes, @@ -78,7 +79,7 @@ use crate::{ layer_map::{LayerMap, SearchResult}, metadata::TimelineMetadata, storage_layer::{ - inmemory_layer::IndexEntry, IoConcurrency, PersistentLayerDesc, + inmemory_layer::IndexEntry, BatchLayerWriter, IoConcurrency, PersistentLayerDesc, ValueReconstructSituation, }, }, @@ -933,7 +934,7 @@ pub(crate) enum ShutdownMode { } struct ImageLayerCreationOutcome { - image: Option, + unfinished_image_layer: Option, next_start_key: Key, } @@ -4405,11 +4406,15 @@ impl Timeline { if wrote_keys { // Normal path: we have written some data into the new image layer for this // partition, so flush it to disk. - let (desc, path) = image_layer_writer.finish(ctx).await?; - let image_layer = Layer::finish_creating(self.conf, self, desc, &path)?; - info!("created image layer for rel {}", image_layer.local_path()); + info!( + "produced image layer for rel {}", + ImageLayerName { + key_range: img_range.clone(), + lsn + }, + ); Ok(ImageLayerCreationOutcome { - image: Some(image_layer), + unfinished_image_layer: Some(image_layer_writer), next_start_key: img_range.end, }) } else { @@ -4419,7 +4424,7 @@ impl Timeline { // layer we write will cover the key range that we just scanned. 
tracing::debug!("no data in range {}-{}", img_range.start, img_range.end); Ok(ImageLayerCreationOutcome { - image: None, + unfinished_image_layer: None, next_start_key: start, }) } @@ -4468,7 +4473,7 @@ impl Timeline { if !trigger_generation && mode == ImageLayerCreationMode::Try { return Ok(ImageLayerCreationOutcome { - image: None, + unfinished_image_layer: None, next_start_key: img_range.end, }); } @@ -4494,14 +4499,15 @@ impl Timeline { if wrote_any_image { // Normal path: we have written some data into the new image layer for this // partition, so flush it to disk. - let (desc, path) = image_layer_writer.finish(ctx).await?; - let image_layer = Layer::finish_creating(self.conf, self, desc, &path)?; info!( "created image layer for metadata {}", - image_layer.local_path() + ImageLayerName { + key_range: img_range.clone(), + lsn + } ); Ok(ImageLayerCreationOutcome { - image: Some(image_layer), + unfinished_image_layer: Some(image_layer_writer), next_start_key: img_range.end, }) } else { @@ -4511,7 +4517,7 @@ impl Timeline { // layer we write will cover the key range that we just scanned. tracing::debug!("no data in range {}-{}", img_range.start, img_range.end); Ok(ImageLayerCreationOutcome { - image: None, + unfinished_image_layer: None, next_start_key: start, }) } @@ -4578,7 +4584,6 @@ impl Timeline { ctx: &RequestContext, ) -> Result, CreateImageLayersError> { let timer = self.metrics.create_images_time_histo.start_timer(); - let mut image_layers = Vec::new(); // We need to avoid holes between generated image layers. // Otherwise LayerMap::image_layer_exists will return false if key range of some layer is covered by more than one @@ -4593,6 +4598,8 @@ impl Timeline { let check_for_image_layers = self.should_check_if_image_layers_required(lsn); + let mut batch_image_writer = BatchLayerWriter::new(self.conf).await?; + for partition in partitioning.parts.iter() { if self.cancel.is_cancelled() { return Err(CreateImageLayersError::Cancelled); @@ -4665,45 +4672,45 @@ impl Timeline { .map_err(|_| CreateImageLayersError::Cancelled)?, ); - if !compact_metadata { - let ImageLayerCreationOutcome { - image, - next_start_key, - } = self - .create_image_layer_for_rel_blocks( - partition, - image_layer_writer, - lsn, - ctx, - img_range, - start, - io_concurrency, - ) - .await?; - - start = next_start_key; - image_layers.extend(image); + let ImageLayerCreationOutcome { + unfinished_image_layer, + next_start_key, + } = if !compact_metadata { + self.create_image_layer_for_rel_blocks( + partition, + image_layer_writer, + lsn, + ctx, + img_range.clone(), + start, + io_concurrency, + ) + .await? } else { - let ImageLayerCreationOutcome { - image, - next_start_key, - } = self - .create_image_layer_for_metadata_keys( - partition, - image_layer_writer, - lsn, - ctx, - img_range, - mode, - start, - io_concurrency, - ) - .await?; - start = next_start_key; - image_layers.extend(image); + self.create_image_layer_for_metadata_keys( + partition, + image_layer_writer, + lsn, + ctx, + img_range.clone(), + mode, + start, + io_concurrency, + ) + .await? 
+ }; + start = next_start_key; + if let Some(unfinished_image_layer) = unfinished_image_layer { + batch_image_writer.add_unfinished_image_writer( + unfinished_image_layer, + img_range, + lsn, + ); } } + let image_layers = batch_image_writer.finish(self, ctx).await?; + let mut guard = self.layers.write().await; // FIXME: we could add the images to be uploaded *before* returning from here, but right diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 28c3381318..ad19738bc2 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -3197,7 +3197,7 @@ impl TimelineAdaptor { // TODO set proper (stateful) start. The create_image_layer_for_rel_blocks function mostly let start = Key::MIN; let ImageLayerCreationOutcome { - image, + unfinished_image_layer, next_start_key: _, } = self .timeline @@ -3212,7 +3212,10 @@ impl TimelineAdaptor { ) .await?; - if let Some(image_layer) = image { + if let Some(image_layer_writer) = unfinished_image_layer { + let (desc, path) = image_layer_writer.finish(ctx).await?; + let image_layer = + Layer::finish_creating(self.timeline.conf, &self.timeline, desc, &path)?; self.new_images.push(image_layer); } From 983e18e63ee60960ebf8f9b5a3833aab861e3792 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Tue, 28 Jan 2025 18:18:32 -0500 Subject: [PATCH 285/583] feat(pageserver): add compaction_upper_limit config (#10550) ## Problem Follow-up of the incident, we should not use the same bound on lower/upper limit of compaction files. This patch adds an upper bound limit, which is set to 50 for now. ## Summary of changes Add `compaction_upper_limit`. --------- Signed-off-by: Alex Chi Z Co-authored-by: Christian Schwarz --- control_plane/src/pageserver.rs | 5 +++++ libs/pageserver_api/src/config.rs | 12 ++++++++++++ libs/pageserver_api/src/models.rs | 8 ++++++++ pageserver/src/http/openapi_spec.yml | 2 ++ pageserver/src/tenant.rs | 8 ++++++++ pageserver/src/tenant/config.rs | 13 +++++++++++++ pageserver/src/tenant/timeline.rs | 8 ++++++++ pageserver/src/tenant/timeline/compaction.rs | 13 ++----------- test_runner/regress/test_attach_tenant_config.py | 1 + 9 files changed, 59 insertions(+), 11 deletions(-) diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 52527ffa90..383c174684 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -347,6 +347,11 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'compaction_threshold' as an integer")?, + compaction_upper_limit: settings + .remove("compaction_upper_limit") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'compaction_upper_limit' as an integer")?, compaction_algorithm: settings .remove("compaction_algorithm") .map(serde_json::from_str) diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 40c8837af5..422da0dc95 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -256,6 +256,11 @@ pub struct TenantConfigToml { pub compaction_period: Duration, /// Level0 delta layer threshold for compaction. pub compaction_threshold: usize, + /// Controls the amount of L0 included in a single compaction iteration. + /// The unit is `checkpoint_distance`, i.e., a size. + /// We add L0s to the set of layers to compact until their cumulative + /// size exceeds `compaction_upper_limit * checkpoint_distance`. 
+ pub compaction_upper_limit: usize, pub compaction_algorithm: crate::models::CompactionAlgorithmSettings, /// Level0 delta layer threshold at which to delay layer flushes for compaction backpressure, /// such that they take 2x as long, and start waiting for layer flushes during ephemeral layer @@ -523,6 +528,12 @@ pub mod tenant_conf_defaults { pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s"; pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10; + + // This value needs to be tuned to avoid OOM. We have 3/4 of the total CPU threads to do background works, that's 16*3/4=9 on + // most of our pageservers. Compaction ~50 layers requires about 2GB memory (could be reduced later by optimizing L0 hole + // calculation to avoid loading all keys into the memory). So with this config, we can get a maximum peak compaction usage of 18GB. + pub const DEFAULT_COMPACTION_UPPER_LIMIT: usize = 50; + pub const DEFAULT_COMPACTION_ALGORITHM: crate::models::CompactionAlgorithm = crate::models::CompactionAlgorithm::Legacy; @@ -563,6 +574,7 @@ impl Default for TenantConfigToml { compaction_period: humantime::parse_duration(DEFAULT_COMPACTION_PERIOD) .expect("cannot parse default compaction period"), compaction_threshold: DEFAULT_COMPACTION_THRESHOLD, + compaction_upper_limit: DEFAULT_COMPACTION_UPPER_LIMIT, compaction_algorithm: crate::models::CompactionAlgorithmSettings { kind: DEFAULT_COMPACTION_ALGORITHM, }, diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 16f89ae13b..43447c67bd 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -458,6 +458,8 @@ pub struct TenantConfigPatch { pub compaction_period: FieldPatch, #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub compaction_threshold: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub compaction_upper_limit: FieldPatch, // defer parsing compaction_algorithm, like eviction_policy #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub compaction_algorithm: FieldPatch, @@ -522,6 +524,7 @@ pub struct TenantConfig { pub compaction_target_size: Option, pub compaction_period: Option, pub compaction_threshold: Option, + pub compaction_upper_limit: Option, // defer parsing compaction_algorithm, like eviction_policy pub compaction_algorithm: Option, pub l0_flush_delay_threshold: Option, @@ -559,6 +562,7 @@ impl TenantConfig { mut compaction_target_size, mut compaction_period, mut compaction_threshold, + mut compaction_upper_limit, mut compaction_algorithm, mut l0_flush_delay_threshold, mut l0_flush_stall_threshold, @@ -594,6 +598,9 @@ impl TenantConfig { .apply(&mut compaction_target_size); patch.compaction_period.apply(&mut compaction_period); patch.compaction_threshold.apply(&mut compaction_threshold); + patch + .compaction_upper_limit + .apply(&mut compaction_upper_limit); patch.compaction_algorithm.apply(&mut compaction_algorithm); patch .l0_flush_delay_threshold @@ -653,6 +660,7 @@ impl TenantConfig { compaction_target_size, compaction_period, compaction_threshold, + compaction_upper_limit, compaction_algorithm, l0_flush_delay_threshold, l0_flush_stall_threshold, diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index ee43440534..4b976e7f6f 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -984,6 +984,8 @@ components: type: string compaction_threshold: type: string + compaction_upper_limit: + type: string image_creation_threshold: type: integer 
walreceiver_connect_timeout: diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 4361fa3d66..085f76c05d 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3816,6 +3816,13 @@ impl Tenant { .unwrap_or(self.conf.default_tenant_conf.compaction_threshold) } + pub fn get_compaction_upper_limit(&self) -> usize { + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); + tenant_conf + .compaction_upper_limit + .unwrap_or(self.conf.default_tenant_conf.compaction_upper_limit) + } + pub fn get_gc_horizon(&self) -> u64 { let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf @@ -5469,6 +5476,7 @@ pub(crate) mod harness { compaction_target_size: Some(tenant_conf.compaction_target_size), compaction_period: Some(tenant_conf.compaction_period), compaction_threshold: Some(tenant_conf.compaction_threshold), + compaction_upper_limit: Some(tenant_conf.compaction_upper_limit), compaction_algorithm: Some(tenant_conf.compaction_algorithm), l0_flush_delay_threshold: tenant_conf.l0_flush_delay_threshold, l0_flush_stall_threshold: tenant_conf.l0_flush_stall_threshold, diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 50da998c30..139ed27bd2 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -277,6 +277,10 @@ pub struct TenantConfOpt { #[serde(default)] pub compaction_threshold: Option, + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default)] + pub compaction_upper_limit: Option, + #[serde(skip_serializing_if = "Option::is_none")] #[serde(default)] pub compaction_algorithm: Option, @@ -401,6 +405,9 @@ impl TenantConfOpt { compaction_threshold: self .compaction_threshold .unwrap_or(global_conf.compaction_threshold), + compaction_upper_limit: self + .compaction_upper_limit + .unwrap_or(global_conf.compaction_upper_limit), compaction_algorithm: self .compaction_algorithm .as_ref() @@ -478,6 +485,7 @@ impl TenantConfOpt { mut compaction_target_size, mut compaction_period, mut compaction_threshold, + mut compaction_upper_limit, mut compaction_algorithm, mut l0_flush_delay_threshold, mut l0_flush_stall_threshold, @@ -519,6 +527,9 @@ impl TenantConfOpt { .map(|v| humantime::parse_duration(&v))? 
.apply(&mut compaction_period); patch.compaction_threshold.apply(&mut compaction_threshold); + patch + .compaction_upper_limit + .apply(&mut compaction_upper_limit); patch.compaction_algorithm.apply(&mut compaction_algorithm); patch .l0_flush_delay_threshold @@ -596,6 +607,7 @@ impl TenantConfOpt { compaction_target_size, compaction_period, compaction_threshold, + compaction_upper_limit, compaction_algorithm, l0_flush_delay_threshold, l0_flush_stall_threshold, @@ -657,6 +669,7 @@ impl From for models::TenantConfig { compaction_target_size: value.compaction_target_size, compaction_period: value.compaction_period.map(humantime), compaction_threshold: value.compaction_threshold, + compaction_upper_limit: value.compaction_upper_limit, l0_flush_delay_threshold: value.l0_flush_delay_threshold, l0_flush_stall_threshold: value.l0_flush_stall_threshold, l0_flush_wait_upload: value.l0_flush_wait_upload, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 2033ebcdeb..f3cdad82d9 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2181,6 +2181,14 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.compaction_threshold) } + fn get_compaction_upper_limit(&self) -> usize { + let tenant_conf = self.tenant_conf.load(); + tenant_conf + .tenant_conf + .compaction_upper_limit + .unwrap_or(self.conf.default_tenant_conf.compaction_upper_limit) + } + fn get_l0_flush_delay_threshold(&self) -> Option { // Disable L0 flushes by default. This and compaction needs further tuning. const DEFAULT_L0_FLUSH_DELAY_FACTOR: usize = 0; // TODO: default to e.g. 3 diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index ad19738bc2..76dcc159ea 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -47,9 +47,7 @@ use crate::tenant::timeline::{ImageLayerCreationOutcome, IoConcurrency}; use crate::tenant::timeline::{Layer, ResidentLayer}; use crate::tenant::{gc_block, DeltaLayer, MaybeOffloaded}; use crate::virtual_file::{MaybeFatalIo, VirtualFile}; -use pageserver_api::config::tenant_conf_defaults::{ - DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD, -}; +use pageserver_api::config::tenant_conf_defaults::DEFAULT_CHECKPOINT_DISTANCE; use pageserver_api::key::Key; use pageserver_api::keyspace::KeySpace; @@ -1117,14 +1115,7 @@ impl Timeline { // Under normal circumstances, we will accumulate up to compaction_interval L0s of size // checkpoint_distance each. To avoid edge cases using extra system resources, bound our // work in this function to only operate on this much delta data at once. - // - // Take the max of the configured value & the default, so that tests that configure tiny values - // can still use a sensible amount of memory, but if a deployed system configures bigger values we - // still let them compact a full stack of L0s in one go. 
- let delta_size_limit = std::cmp::max( - self.get_compaction_threshold(), - DEFAULT_COMPACTION_THRESHOLD, - ) as u64 + let delta_size_limit = self.get_compaction_upper_limit() as u64 * std::cmp::max(self.get_checkpoint_distance(), DEFAULT_CHECKPOINT_DISTANCE); let mut fully_compacted = true; diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 8b92e4c442..e88d245c8f 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -139,6 +139,7 @@ def test_fully_custom_config(positive_env: NeonEnv): fully_custom_config = { "compaction_period": "1h", "compaction_threshold": 13, + "compaction_upper_limit": 100, "l0_flush_delay_threshold": 25, "l0_flush_stall_threshold": 42, "l0_flush_wait_upload": True, From 9ab13d6e2c6f2877a89bb6963923f1ef767c1fc3 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Wed, 29 Jan 2025 12:16:00 +0300 Subject: [PATCH 286/583] Log statements in test_layer_map (#10554) ## Problem test_layer_map doesn't log statements and it is not clear how long they take. ## Summary of changes Do log them. ref https://github.com/neondatabase/neon/issues/10409 --- test_runner/performance/test_layer_map.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test_runner/performance/test_layer_map.py b/test_runner/performance/test_layer_map.py index 9b159c5fcf..efc7fa59db 100644 --- a/test_runner/performance/test_layer_map.py +++ b/test_runner/performance/test_layer_map.py @@ -31,6 +31,7 @@ def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark): endpoint = env.endpoints.create_start("main", tenant_id=tenant) cur = endpoint.connect().cursor() + cur.execute("set log_statement = 'all'") cur.execute("create table t(x integer)") for _ in range(n_iters): cur.execute(f"insert into t values (generate_series(1,{n_records}))") From 9f81828429ad6475b4fbb1a814240213b74bec63 Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Wed, 29 Jan 2025 10:19:11 +0100 Subject: [PATCH 287/583] Test extension upgrade compatibility (#10244) ## Problem We have to test the extensions, shipped with Neon for compatibility before the upgrade. ## Summary of changes Added the test for compatibility with the upgraded extensions. 
--- .github/workflows/build_and_test.yml | 25 +++++ .../compute_wrapper/shell/compute.sh | 67 ++++++++----- docker-compose/docker-compose.yml | 4 +- .../ext-src/hll-src/test-upgrade.sh | 5 + .../ext-src/hypopg-src/test-upgrade.patch | 27 ++++++ .../ext-src/hypopg-src/test-upgrade.sh | 6 ++ .../ext-src/ip4r-src/test-upgrade.patch | 23 +++++ .../ext-src/ip4r-src/test-upgrade.sh | 6 ++ .../ext-src/pg_cron-src/test-upgrade.patch | 75 +++++++++++++++ .../ext-src/pg_cron-src/test-upgrade.sh | 6 ++ .../ext-src/pg_ivm-src/test-upgrade.patch | 18 ++++ .../ext-src/pg_ivm-src/test-upgrade.sh | 6 ++ .../pg_roaringbitmap-src/test-upgrade.patch | 25 +++++ .../pg_roaringbitmap-src/test-upgrade.sh | 6 ++ .../ext-src/pg_semver-src/test-upgrade.patch | 24 +++++ .../ext-src/pg_semver-src/test-upgrade.sh | 6 ++ .../ext-src/pg_uuidv7-src/test-upgrade.sh | 5 + .../ext-src/pgvector-src/test-upgrade.sh | 5 + .../ext-src/plv8-src/test-upgrade.sh | 5 + .../postgresql-unit-src/test-upgrade.sh | 5 + .../ext-src/prefix-src/test-upgrade.sh | 5 + .../ext-src/rum-src/test-upgrade.patch | 19 ++++ .../ext-src/rum-src/test-upgrade.sh | 6 ++ docker-compose/test_extensions_upgrade.sh | 93 +++++++++++++++++++ 24 files changed, 450 insertions(+), 22 deletions(-) create mode 100755 docker-compose/ext-src/hll-src/test-upgrade.sh create mode 100644 docker-compose/ext-src/hypopg-src/test-upgrade.patch create mode 100755 docker-compose/ext-src/hypopg-src/test-upgrade.sh create mode 100644 docker-compose/ext-src/ip4r-src/test-upgrade.patch create mode 100755 docker-compose/ext-src/ip4r-src/test-upgrade.sh create mode 100644 docker-compose/ext-src/pg_cron-src/test-upgrade.patch create mode 100755 docker-compose/ext-src/pg_cron-src/test-upgrade.sh create mode 100644 docker-compose/ext-src/pg_ivm-src/test-upgrade.patch create mode 100755 docker-compose/ext-src/pg_ivm-src/test-upgrade.sh create mode 100644 docker-compose/ext-src/pg_roaringbitmap-src/test-upgrade.patch create mode 100755 docker-compose/ext-src/pg_roaringbitmap-src/test-upgrade.sh create mode 100644 docker-compose/ext-src/pg_semver-src/test-upgrade.patch create mode 100755 docker-compose/ext-src/pg_semver-src/test-upgrade.sh create mode 100755 docker-compose/ext-src/pg_uuidv7-src/test-upgrade.sh create mode 100755 docker-compose/ext-src/pgvector-src/test-upgrade.sh create mode 100755 docker-compose/ext-src/plv8-src/test-upgrade.sh create mode 100755 docker-compose/ext-src/postgresql-unit-src/test-upgrade.sh create mode 100755 docker-compose/ext-src/prefix-src/test-upgrade.sh create mode 100644 docker-compose/ext-src/rum-src/test-upgrade.patch create mode 100755 docker-compose/ext-src/rum-src/test-upgrade.sh create mode 100755 docker-compose/test_extensions_upgrade.sh diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 99658187a8..e588fc5a0e 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -786,6 +786,17 @@ jobs: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + - name: Get the last compute release tag + id: get-last-compute-release-tag + env: + GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} + run: | + tag=$(gh api -q '[.[].tag_name | select(startswith("release-compute"))][0]'\ + -H "Accept: application/vnd.github+json" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + "/repos/${{ github.repository }}/releases") + echo tag=${tag} >> ${GITHUB_OUTPUT} + # `neondatabase/neon` contains multiple binaries, all of them use the same input for the version 
into the same version formatting library. # Pick pageserver as currently the only binary with extra "version" features printed in the string to verify. # Regular pageserver version string looks like @@ -817,6 +828,20 @@ jobs: TEST_VERSION_ONLY: ${{ matrix.pg_version }} run: ./docker-compose/docker_compose_test.sh + - name: Print logs and clean up docker-compose test + if: always() + run: | + docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml logs || true + docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml down + + - name: Test extension upgrade + timeout-minutes: 20 + if: ${{ needs.tag.outputs.build-tag == github.run_id }} + env: + NEWTAG: ${{ needs.tag.outputs.build-tag }} + OLDTAG: ${{ steps.get-last-compute-release-tag.outputs.tag }} + run: ./docker-compose/test_extensions_upgrade.sh + - name: Print logs and clean up if: always() run: | diff --git a/docker-compose/compute_wrapper/shell/compute.sh b/docker-compose/compute_wrapper/shell/compute.sh index 33455e458a..b4f8d3d66a 100755 --- a/docker-compose/compute_wrapper/shell/compute.sh +++ b/docker-compose/compute_wrapper/shell/compute.sh @@ -20,30 +20,55 @@ while ! nc -z pageserver 6400; do done echo "Page server is ready." -echo "Create a tenant and timeline" -generate_id tenant_id -PARAMS=( - -X PUT - -H "Content-Type: application/json" - -d "{\"mode\": \"AttachedSingle\", \"generation\": 1, \"tenant_conf\": {}}" - "http://pageserver:9898/v1/tenant/${tenant_id}/location_config" -) -result=$(curl "${PARAMS[@]}") -echo $result | jq . +cp ${SPEC_FILE_ORG} ${SPEC_FILE} -generate_id timeline_id -PARAMS=( - -sbf - -X POST - -H "Content-Type: application/json" - -d "{\"new_timeline_id\": \"${timeline_id}\", \"pg_version\": ${PG_VERSION}}" - "http://pageserver:9898/v1/tenant/${tenant_id}/timeline/" -) -result=$(curl "${PARAMS[@]}") -echo $result | jq . + if [ -n "${TENANT_ID:-}" ] && [ -n "${TIMELINE_ID:-}" ]; then + tenant_id=${TENANT_ID} + timeline_id=${TIMELINE_ID} +else + echo "Check if a tenant present" + PARAMS=( + -X GET + -H "Content-Type: application/json" + "http://pageserver:9898/v1/tenant" + ) + tenant_id=$(curl "${PARAMS[@]}" | jq -r .[0].id) + if [ -z "${tenant_id}" ] || [ "${tenant_id}" = null ]; then + echo "Create a tenant" + generate_id tenant_id + PARAMS=( + -X PUT + -H "Content-Type: application/json" + -d "{\"mode\": \"AttachedSingle\", \"generation\": 1, \"tenant_conf\": {}}" + "http://pageserver:9898/v1/tenant/${tenant_id}/location_config" + ) + result=$(curl "${PARAMS[@]}") + echo $result | jq . + fi + + echo "Check if a timeline present" + PARAMS=( + -X GET + -H "Content-Type: application/json" + "http://pageserver:9898/v1/tenant/${tenant_id}/timeline" + ) + timeline_id=$(curl "${PARAMS[@]}" | jq -r .[0].timeline_id) + if [ -z "${timeline_id}" ] || [ "${timeline_id}" = null ]; then + generate_id timeline_id + PARAMS=( + -sbf + -X POST + -H "Content-Type: application/json" + -d "{\"new_timeline_id\": \"${timeline_id}\", \"pg_version\": ${PG_VERSION}}" + "http://pageserver:9898/v1/tenant/${tenant_id}/timeline/" + ) + result=$(curl "${PARAMS[@]}") + echo $result | jq . 
+ fi +fi echo "Overwrite tenant id and timeline id in spec file" -sed "s/TENANT_ID/${tenant_id}/" ${SPEC_FILE_ORG} > ${SPEC_FILE} +sed -i "s/TENANT_ID/${tenant_id}/" ${SPEC_FILE} sed -i "s/TIMELINE_ID/${timeline_id}/" ${SPEC_FILE} cat ${SPEC_FILE} diff --git a/docker-compose/docker-compose.yml b/docker-compose/docker-compose.yml index 95e4b6fde7..489d60f38c 100644 --- a/docker-compose/docker-compose.yml +++ b/docker-compose/docker-compose.yml @@ -149,11 +149,13 @@ services: args: - REPOSITORY=${REPOSITORY:-neondatabase} - COMPUTE_IMAGE=compute-node-v${PG_VERSION:-16} - - TAG=${TAG:-latest} + - TAG=${COMPUTE_TAG:-${TAG:-latest}} - http_proxy=${http_proxy:-} - https_proxy=${https_proxy:-} environment: - PG_VERSION=${PG_VERSION:-16} + - TENANT_ID=${TENANT_ID:-} + - TIMELINE_ID=${TIMELINE_ID:-} #- RUST_BACKTRACE=1 # Mount the test files directly, for faster editing cycle. volumes: diff --git a/docker-compose/ext-src/hll-src/test-upgrade.sh b/docker-compose/ext-src/hll-src/test-upgrade.sh new file mode 100755 index 0000000000..f9e9aedcb2 --- /dev/null +++ b/docker-compose/ext-src/hll-src/test-upgrade.sh @@ -0,0 +1,5 @@ +#!/bin/sh +set -ex +cd "$(dirname ${0})" +PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress +${PG_REGRESS} --use-existing --inputdir=./ --bindir='/usr/local/pgsql/bin' --dbname=contrib_regression add_agg agg_oob auto_sparse card_op cast_shape copy_binary cumulative_add_cardinality_correction cumulative_add_comprehensive_promotion cumulative_add_sparse_edge cumulative_add_sparse_random cumulative_add_sparse_step cumulative_union_comprehensive cumulative_union_explicit_explicit cumulative_union_explicit_promotion cumulative_union_probabilistic_probabilistic cumulative_union_sparse_full_representation cumulative_union_sparse_promotion cumulative_union_sparse_sparse disable_hashagg equal explicit_thresh hash hash_any meta_func murmur_bigint murmur_bytea nosparse notequal scalar_oob storedproc transaction typmod typmod_insert union_op \ No newline at end of file diff --git a/docker-compose/ext-src/hypopg-src/test-upgrade.patch b/docker-compose/ext-src/hypopg-src/test-upgrade.patch new file mode 100644 index 0000000000..71fe26b164 --- /dev/null +++ b/docker-compose/ext-src/hypopg-src/test-upgrade.patch @@ -0,0 +1,27 @@ +diff --git a/expected/hypopg.out b/expected/hypopg.out +index 90121d0..859260b 100644 +--- a/expected/hypopg.out ++++ b/expected/hypopg.out +@@ -11,7 +11,8 @@ BEGIN + END; + $_$ + LANGUAGE plpgsql; +-CREATE EXTENSION hypopg; ++CREATE EXTENSION IF NOT EXISTS hypopg; ++NOTICE: extension "hypopg" already exists, skipping + CREATE TABLE hypo (id integer, val text, "Id2" bigint); + INSERT INTO hypo SELECT i, 'line ' || i + FROM generate_series(1,100000) f(i); +diff --git a/test/sql/hypopg.sql b/test/sql/hypopg.sql +index 99722b0..8d6bacb 100644 +--- a/test/sql/hypopg.sql ++++ b/test/sql/hypopg.sql +@@ -12,7 +12,7 @@ END; + $_$ + LANGUAGE plpgsql; + +-CREATE EXTENSION hypopg; ++CREATE EXTENSION IF NOT EXISTS hypopg; + + CREATE TABLE hypo (id integer, val text, "Id2" bigint); + diff --git a/docker-compose/ext-src/hypopg-src/test-upgrade.sh b/docker-compose/ext-src/hypopg-src/test-upgrade.sh new file mode 100755 index 0000000000..066ac3329e --- /dev/null +++ b/docker-compose/ext-src/hypopg-src/test-upgrade.sh @@ -0,0 +1,6 @@ +#!/bin/sh +set -ex +cd "$(dirname ${0})" +patch -p1 Date: Wed, 29 Jan 2025 12:05:43 +0200 Subject: [PATCH 288/583] Fix C code to satisfy sanitizers (#10473) --- pgxn/neon/file_cache.c | 12 ++++++------ pgxn/neon/walproposer.c | 3 ++- 
vendor/postgres-v17 | 2 +- vendor/revisions.json | 2 +- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 64b236061d..08b7652175 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -480,7 +480,7 @@ lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno) if (LFC_ENABLED()) { entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL); - found = entry != NULL && (entry->bitmap[chunk_offs >> 5] & (1 << (chunk_offs & 31))) != 0; + found = entry != NULL && (entry->bitmap[chunk_offs >> 5] & ((uint32)1 << (chunk_offs & 31))) != 0; } LWLockRelease(lfc_lock); return found; @@ -527,7 +527,7 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, for (; chunk_offs < BLOCKS_PER_CHUNK && i < nblocks; chunk_offs++, i++) { if ((entry->bitmap[chunk_offs >> 5] & - (1 << (chunk_offs & 31))) != 0) + ((uint32)1 << (chunk_offs & 31))) != 0) { BITMAP_SET(bitmap, i); found++; @@ -620,7 +620,7 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno) } /* remove the page from the cache */ - entry->bitmap[chunk_offs >> 5] &= ~(1 << (chunk_offs & (32 - 1))); + entry->bitmap[chunk_offs >> 5] &= ~((uint32)1 << (chunk_offs & (32 - 1))); if (entry->access_count == 0) { @@ -774,7 +774,7 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, * If the page is valid, we consider it "read". * All other pages will be fetched separately by the next cache */ - if (entry->bitmap[(chunk_offs + i) / 32] & (1 << ((chunk_offs + i) % 32))) + if (entry->bitmap[(chunk_offs + i) / 32] & ((uint32)1 << ((chunk_offs + i) % 32))) { BITMAP_SET(mask, buf_offset + i); iteration_hits++; @@ -1034,7 +1034,7 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, { lfc_ctl->used_pages += 1 - ((entry->bitmap[(chunk_offs + i) >> 5] >> ((chunk_offs + i) & 31)) & 1); entry->bitmap[(chunk_offs + i) >> 5] |= - (1 << ((chunk_offs + i) & 31)); + ((uint32)1 << ((chunk_offs + i) & 31)); } } @@ -1282,7 +1282,7 @@ local_cache_pages(PG_FUNCTION_ARGS) { for (int i = 0; i < BLOCKS_PER_CHUNK; i++) { - if (entry->bitmap[i >> 5] & (1 << (i & 31))) + if (entry->bitmap[i >> 5] & ((uint32)1 << (i & 31))) { fctx->record[n].pageoffs = entry->offset * BLOCKS_PER_CHUNK + i; fctx->record[n].relfilenode = NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key)); diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index e89ffdb628..7472fd6afc 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -1024,7 +1024,8 @@ DetermineEpochStartLsn(WalProposer *wp) dth = &wp->safekeeper[wp->donor].voteResponse.termHistory; wp->propTermHistory.n_entries = dth->n_entries + 1; wp->propTermHistory.entries = palloc(sizeof(TermSwitchEntry) * wp->propTermHistory.n_entries); - memcpy(wp->propTermHistory.entries, dth->entries, sizeof(TermSwitchEntry) * dth->n_entries); + if (dth->n_entries > 0) + memcpy(wp->propTermHistory.entries, dth->entries, sizeof(TermSwitchEntry) * dth->n_entries); wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].term = wp->propTerm; wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].lsn = wp->propEpochStartLsn; diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index 46f9b96555..b654fa88b6 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit 46f9b96555e084c35dd975da9485996db9e86181 +Subproject commit b654fa88b6fd2ad24a03a14a7cd417ec66e518f9 diff --git a/vendor/revisions.json 
b/vendor/revisions.json index 3aa42d22c5..982f537692 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,7 +1,7 @@ { "v17": [ "17.2", - "46f9b96555e084c35dd975da9485996db9e86181" + "b654fa88b6fd2ad24a03a14a7cd417ec66e518f9" ], "v16": [ "16.6", From 222cc181e9314fe6c7596f06d677a92875f4aa8b Mon Sep 17 00:00:00 2001 From: Ivan Efremov Date: Wed, 29 Jan 2025 13:19:10 +0200 Subject: [PATCH 289/583] impr(proxy): Move the CancelMap to Redis hashes (#10364) ## Problem The approach of having CancelMap as an in-memory structure increases code complexity, as well as putting additional load for Redis streams. ## Summary of changes - Implement a set of KV ops for Redis client; - Remove cancel notifications code; - Send KV ops over the bounded channel to the handling background task for removing and adding the cancel keys. Closes #9660 --- Cargo.lock | 1 + libs/pq_proto/src/lib.rs | 7 + libs/proxy/tokio-postgres2/Cargo.toml | 1 + .../proxy/tokio-postgres2/src/cancel_token.rs | 3 +- libs/proxy/tokio-postgres2/src/client.rs | 3 +- libs/proxy/tokio-postgres2/src/config.rs | 5 +- proxy/src/auth/backend/mod.rs | 3 +- proxy/src/bin/local_proxy.rs | 10 +- proxy/src/bin/proxy.rs | 46 +- proxy/src/cancellation.rs | 563 +++++++++--------- proxy/src/compute.rs | 1 - proxy/src/console_redirect_proxy.rs | 29 +- proxy/src/metrics.rs | 30 +- proxy/src/proxy/mod.rs | 39 +- proxy/src/proxy/passthrough.rs | 9 +- proxy/src/rate_limiter/limiter.rs | 6 + proxy/src/redis/cancellation_publisher.rs | 72 +-- .../connection_with_credentials_provider.rs | 1 + proxy/src/redis/keys.rs | 88 +++ proxy/src/redis/kv_ops.rs | 185 ++++++ proxy/src/redis/mod.rs | 2 + proxy/src/redis/notifications.rs | 105 +--- proxy/src/serverless/mod.rs | 8 +- proxy/src/serverless/websocket.rs | 4 +- 24 files changed, 674 insertions(+), 547 deletions(-) create mode 100644 proxy/src/redis/keys.rs create mode 100644 proxy/src/redis/kv_ops.rs diff --git a/Cargo.lock b/Cargo.lock index 3c33901247..c19fdc0941 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6935,6 +6935,7 @@ dependencies = [ "pin-project-lite", "postgres-protocol2", "postgres-types2", + "serde", "tokio", "tokio-util", ] diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index 50b2c69d24..f99128b76a 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -182,6 +182,13 @@ pub struct CancelKeyData { pub cancel_key: i32, } +pub fn id_to_cancel_key(id: u64) -> CancelKeyData { + CancelKeyData { + backend_pid: (id >> 32) as i32, + cancel_key: (id & 0xffffffff) as i32, + } +} + impl fmt::Display for CancelKeyData { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let hi = (self.backend_pid as u64) << 32; diff --git a/libs/proxy/tokio-postgres2/Cargo.toml b/libs/proxy/tokio-postgres2/Cargo.toml index 56e7c4da47..ade0ffc9f6 100644 --- a/libs/proxy/tokio-postgres2/Cargo.toml +++ b/libs/proxy/tokio-postgres2/Cargo.toml @@ -19,3 +19,4 @@ postgres-protocol2 = { path = "../postgres-protocol2" } postgres-types2 = { path = "../postgres-types2" } tokio = { workspace = true, features = ["io-util", "time", "net"] } tokio-util = { workspace = true, features = ["codec"] } +serde = { workspace = true, features = ["derive"] } \ No newline at end of file diff --git a/libs/proxy/tokio-postgres2/src/cancel_token.rs b/libs/proxy/tokio-postgres2/src/cancel_token.rs index a10e8bf5c3..718f903a92 100644 --- a/libs/proxy/tokio-postgres2/src/cancel_token.rs +++ b/libs/proxy/tokio-postgres2/src/cancel_token.rs @@ -3,12 +3,13 @@ use crate::tls::TlsConnect; use 
crate::{cancel_query, client::SocketConfig, tls::MakeTlsConnect}; use crate::{cancel_query_raw, Error}; +use serde::{Deserialize, Serialize}; use tokio::io::{AsyncRead, AsyncWrite}; use tokio::net::TcpStream; /// The capability to request cancellation of in-progress queries on a /// connection. -#[derive(Clone)] +#[derive(Clone, Serialize, Deserialize)] pub struct CancelToken { pub socket_config: Option, pub ssl_mode: SslMode, diff --git a/libs/proxy/tokio-postgres2/src/client.rs b/libs/proxy/tokio-postgres2/src/client.rs index a7cd53afc3..9bbbd4c260 100644 --- a/libs/proxy/tokio-postgres2/src/client.rs +++ b/libs/proxy/tokio-postgres2/src/client.rs @@ -18,6 +18,7 @@ use fallible_iterator::FallibleIterator; use futures_util::{future, ready, TryStreamExt}; use parking_lot::Mutex; use postgres_protocol2::message::{backend::Message, frontend}; +use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::fmt; use std::sync::Arc; @@ -137,7 +138,7 @@ impl InnerClient { } } -#[derive(Clone)] +#[derive(Clone, Serialize, Deserialize)] pub struct SocketConfig { pub host: Host, pub port: u16, diff --git a/libs/proxy/tokio-postgres2/src/config.rs b/libs/proxy/tokio-postgres2/src/config.rs index 11a361a81b..47cc45ac80 100644 --- a/libs/proxy/tokio-postgres2/src/config.rs +++ b/libs/proxy/tokio-postgres2/src/config.rs @@ -7,6 +7,7 @@ use crate::tls::MakeTlsConnect; use crate::tls::TlsConnect; use crate::{Client, Connection, Error}; use postgres_protocol2::message::frontend::StartupMessageParams; +use serde::{Deserialize, Serialize}; use std::fmt; use std::str; use std::time::Duration; @@ -16,7 +17,7 @@ pub use postgres_protocol2::authentication::sasl::ScramKeys; use tokio::net::TcpStream; /// TLS configuration. -#[derive(Debug, Copy, Clone, PartialEq, Eq)] +#[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)] #[non_exhaustive] pub enum SslMode { /// Do not use TLS. @@ -50,7 +51,7 @@ pub enum ReplicationMode { } /// A host specification. -#[derive(Debug, Clone, PartialEq, Eq)] +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub enum Host { /// A TCP hostname. 
Tcp(String), diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs index de48be2952..d17d91a56d 100644 --- a/proxy/src/auth/backend/mod.rs +++ b/proxy/src/auth/backend/mod.rs @@ -12,6 +12,7 @@ pub(crate) use console_redirect::ConsoleRedirectError; use ipnet::{Ipv4Net, Ipv6Net}; use local::LocalBackend; use postgres_client::config::AuthKeys; +use serde::{Deserialize, Serialize}; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{debug, info, warn}; @@ -133,7 +134,7 @@ pub(crate) struct ComputeUserInfoNoEndpoint { pub(crate) options: NeonOptions, } -#[derive(Debug, Clone, Default)] +#[derive(Debug, Clone, Default, Serialize, Deserialize)] pub(crate) struct ComputeUserInfo { pub(crate) endpoint: EndpointId, pub(crate) user: RoleName, diff --git a/proxy/src/bin/local_proxy.rs b/proxy/src/bin/local_proxy.rs index 644f670f88..ee8b3d4ef5 100644 --- a/proxy/src/bin/local_proxy.rs +++ b/proxy/src/bin/local_proxy.rs @@ -7,12 +7,11 @@ use std::time::Duration; use anyhow::{bail, ensure, Context}; use camino::{Utf8Path, Utf8PathBuf}; use compute_api::spec::LocalProxySpec; -use dashmap::DashMap; use futures::future::Either; use proxy::auth::backend::jwt::JwkCache; use proxy::auth::backend::local::{LocalBackend, JWKS_ROLE_MAP}; use proxy::auth::{self}; -use proxy::cancellation::CancellationHandlerMain; +use proxy::cancellation::CancellationHandler; use proxy::config::{ self, AuthenticationConfig, ComputeConfig, HttpConfig, ProxyConfig, RetryConfig, }; @@ -211,12 +210,7 @@ async fn main() -> anyhow::Result<()> { auth_backend, http_listener, shutdown.clone(), - Arc::new(CancellationHandlerMain::new( - &config.connect_to_compute, - Arc::new(DashMap::new()), - None, - proxy::metrics::CancellationSource::Local, - )), + Arc::new(CancellationHandler::new(&config.connect_to_compute, None)), endpoint_rate_limiter, ); diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 70b50436bf..e1affe8391 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -7,7 +7,7 @@ use anyhow::bail; use futures::future::Either; use proxy::auth::backend::jwt::JwkCache; use proxy::auth::backend::{AuthRateLimiter, ConsoleRedirectBackend, MaybeOwned}; -use proxy::cancellation::{CancelMap, CancellationHandler}; +use proxy::cancellation::{handle_cancel_messages, CancellationHandler}; use proxy::config::{ self, remote_storage_from_toml, AuthenticationConfig, CacheOptions, ComputeConfig, HttpConfig, ProjectInfoCacheOptions, ProxyConfig, ProxyProtocolV2, @@ -18,8 +18,8 @@ use proxy::metrics::Metrics; use proxy::rate_limiter::{ EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo, WakeComputeRateLimiter, }; -use proxy::redis::cancellation_publisher::RedisPublisherClient; use proxy::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider; +use proxy::redis::kv_ops::RedisKVClient; use proxy::redis::{elasticache, notifications}; use proxy::scram::threadpool::ThreadPool; use proxy::serverless::cancel_set::CancelSet; @@ -28,7 +28,6 @@ use proxy::tls::client_config::compute_client_config_with_root_certs; use proxy::{auth, control_plane, http, serverless, usage_metrics}; use remote_storage::RemoteStorageConfig; use tokio::net::TcpListener; -use tokio::sync::Mutex; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::{info, warn, Instrument}; @@ -158,8 +157,11 @@ struct ProxyCliArgs { #[clap(long, default_value_t = 64)] auth_rate_limit_ip_subnet: u8, /// Redis rate limiter max number of requests per second. 
- #[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)] + #[clap(long, default_values_t = RateBucketInfo::DEFAULT_REDIS_SET)] redis_rps_limit: Vec, + /// Cancellation channel size (max queue size for redis kv client) + #[clap(long, default_value = "1024")] + cancellation_ch_size: usize, /// cache for `allowed_ips` (use `size=0` to disable) #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] allowed_ips_cache: String, @@ -382,27 +384,19 @@ async fn main() -> anyhow::Result<()> { let cancellation_token = CancellationToken::new(); - let cancel_map = CancelMap::default(); - let redis_rps_limit = Vec::leak(args.redis_rps_limit.clone()); RateBucketInfo::validate(redis_rps_limit)?; - let redis_publisher = match ®ional_redis_client { - Some(redis_publisher) => Some(Arc::new(Mutex::new(RedisPublisherClient::new( - redis_publisher.clone(), - args.region.clone(), - redis_rps_limit, - )?))), - None => None, - }; + let redis_kv_client = regional_redis_client + .as_ref() + .map(|redis_publisher| RedisKVClient::new(redis_publisher.clone(), redis_rps_limit)); - let cancellation_handler = Arc::new(CancellationHandler::< - Option>>, - >::new( + // channel size should be higher than redis client limit to avoid blocking + let cancel_ch_size = args.cancellation_ch_size; + let (tx_cancel, rx_cancel) = tokio::sync::mpsc::channel(cancel_ch_size); + let cancellation_handler = Arc::new(CancellationHandler::new( &config.connect_to_compute, - cancel_map.clone(), - redis_publisher, - proxy::metrics::CancellationSource::FromClient, + Some(tx_cancel), )); // bit of a hack - find the min rps and max rps supported and turn it into @@ -495,25 +489,29 @@ async fn main() -> anyhow::Result<()> { let cache = api.caches.project_info.clone(); if let Some(client) = client1 { maintenance_tasks.spawn(notifications::task_main( - config, client, cache.clone(), - cancel_map.clone(), args.region.clone(), )); } if let Some(client) = client2 { maintenance_tasks.spawn(notifications::task_main( - config, client, cache.clone(), - cancel_map.clone(), args.region.clone(), )); } maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); } } + + if let Some(mut redis_kv_client) = redis_kv_client { + maintenance_tasks.spawn(async move { + redis_kv_client.try_connect().await?; + handle_cancel_messages(&mut redis_kv_client, rx_cancel).await + }); + } + if let Some(regional_redis_client) = regional_redis_client { let cache = api.caches.endpoints_cache.clone(); let con = regional_redis_client; diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index a96c43f2ce..34f708a36b 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -1,48 +1,124 @@ use std::net::{IpAddr, SocketAddr}; use std::sync::Arc; -use dashmap::DashMap; use ipnet::{IpNet, Ipv4Net, Ipv6Net}; use postgres_client::tls::MakeTlsConnect; use postgres_client::CancelToken; use pq_proto::CancelKeyData; +use serde::{Deserialize, Serialize}; use thiserror::Error; use tokio::net::TcpStream; -use tokio::sync::Mutex; +use tokio::sync::mpsc; use tracing::{debug, info}; -use uuid::Uuid; use crate::auth::backend::{BackendIpAllowlist, ComputeUserInfo}; -use crate::auth::{check_peer_addr_is_in_list, AuthError, IpPattern}; +use crate::auth::{check_peer_addr_is_in_list, AuthError}; use crate::config::ComputeConfig; use crate::context::RequestContext; use crate::error::ReportableError; use crate::ext::LockExt; -use crate::metrics::{CancellationRequest, CancellationSource, Metrics}; +use crate::metrics::CancelChannelSizeGuard; 
+use crate::metrics::{CancellationRequest, Metrics, RedisMsgKind}; use crate::rate_limiter::LeakyBucketRateLimiter; -use crate::redis::cancellation_publisher::{ - CancellationPublisher, CancellationPublisherMut, RedisPublisherClient, -}; +use crate::redis::keys::KeyPrefix; +use crate::redis::kv_ops::RedisKVClient; use crate::tls::postgres_rustls::MakeRustlsConnect; - -pub type CancelMap = Arc>>; -pub type CancellationHandlerMain = CancellationHandler>>>; -pub(crate) type CancellationHandlerMainInternal = Option>>; +use std::convert::Infallible; +use tokio::sync::oneshot; type IpSubnetKey = IpNet; +const CANCEL_KEY_TTL: i64 = 1_209_600; // 2 weeks cancellation key expire time +const REDIS_SEND_TIMEOUT: std::time::Duration = std::time::Duration::from_millis(10); + +// Message types for sending through mpsc channel +pub enum CancelKeyOp { + StoreCancelKey { + key: String, + field: String, + value: String, + resp_tx: Option>>, + _guard: CancelChannelSizeGuard<'static>, + expire: i64, // TTL for key + }, + GetCancelData { + key: String, + resp_tx: oneshot::Sender>>, + _guard: CancelChannelSizeGuard<'static>, + }, + RemoveCancelKey { + key: String, + field: String, + resp_tx: Option>>, + _guard: CancelChannelSizeGuard<'static>, + }, +} + +// Running as a separate task to accept messages through the rx channel +// In case of problems with RTT: switch to recv_many() + redis pipeline +pub async fn handle_cancel_messages( + client: &mut RedisKVClient, + mut rx: mpsc::Receiver, +) -> anyhow::Result { + loop { + if let Some(msg) = rx.recv().await { + match msg { + CancelKeyOp::StoreCancelKey { + key, + field, + value, + resp_tx, + _guard, + expire: _, + } => { + if let Some(resp_tx) = resp_tx { + resp_tx + .send(client.hset(key, field, value).await) + .inspect_err(|e| { + tracing::debug!("failed to send StoreCancelKey response: {:?}", e); + }) + .ok(); + } else { + drop(client.hset(key, field, value).await); + } + } + CancelKeyOp::GetCancelData { + key, + resp_tx, + _guard, + } => { + drop(resp_tx.send(client.hget_all(key).await)); + } + CancelKeyOp::RemoveCancelKey { + key, + field, + resp_tx, + _guard, + } => { + if let Some(resp_tx) = resp_tx { + resp_tx + .send(client.hdel(key, field).await) + .inspect_err(|e| { + tracing::debug!("failed to send StoreCancelKey response: {:?}", e); + }) + .ok(); + } else { + drop(client.hdel(key, field).await); + } + } + } + } + } +} + /// Enables serving `CancelRequest`s. /// /// If `CancellationPublisher` is available, cancel request will be used to publish the cancellation key to other proxy instances. -pub struct CancellationHandler
<P>
{ +pub struct CancellationHandler { compute_config: &'static ComputeConfig, - map: CancelMap, - client: P, - /// This field used for the monitoring purposes. - /// Represents the source of the cancellation request. - from: CancellationSource, // rate limiter of cancellation requests limiter: Arc>>, + tx: Option>, // send messages to the redis KV client task } #[derive(Debug, Error)] @@ -61,6 +137,12 @@ pub(crate) enum CancelError { #[error("Authentication backend error")] AuthError(#[from] AuthError), + + #[error("key not found")] + NotFound, + + #[error("proxy service error")] + InternalError, } impl ReportableError for CancelError { @@ -73,274 +155,191 @@ impl ReportableError for CancelError { CancelError::Postgres(_) => crate::error::ErrorKind::Compute, CancelError::RateLimit => crate::error::ErrorKind::RateLimit, CancelError::IpNotAllowed => crate::error::ErrorKind::User, + CancelError::NotFound => crate::error::ErrorKind::User, CancelError::AuthError(_) => crate::error::ErrorKind::ControlPlane, + CancelError::InternalError => crate::error::ErrorKind::Service, } } } -impl CancellationHandler
<P>
{ - /// Run async action within an ephemeral session identified by [`CancelKeyData`]. - pub(crate) fn get_session(self: Arc<Self>) -> Session<P>
{ +impl CancellationHandler { + pub fn new( + compute_config: &'static ComputeConfig, + tx: Option>, + ) -> Self { + Self { + compute_config, + tx, + limiter: Arc::new(std::sync::Mutex::new( + LeakyBucketRateLimiter::::new_with_shards( + LeakyBucketRateLimiter::::DEFAULT, + 64, + ), + )), + } + } + + pub(crate) fn get_key(self: &Arc) -> Session { // we intentionally generate a random "backend pid" and "secret key" here. // we use the corresponding u64 as an identifier for the // actual endpoint+pid+secret for postgres/pgbouncer. // // if we forwarded the backend_pid from postgres to the client, there would be a lot // of overlap between our computes as most pids are small (~100). - let key = loop { - let key = rand::random(); - // Random key collisions are unlikely to happen here, but they're still possible, - // which is why we have to take care not to rewrite an existing key. - match self.map.entry(key) { - dashmap::mapref::entry::Entry::Occupied(_) => continue, - dashmap::mapref::entry::Entry::Vacant(e) => { - e.insert(None); - } - } - break key; - }; + let key: CancelKeyData = rand::random(); + + let prefix_key: KeyPrefix = KeyPrefix::Cancel(key); + let redis_key = prefix_key.build_redis_key(); debug!("registered new query cancellation key {key}"); Session { key, - cancellation_handler: self, + redis_key, + cancellation_handler: Arc::clone(self), } } - /// Cancelling only in notification, will be removed - pub(crate) async fn cancel_session( + async fn get_cancel_key( &self, key: CancelKeyData, - session_id: Uuid, - peer_addr: IpAddr, - check_allowed: bool, - ) -> Result<(), CancelError> { - // TODO: check for unspecified address is only for backward compatibility, should be removed - if !peer_addr.is_unspecified() { - let subnet_key = match peer_addr { - IpAddr::V4(ip) => IpNet::V4(Ipv4Net::new_assert(ip, 24).trunc()), // use defaut mask here - IpAddr::V6(ip) => IpNet::V6(Ipv6Net::new_assert(ip, 64).trunc()), - }; - if !self.limiter.lock_propagate_poison().check(subnet_key, 1) { - // log only the subnet part of the IP address to know which subnet is rate limited - tracing::warn!("Rate limit exceeded. Skipping cancellation message, {subnet_key}"); - Metrics::get() - .proxy - .cancellation_requests_total - .inc(CancellationRequest { - source: self.from, - kind: crate::metrics::CancellationOutcome::RateLimitExceeded, - }); - return Err(CancelError::RateLimit); - } - } + ) -> Result, CancelError> { + let prefix_key: KeyPrefix = KeyPrefix::Cancel(key); + let redis_key = prefix_key.build_redis_key(); - // NB: we should immediately release the lock after cloning the token. 
- let cancel_state = self.map.get(&key).and_then(|x| x.clone()); - let Some(cancel_closure) = cancel_state else { - tracing::warn!("query cancellation key not found: {key}"); - Metrics::get() + let (resp_tx, resp_rx) = tokio::sync::oneshot::channel(); + let op = CancelKeyOp::GetCancelData { + key: redis_key, + resp_tx, + _guard: Metrics::get() .proxy - .cancellation_requests_total - .inc(CancellationRequest { - source: self.from, - kind: crate::metrics::CancellationOutcome::NotFound, - }); - - if session_id == Uuid::nil() { - // was already published, do not publish it again - return Ok(()); - } - - match self.client.try_publish(key, session_id, peer_addr).await { - Ok(()) => {} // do nothing - Err(e) => { - // log it here since cancel_session could be spawned in a task - tracing::error!("failed to publish cancellation key: {key}, error: {e}"); - return Err(CancelError::IO(std::io::Error::new( - std::io::ErrorKind::Other, - e.to_string(), - ))); - } - } - return Ok(()); + .cancel_channel_size + .guard(RedisMsgKind::HGetAll), }; - if check_allowed - && !check_peer_addr_is_in_list(&peer_addr, cancel_closure.ip_allowlist.as_slice()) - { - // log it here since cancel_session could be spawned in a task - tracing::warn!("IP is not allowed to cancel the query: {key}"); - return Err(CancelError::IpNotAllowed); - } + let Some(tx) = &self.tx else { + tracing::warn!("cancellation handler is not available"); + return Err(CancelError::InternalError); + }; - Metrics::get() - .proxy - .cancellation_requests_total - .inc(CancellationRequest { - source: self.from, - kind: crate::metrics::CancellationOutcome::Found, - }); - info!( - "cancelling query per user's request using key {key}, hostname {}, address: {}", - cancel_closure.hostname, cancel_closure.socket_addr - ); - cancel_closure.try_cancel_query(self.compute_config).await + tx.send_timeout(op, REDIS_SEND_TIMEOUT) + .await + .map_err(|e| { + tracing::warn!("failed to send GetCancelData for {key}: {e}"); + }) + .map_err(|()| CancelError::InternalError)?; + + let result = resp_rx.await.map_err(|e| { + tracing::warn!("failed to receive GetCancelData response: {e}"); + CancelError::InternalError + })?; + + let cancel_state_str: Option = match result { + Ok(mut state) => { + if state.len() == 1 { + Some(state.remove(0).1) + } else { + tracing::warn!("unexpected number of entries in cancel state: {state:?}"); + return Err(CancelError::InternalError); + } + } + Err(e) => { + tracing::warn!("failed to receive cancel state from redis: {e}"); + return Err(CancelError::InternalError); + } + }; + + let cancel_state: Option = match cancel_state_str { + Some(state) => { + let cancel_closure: CancelClosure = serde_json::from_str(&state).map_err(|e| { + tracing::warn!("failed to deserialize cancel state: {e}"); + CancelError::InternalError + })?; + Some(cancel_closure) + } + None => None, + }; + Ok(cancel_state) } - /// Try to cancel a running query for the corresponding connection. /// If the cancellation key is not found, it will be published to Redis. /// check_allowed - if true, check if the IP is allowed to cancel the query. /// Will fetch IP allowlist internally. 
/// /// return Result primarily for tests - pub(crate) async fn cancel_session_auth( + pub(crate) async fn cancel_session( &self, key: CancelKeyData, ctx: RequestContext, check_allowed: bool, auth_backend: &T, ) -> Result<(), CancelError> { - // TODO: check for unspecified address is only for backward compatibility, should be removed - if !ctx.peer_addr().is_unspecified() { - let subnet_key = match ctx.peer_addr() { - IpAddr::V4(ip) => IpNet::V4(Ipv4Net::new_assert(ip, 24).trunc()), // use defaut mask here - IpAddr::V6(ip) => IpNet::V6(Ipv6Net::new_assert(ip, 64).trunc()), - }; - if !self.limiter.lock_propagate_poison().check(subnet_key, 1) { - // log only the subnet part of the IP address to know which subnet is rate limited - tracing::warn!("Rate limit exceeded. Skipping cancellation message, {subnet_key}"); - Metrics::get() - .proxy - .cancellation_requests_total - .inc(CancellationRequest { - source: self.from, - kind: crate::metrics::CancellationOutcome::RateLimitExceeded, - }); - return Err(CancelError::RateLimit); - } + let subnet_key = match ctx.peer_addr() { + IpAddr::V4(ip) => IpNet::V4(Ipv4Net::new_assert(ip, 24).trunc()), // use defaut mask here + IpAddr::V6(ip) => IpNet::V6(Ipv6Net::new_assert(ip, 64).trunc()), + }; + if !self.limiter.lock_propagate_poison().check(subnet_key, 1) { + // log only the subnet part of the IP address to know which subnet is rate limited + tracing::warn!("Rate limit exceeded. Skipping cancellation message, {subnet_key}"); + Metrics::get() + .proxy + .cancellation_requests_total + .inc(CancellationRequest { + kind: crate::metrics::CancellationOutcome::RateLimitExceeded, + }); + return Err(CancelError::RateLimit); } - // NB: we should immediately release the lock after cloning the token. - let cancel_state = self.map.get(&key).and_then(|x| x.clone()); + let cancel_state = self.get_cancel_key(key).await.map_err(|e| { + tracing::warn!("failed to receive RedisOp response: {e}"); + CancelError::InternalError + })?; + let Some(cancel_closure) = cancel_state else { tracing::warn!("query cancellation key not found: {key}"); Metrics::get() .proxy .cancellation_requests_total .inc(CancellationRequest { - source: self.from, kind: crate::metrics::CancellationOutcome::NotFound, }); - - if ctx.session_id() == Uuid::nil() { - // was already published, do not publish it again - return Ok(()); - } - - match self - .client - .try_publish(key, ctx.session_id(), ctx.peer_addr()) - .await - { - Ok(()) => {} // do nothing - Err(e) => { - // log it here since cancel_session could be spawned in a task - tracing::error!("failed to publish cancellation key: {key}, error: {e}"); - return Err(CancelError::IO(std::io::Error::new( - std::io::ErrorKind::Other, - e.to_string(), - ))); - } - } - return Ok(()); + return Err(CancelError::NotFound); }; - let ip_allowlist = auth_backend - .get_allowed_ips(&ctx, &cancel_closure.user_info) - .await - .map_err(CancelError::AuthError)?; + if check_allowed { + let ip_allowlist = auth_backend + .get_allowed_ips(&ctx, &cancel_closure.user_info) + .await + .map_err(CancelError::AuthError)?; - if check_allowed && !check_peer_addr_is_in_list(&ctx.peer_addr(), &ip_allowlist) { - // log it here since cancel_session could be spawned in a task - tracing::warn!("IP is not allowed to cancel the query: {key}"); - return Err(CancelError::IpNotAllowed); + if !check_peer_addr_is_in_list(&ctx.peer_addr(), &ip_allowlist) { + // log it here since cancel_session could be spawned in a task + tracing::warn!( + "IP is not allowed to cancel the query: {key}, 
address: {}", + ctx.peer_addr() + ); + return Err(CancelError::IpNotAllowed); + } } Metrics::get() .proxy .cancellation_requests_total .inc(CancellationRequest { - source: self.from, kind: crate::metrics::CancellationOutcome::Found, }); info!("cancelling query per user's request using key {key}"); cancel_closure.try_cancel_query(self.compute_config).await } - - #[cfg(test)] - fn contains(&self, session: &Session
<P>
) -> bool { - self.map.contains_key(&session.key) - } - - #[cfg(test)] - fn is_empty(&self) -> bool { - self.map.is_empty() - } -} - -impl CancellationHandler<()> { - pub fn new( - compute_config: &'static ComputeConfig, - map: CancelMap, - from: CancellationSource, - ) -> Self { - Self { - compute_config, - map, - client: (), - from, - limiter: Arc::new(std::sync::Mutex::new( - LeakyBucketRateLimiter::::new_with_shards( - LeakyBucketRateLimiter::::DEFAULT, - 64, - ), - )), - } - } -} - -impl CancellationHandler>>> { - pub fn new( - compute_config: &'static ComputeConfig, - map: CancelMap, - client: Option>>, - from: CancellationSource, - ) -> Self { - Self { - compute_config, - map, - client, - from, - limiter: Arc::new(std::sync::Mutex::new( - LeakyBucketRateLimiter::::new_with_shards( - LeakyBucketRateLimiter::::DEFAULT, - 64, - ), - )), - } - } } /// This should've been a [`std::future::Future`], but /// it's impossible to name a type of an unboxed future /// (we'd need something like `#![feature(type_alias_impl_trait)]`). -#[derive(Clone)] +#[derive(Clone, Serialize, Deserialize)] pub struct CancelClosure { socket_addr: SocketAddr, cancel_token: CancelToken, - ip_allowlist: Vec, hostname: String, // for pg_sni router user_info: ComputeUserInfo, } @@ -349,14 +348,12 @@ impl CancelClosure { pub(crate) fn new( socket_addr: SocketAddr, cancel_token: CancelToken, - ip_allowlist: Vec, hostname: String, user_info: ComputeUserInfo, ) -> Self { Self { socket_addr, cancel_token, - ip_allowlist, hostname, user_info, } @@ -385,99 +382,75 @@ impl CancelClosure { debug!("query was cancelled"); Ok(()) } - - /// Obsolete (will be removed after moving CancelMap to Redis), only for notifications - pub(crate) fn set_ip_allowlist(&mut self, ip_allowlist: Vec) { - self.ip_allowlist = ip_allowlist; - } } /// Helper for registering query cancellation tokens. -pub(crate) struct Session
<P>
{ +pub(crate) struct Session { /// The user-facing key identifying this session. key: CancelKeyData, - /// The [`CancelMap`] this session belongs to. - cancellation_handler: Arc>, + redis_key: String, + cancellation_handler: Arc, } -impl

<P> Session<P>

{ - /// Store the cancel token for the given session. - /// This enables query cancellation in `crate::proxy::prepare_client_connection`. - pub(crate) fn enable_query_cancellation(&self, cancel_closure: CancelClosure) -> CancelKeyData { - debug!("enabling query cancellation for this session"); - self.cancellation_handler - .map - .insert(self.key, Some(cancel_closure)); - - self.key +impl Session { + pub(crate) fn key(&self) -> &CancelKeyData { + &self.key } -} -impl

<P> Drop for Session<P>

{ - fn drop(&mut self) { - self.cancellation_handler.map.remove(&self.key); - debug!("dropped query cancellation key {}", &self.key); - } -} - -#[cfg(test)] -#[expect(clippy::unwrap_used)] -mod tests { - use std::time::Duration; - - use super::*; - use crate::config::RetryConfig; - use crate::tls::client_config::compute_client_config_with_certs; - - fn config() -> ComputeConfig { - let retry = RetryConfig { - base_delay: Duration::from_secs(1), - max_retries: 5, - backoff_factor: 2.0, + // Send the store key op to the cancellation handler + pub(crate) async fn write_cancel_key( + &self, + cancel_closure: CancelClosure, + ) -> Result<(), CancelError> { + let Some(tx) = &self.cancellation_handler.tx else { + tracing::warn!("cancellation handler is not available"); + return Err(CancelError::InternalError); }; - ComputeConfig { - retry, - tls: Arc::new(compute_client_config_with_certs(std::iter::empty())), - timeout: Duration::from_secs(2), - } - } + let closure_json = serde_json::to_string(&cancel_closure).map_err(|e| { + tracing::warn!("failed to serialize cancel closure: {e}"); + CancelError::InternalError + })?; - #[tokio::test] - async fn check_session_drop() -> anyhow::Result<()> { - let cancellation_handler = Arc::new(CancellationHandler::<()>::new( - Box::leak(Box::new(config())), - CancelMap::default(), - CancellationSource::FromRedis, - )); - - let session = cancellation_handler.clone().get_session(); - assert!(cancellation_handler.contains(&session)); - drop(session); - // Check that the session has been dropped. - assert!(cancellation_handler.is_empty()); + let op = CancelKeyOp::StoreCancelKey { + key: self.redis_key.clone(), + field: "data".to_string(), + value: closure_json, + resp_tx: None, + _guard: Metrics::get() + .proxy + .cancel_channel_size + .guard(RedisMsgKind::HSet), + expire: CANCEL_KEY_TTL, + }; + let _ = tx.send_timeout(op, REDIS_SEND_TIMEOUT).await.map_err(|e| { + let key = self.key; + tracing::warn!("failed to send StoreCancelKey for {key}: {e}"); + }); Ok(()) } - #[tokio::test] - async fn cancel_session_noop_regression() { - let handler = CancellationHandler::<()>::new( - Box::leak(Box::new(config())), - CancelMap::default(), - CancellationSource::Local, - ); - handler - .cancel_session( - CancelKeyData { - backend_pid: 0, - cancel_key: 0, - }, - Uuid::new_v4(), - "127.0.0.1".parse().unwrap(), - true, - ) - .await - .unwrap(); + pub(crate) async fn remove_cancel_key(&self) -> Result<(), CancelError> { + let Some(tx) = &self.cancellation_handler.tx else { + tracing::warn!("cancellation handler is not available"); + return Err(CancelError::InternalError); + }; + + let op = CancelKeyOp::RemoveCancelKey { + key: self.redis_key.clone(), + field: "data".to_string(), + resp_tx: None, + _guard: Metrics::get() + .proxy + .cancel_channel_size + .guard(RedisMsgKind::HSet), + }; + + let _ = tx.send_timeout(op, REDIS_SEND_TIMEOUT).await.map_err(|e| { + let key = self.key; + tracing::warn!("failed to send RemoveCancelKey for {key}: {e}"); + }); + Ok(()) } } diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index aff796bbab..d71465765f 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -296,7 +296,6 @@ impl ConnCfg { process_id, secret_key, }, - vec![], // TODO: deprecated, will be removed host.to_string(), user_info, ); diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index 0c6755063f..78bfb6deac 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -6,7 +6,7 @@ use 
tokio_util::sync::CancellationToken; use tracing::{debug, error, info, Instrument}; use crate::auth::backend::ConsoleRedirectBackend; -use crate::cancellation::{CancellationHandlerMain, CancellationHandlerMainInternal}; +use crate::cancellation::CancellationHandler; use crate::config::{ProxyConfig, ProxyProtocolV2}; use crate::context::RequestContext; use crate::error::ReportableError; @@ -24,7 +24,7 @@ pub async fn task_main( backend: &'static ConsoleRedirectBackend, listener: tokio::net::TcpListener, cancellation_token: CancellationToken, - cancellation_handler: Arc, + cancellation_handler: Arc, ) -> anyhow::Result<()> { scopeguard::defer! { info!("proxy has shut down"); @@ -140,15 +140,16 @@ pub async fn task_main( Ok(()) } +#[allow(clippy::too_many_arguments)] pub(crate) async fn handle_client( config: &'static ProxyConfig, backend: &'static ConsoleRedirectBackend, ctx: &RequestContext, - cancellation_handler: Arc, + cancellation_handler: Arc, stream: S, conn_gauge: NumClientConnectionsGuard<'static>, cancellations: tokio_util::task::task_tracker::TaskTracker, -) -> Result>, ClientRequestError> { +) -> Result>, ClientRequestError> { debug!( protocol = %ctx.protocol(), "handling interactive connection from client" @@ -171,13 +172,13 @@ pub(crate) async fn handle_client( HandshakeData::Cancel(cancel_key_data) => { // spawn a task to cancel the session, but don't wait for it cancellations.spawn({ - let cancellation_handler_clone = Arc::clone(&cancellation_handler); + let cancellation_handler_clone = Arc::clone(&cancellation_handler); let ctx = ctx.clone(); let cancel_span = tracing::span!(parent: None, tracing::Level::INFO, "cancel_session", session_id = ?ctx.session_id()); cancel_span.follows_from(tracing::Span::current()); async move { cancellation_handler_clone - .cancel_session_auth( + .cancel_session( cancel_key_data, ctx, config.authentication_config.ip_allowlist_check_enabled, @@ -195,7 +196,7 @@ pub(crate) async fn handle_client( ctx.set_db_options(params.clone()); - let (node_info, user_info, ip_allowlist) = match backend + let (node_info, user_info, _ip_allowlist) = match backend .authenticate(ctx, &config.authentication_config, &mut stream) .await { @@ -220,10 +221,14 @@ pub(crate) async fn handle_client( .or_else(|e| stream.throw_error(e)) .await?; - node.cancel_closure - .set_ip_allowlist(ip_allowlist.unwrap_or_default()); - let session = cancellation_handler.get_session(); - prepare_client_connection(&node, &session, &mut stream).await?; + let cancellation_handler_clone = Arc::clone(&cancellation_handler); + let session = cancellation_handler_clone.get_key(); + + session + .write_cancel_key(node.cancel_closure.clone()) + .await?; + + prepare_client_connection(&node, *session.key(), &mut stream).await?; // Before proxy passing, forward to compute whatever data is left in the // PqStream input buffer. 
Normally there is none, but our serverless npm @@ -237,8 +242,8 @@ pub(crate) async fn handle_client( aux: node.aux.clone(), compute: node, session_id: ctx.session_id(), + cancel: session, _req: request_gauge, _conn: conn_gauge, - _cancel: session, })) } diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 659c57c865..f3d281a26b 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -56,6 +56,8 @@ pub struct ProxyMetrics { pub connection_requests: CounterPairVec, #[metric(flatten)] pub http_endpoint_pools: HttpEndpointPools, + #[metric(flatten)] + pub cancel_channel_size: CounterPairVec, /// Time it took for proxy to establish a connection to the compute endpoint. // largest bucket = 2^16 * 0.5ms = 32s @@ -294,6 +296,16 @@ impl CounterPairAssoc for NumConnectionRequestsGauge { pub type NumConnectionRequestsGuard<'a> = metrics::MeasuredCounterPairGuard<'a, NumConnectionRequestsGauge>; +pub struct CancelChannelSizeGauge; +impl CounterPairAssoc for CancelChannelSizeGauge { + const INC_NAME: &'static MetricName = MetricName::from_str("opened_msgs_cancel_channel_total"); + const DEC_NAME: &'static MetricName = MetricName::from_str("closed_msgs_cancel_channel_total"); + const INC_HELP: &'static str = "Number of processing messages in the cancellation channel."; + const DEC_HELP: &'static str = "Number of closed messages in the cancellation channel."; + type LabelGroupSet = StaticLabelSet; +} +pub type CancelChannelSizeGuard<'a> = metrics::MeasuredCounterPairGuard<'a, CancelChannelSizeGauge>; + #[derive(LabelGroup)] #[label(set = ComputeConnectionLatencySet)] pub struct ComputeConnectionLatencyGroup { @@ -340,13 +352,6 @@ pub struct RedisErrors<'a> { pub channel: &'a str, } -#[derive(FixedCardinalityLabel, Copy, Clone)] -pub enum CancellationSource { - FromClient, - FromRedis, - Local, -} - #[derive(FixedCardinalityLabel, Copy, Clone)] pub enum CancellationOutcome { NotFound, @@ -357,7 +362,6 @@ pub enum CancellationOutcome { #[derive(LabelGroup)] #[label(set = CancellationRequestSet)] pub struct CancellationRequest { - pub source: CancellationSource, pub kind: CancellationOutcome, } @@ -369,6 +373,16 @@ pub enum Waiting { RetryTimeout, } +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "kind")] +pub enum RedisMsgKind { + HSet, + HSetMultiple, + HGet, + HGetAll, + HDel, +} + #[derive(Default)] struct Accumulated { cplane: time::Duration, diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index 63f93f0a91..ab173bd0d0 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -13,8 +13,9 @@ pub use copy_bidirectional::{copy_bidirectional_client_compute, ErrorSource}; use futures::{FutureExt, TryFutureExt}; use itertools::Itertools; use once_cell::sync::OnceCell; -use pq_proto::{BeMessage as Be, StartupMessageParams}; +use pq_proto::{BeMessage as Be, CancelKeyData, StartupMessageParams}; use regex::Regex; +use serde::{Deserialize, Serialize}; use smol_str::{format_smolstr, SmolStr}; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; @@ -23,7 +24,7 @@ use tracing::{debug, error, info, warn, Instrument}; use self::connect_compute::{connect_to_compute, TcpMechanism}; use self::passthrough::ProxyPassthrough; -use crate::cancellation::{self, CancellationHandlerMain, CancellationHandlerMainInternal}; +use crate::cancellation::{self, CancellationHandler}; use crate::config::{ProxyConfig, ProxyProtocolV2, TlsConfig}; use crate::context::RequestContext; use crate::error::ReportableError; @@ -57,7 +58,7 @@ pub async fn 
task_main( auth_backend: &'static auth::Backend<'static, ()>, listener: tokio::net::TcpListener, cancellation_token: CancellationToken, - cancellation_handler: Arc, + cancellation_handler: Arc, endpoint_rate_limiter: Arc, ) -> anyhow::Result<()> { scopeguard::defer! { @@ -243,13 +244,13 @@ pub(crate) async fn handle_client( config: &'static ProxyConfig, auth_backend: &'static auth::Backend<'static, ()>, ctx: &RequestContext, - cancellation_handler: Arc, + cancellation_handler: Arc, stream: S, mode: ClientMode, endpoint_rate_limiter: Arc, conn_gauge: NumClientConnectionsGuard<'static>, cancellations: tokio_util::task::task_tracker::TaskTracker, -) -> Result>, ClientRequestError> { +) -> Result>, ClientRequestError> { debug!( protocol = %ctx.protocol(), "handling interactive connection from client" @@ -278,7 +279,7 @@ pub(crate) async fn handle_client( cancel_span.follows_from(tracing::Span::current()); async move { cancellation_handler_clone - .cancel_session_auth( + .cancel_session( cancel_key_data, ctx, config.authentication_config.ip_allowlist_check_enabled, @@ -312,7 +313,7 @@ pub(crate) async fn handle_client( }; let user = user_info.get_user().to_owned(); - let (user_info, ip_allowlist) = match user_info + let (user_info, _ip_allowlist) = match user_info .authenticate( ctx, &mut stream, @@ -356,10 +357,14 @@ pub(crate) async fn handle_client( .or_else(|e| stream.throw_error(e)) .await?; - node.cancel_closure - .set_ip_allowlist(ip_allowlist.unwrap_or_default()); - let session = cancellation_handler.get_session(); - prepare_client_connection(&node, &session, &mut stream).await?; + let cancellation_handler_clone = Arc::clone(&cancellation_handler); + let session = cancellation_handler_clone.get_key(); + + session + .write_cancel_key(node.cancel_closure.clone()) + .await?; + + prepare_client_connection(&node, *session.key(), &mut stream).await?; // Before proxy passing, forward to compute whatever data is left in the // PqStream input buffer. Normally there is none, but our serverless npm @@ -373,23 +378,19 @@ pub(crate) async fn handle_client( aux: node.aux.clone(), compute: node, session_id: ctx.session_id(), + cancel: session, _req: request_gauge, _conn: conn_gauge, - _cancel: session, })) } /// Finish client connection initialization: confirm auth success, send params, etc. #[tracing::instrument(skip_all)] -pub(crate) async fn prepare_client_connection
<P>
( +pub(crate) async fn prepare_client_connection( node: &compute::PostgresConnection, - session: &cancellation::Session
<P>
, + cancel_key_data: CancelKeyData, stream: &mut PqStream, ) -> Result<(), std::io::Error> { - // Register compute's query cancellation token and produce a new, unique one. - // The new token (cancel_key_data) will be sent to the client. - let cancel_key_data = session.enable_query_cancellation(node.cancel_closure.clone()); - // Forward all deferred notices to the client. for notice in &node.delayed_notice { stream.write_message_noflush(&Be::Raw(b'N', notice.as_bytes()))?; @@ -411,7 +412,7 @@ pub(crate) async fn prepare_client_connection
<P>
( Ok(()) } -#[derive(Debug, Clone, PartialEq, Eq, Default)] +#[derive(Debug, Clone, PartialEq, Eq, Default, Serialize, Deserialize)] pub(crate) struct NeonOptions(Vec<(SmolStr, SmolStr)>); impl NeonOptions { diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs index a42f9aad39..08871380d6 100644 --- a/proxy/src/proxy/passthrough.rs +++ b/proxy/src/proxy/passthrough.rs @@ -56,18 +56,18 @@ pub(crate) async fn proxy_pass( Ok(()) } -pub(crate) struct ProxyPassthrough { +pub(crate) struct ProxyPassthrough { pub(crate) client: Stream, pub(crate) compute: PostgresConnection, pub(crate) aux: MetricsAuxInfo, pub(crate) session_id: uuid::Uuid, + pub(crate) cancel: cancellation::Session, pub(crate) _req: NumConnectionRequestsGuard<'static>, pub(crate) _conn: NumClientConnectionsGuard<'static>, - pub(crate) _cancel: cancellation::Session
<P>
, } -impl ProxyPassthrough { +impl ProxyPassthrough { pub(crate) async fn proxy_pass( self, compute_config: &ComputeConfig, @@ -81,6 +81,9 @@ impl ProxyPassthrough { { tracing::warn!(session_id = ?self.session_id, ?err, "could not cancel the query in the database"); } + + drop(self.cancel.remove_cancel_key().await); // we don't need a result. If the queue is full, we just log the error + res } } diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index 6f6a8c9d47..ec080f270b 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -138,6 +138,12 @@ impl RateBucketInfo { Self::new(200, Duration::from_secs(600)), ]; + // For all the sessions will be cancel key. So this limit is essentially global proxy limit. + pub const DEFAULT_REDIS_SET: [Self; 2] = [ + Self::new(100_000, Duration::from_secs(1)), + Self::new(50_000, Duration::from_secs(10)), + ]; + /// All of these are per endpoint-maskedip pair. /// Context: 4096 rounds of pbkdf2 take about 1ms of cpu time to execute (1 milli-cpu-second or 1mcpus). /// diff --git a/proxy/src/redis/cancellation_publisher.rs b/proxy/src/redis/cancellation_publisher.rs index 228dbb7f64..30d8b83e60 100644 --- a/proxy/src/redis/cancellation_publisher.rs +++ b/proxy/src/redis/cancellation_publisher.rs @@ -2,12 +2,10 @@ use core::net::IpAddr; use std::sync::Arc; use pq_proto::CancelKeyData; -use redis::AsyncCommands; use tokio::sync::Mutex; use uuid::Uuid; use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider; -use super::notifications::{CancelSession, Notification, PROXY_CHANNEL_NAME}; use crate::rate_limiter::{GlobalRateLimiter, RateBucketInfo}; pub trait CancellationPublisherMut: Send + Sync + 'static { @@ -83,9 +81,10 @@ impl CancellationPublisher for Arc> { } pub struct RedisPublisherClient { + #[allow(dead_code)] client: ConnectionWithCredentialsProvider, - region_id: String, - limiter: GlobalRateLimiter, + _region_id: String, + _limiter: GlobalRateLimiter, } impl RedisPublisherClient { @@ -96,26 +95,12 @@ impl RedisPublisherClient { ) -> anyhow::Result { Ok(Self { client, - region_id, - limiter: GlobalRateLimiter::new(info.into()), + _region_id: region_id, + _limiter: GlobalRateLimiter::new(info.into()), }) } - async fn publish( - &mut self, - cancel_key_data: CancelKeyData, - session_id: Uuid, - peer_addr: IpAddr, - ) -> anyhow::Result<()> { - let payload = serde_json::to_string(&Notification::Cancel(CancelSession { - region_id: Some(self.region_id.clone()), - cancel_key_data, - session_id, - peer_addr: Some(peer_addr), - }))?; - let _: () = self.client.publish(PROXY_CHANNEL_NAME, payload).await?; - Ok(()) - } + #[allow(dead_code)] pub(crate) async fn try_connect(&mut self) -> anyhow::Result<()> { match self.client.connect().await { Ok(()) => {} @@ -126,49 +111,4 @@ impl RedisPublisherClient { } Ok(()) } - async fn try_publish_internal( - &mut self, - cancel_key_data: CancelKeyData, - session_id: Uuid, - peer_addr: IpAddr, - ) -> anyhow::Result<()> { - // TODO: review redundant error duplication logs. - if !self.limiter.check() { - tracing::info!("Rate limit exceeded. Skipping cancellation message"); - return Err(anyhow::anyhow!("Rate limit exceeded")); - } - match self.publish(cancel_key_data, session_id, peer_addr).await { - Ok(()) => return Ok(()), - Err(e) => { - tracing::error!("failed to publish a message: {e}"); - } - } - tracing::info!("Publisher is disconnected. 
Reconnectiong..."); - self.try_connect().await?; - self.publish(cancel_key_data, session_id, peer_addr).await - } -} - -impl CancellationPublisherMut for RedisPublisherClient { - async fn try_publish( - &mut self, - cancel_key_data: CancelKeyData, - session_id: Uuid, - peer_addr: IpAddr, - ) -> anyhow::Result<()> { - tracing::info!("publishing cancellation key to Redis"); - match self - .try_publish_internal(cancel_key_data, session_id, peer_addr) - .await - { - Ok(()) => { - tracing::debug!("cancellation key successfuly published to Redis"); - Ok(()) - } - Err(e) => { - tracing::error!("failed to publish a message: {e}"); - Err(e) - } - } - } } diff --git a/proxy/src/redis/connection_with_credentials_provider.rs b/proxy/src/redis/connection_with_credentials_provider.rs index 0f6e765b02..b5c3d13216 100644 --- a/proxy/src/redis/connection_with_credentials_provider.rs +++ b/proxy/src/redis/connection_with_credentials_provider.rs @@ -29,6 +29,7 @@ impl Clone for Credentials { /// Provides PubSub connection without credentials refresh. pub struct ConnectionWithCredentialsProvider { credentials: Credentials, + // TODO: with more load on the connection, we should consider using a connection pool con: Option, refresh_token_task: Option>, mutex: tokio::sync::Mutex<()>, diff --git a/proxy/src/redis/keys.rs b/proxy/src/redis/keys.rs new file mode 100644 index 0000000000..dddc7e2054 --- /dev/null +++ b/proxy/src/redis/keys.rs @@ -0,0 +1,88 @@ +use anyhow::Ok; +use pq_proto::{id_to_cancel_key, CancelKeyData}; +use serde::{Deserialize, Serialize}; +use std::io::ErrorKind; + +pub mod keyspace { + pub const CANCEL_PREFIX: &str = "cancel"; +} + +#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] +pub(crate) enum KeyPrefix { + #[serde(untagged)] + Cancel(CancelKeyData), +} + +impl KeyPrefix { + pub(crate) fn build_redis_key(&self) -> String { + match self { + KeyPrefix::Cancel(key) => { + let hi = (key.backend_pid as u64) << 32; + let lo = (key.cancel_key as u64) & 0xffff_ffff; + let id = hi | lo; + let keyspace = keyspace::CANCEL_PREFIX; + format!("{keyspace}:{id:x}") + } + } + } + + #[allow(dead_code)] + pub(crate) fn as_str(&self) -> &'static str { + match self { + KeyPrefix::Cancel(_) => keyspace::CANCEL_PREFIX, + } + } +} + +#[allow(dead_code)] +pub(crate) fn parse_redis_key(key: &str) -> anyhow::Result { + let (prefix, key_str) = key.split_once(':').ok_or_else(|| { + anyhow::anyhow!(std::io::Error::new( + ErrorKind::InvalidData, + "missing prefix" + )) + })?; + + match prefix { + keyspace::CANCEL_PREFIX => { + let id = u64::from_str_radix(key_str, 16)?; + + Ok(KeyPrefix::Cancel(id_to_cancel_key(id))) + } + _ => Err(anyhow::anyhow!(std::io::Error::new( + ErrorKind::InvalidData, + "unknown prefix" + ))), + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_build_redis_key() { + let cancel_key: KeyPrefix = KeyPrefix::Cancel(CancelKeyData { + backend_pid: 12345, + cancel_key: 54321, + }); + + let redis_key = cancel_key.build_redis_key(); + assert_eq!(redis_key, "cancel:30390000d431"); + } + + #[test] + fn test_parse_redis_key() { + let redis_key = "cancel:30390000d431"; + let key: KeyPrefix = parse_redis_key(redis_key).expect("Failed to parse key"); + + let ref_key = CancelKeyData { + backend_pid: 12345, + cancel_key: 54321, + }; + + assert_eq!(key.as_str(), KeyPrefix::Cancel(ref_key).as_str()); + let KeyPrefix::Cancel(cancel_key) = key; + assert_eq!(ref_key, cancel_key); + } +} diff --git a/proxy/src/redis/kv_ops.rs b/proxy/src/redis/kv_ops.rs new file mode 100644 index 
0000000000..dcc6aac51b --- /dev/null +++ b/proxy/src/redis/kv_ops.rs @@ -0,0 +1,185 @@ +use redis::{AsyncCommands, ToRedisArgs}; + +use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider; + +use crate::rate_limiter::{GlobalRateLimiter, RateBucketInfo}; + +pub struct RedisKVClient { + client: ConnectionWithCredentialsProvider, + limiter: GlobalRateLimiter, +} + +impl RedisKVClient { + pub fn new(client: ConnectionWithCredentialsProvider, info: &'static [RateBucketInfo]) -> Self { + Self { + client, + limiter: GlobalRateLimiter::new(info.into()), + } + } + + pub async fn try_connect(&mut self) -> anyhow::Result<()> { + match self.client.connect().await { + Ok(()) => {} + Err(e) => { + tracing::error!("failed to connect to redis: {e}"); + return Err(e); + } + } + Ok(()) + } + + pub(crate) async fn hset(&mut self, key: K, field: F, value: V) -> anyhow::Result<()> + where + K: ToRedisArgs + Send + Sync, + F: ToRedisArgs + Send + Sync, + V: ToRedisArgs + Send + Sync, + { + if !self.limiter.check() { + tracing::info!("Rate limit exceeded. Skipping hset"); + return Err(anyhow::anyhow!("Rate limit exceeded")); + } + + match self.client.hset(&key, &field, &value).await { + Ok(()) => return Ok(()), + Err(e) => { + tracing::error!("failed to set a key-value pair: {e}"); + } + } + + tracing::info!("Redis client is disconnected. Reconnectiong..."); + self.try_connect().await?; + self.client + .hset(key, field, value) + .await + .map_err(anyhow::Error::new) + } + + #[allow(dead_code)] + pub(crate) async fn hset_multiple( + &mut self, + key: &str, + items: &[(K, V)], + ) -> anyhow::Result<()> + where + K: ToRedisArgs + Send + Sync, + V: ToRedisArgs + Send + Sync, + { + if !self.limiter.check() { + tracing::info!("Rate limit exceeded. Skipping hset_multiple"); + return Err(anyhow::anyhow!("Rate limit exceeded")); + } + + match self.client.hset_multiple(key, items).await { + Ok(()) => return Ok(()), + Err(e) => { + tracing::error!("failed to set a key-value pair: {e}"); + } + } + + tracing::info!("Redis client is disconnected. Reconnectiong..."); + self.try_connect().await?; + self.client + .hset_multiple(key, items) + .await + .map_err(anyhow::Error::new) + } + + #[allow(dead_code)] + pub(crate) async fn expire(&mut self, key: K, seconds: i64) -> anyhow::Result<()> + where + K: ToRedisArgs + Send + Sync, + { + if !self.limiter.check() { + tracing::info!("Rate limit exceeded. Skipping expire"); + return Err(anyhow::anyhow!("Rate limit exceeded")); + } + + match self.client.expire(&key, seconds).await { + Ok(()) => return Ok(()), + Err(e) => { + tracing::error!("failed to set a key-value pair: {e}"); + } + } + + tracing::info!("Redis client is disconnected. Reconnectiong..."); + self.try_connect().await?; + self.client + .expire(key, seconds) + .await + .map_err(anyhow::Error::new) + } + + #[allow(dead_code)] + pub(crate) async fn hget(&mut self, key: K, field: F) -> anyhow::Result + where + K: ToRedisArgs + Send + Sync, + F: ToRedisArgs + Send + Sync, + V: redis::FromRedisValue, + { + if !self.limiter.check() { + tracing::info!("Rate limit exceeded. Skipping hget"); + return Err(anyhow::anyhow!("Rate limit exceeded")); + } + + match self.client.hget(&key, &field).await { + Ok(value) => return Ok(value), + Err(e) => { + tracing::error!("failed to get a value: {e}"); + } + } + + tracing::info!("Redis client is disconnected. 
Reconnectiong..."); + self.try_connect().await?; + self.client + .hget(key, field) + .await + .map_err(anyhow::Error::new) + } + + pub(crate) async fn hget_all(&mut self, key: K) -> anyhow::Result + where + K: ToRedisArgs + Send + Sync, + V: redis::FromRedisValue, + { + if !self.limiter.check() { + tracing::info!("Rate limit exceeded. Skipping hgetall"); + return Err(anyhow::anyhow!("Rate limit exceeded")); + } + + match self.client.hgetall(&key).await { + Ok(value) => return Ok(value), + Err(e) => { + tracing::error!("failed to get a value: {e}"); + } + } + + tracing::info!("Redis client is disconnected. Reconnectiong..."); + self.try_connect().await?; + self.client.hgetall(key).await.map_err(anyhow::Error::new) + } + + pub(crate) async fn hdel(&mut self, key: K, field: F) -> anyhow::Result<()> + where + K: ToRedisArgs + Send + Sync, + F: ToRedisArgs + Send + Sync, + { + if !self.limiter.check() { + tracing::info!("Rate limit exceeded. Skipping hdel"); + return Err(anyhow::anyhow!("Rate limit exceeded")); + } + + match self.client.hdel(&key, &field).await { + Ok(()) => return Ok(()), + Err(e) => { + tracing::error!("failed to delete a key-value pair: {e}"); + } + } + + tracing::info!("Redis client is disconnected. Reconnectiong..."); + self.try_connect().await?; + self.client + .hdel(key, field) + .await + .map_err(anyhow::Error::new) + } +} diff --git a/proxy/src/redis/mod.rs b/proxy/src/redis/mod.rs index a322f0368c..8b46a8e6ca 100644 --- a/proxy/src/redis/mod.rs +++ b/proxy/src/redis/mod.rs @@ -1,4 +1,6 @@ pub mod cancellation_publisher; pub mod connection_with_credentials_provider; pub mod elasticache; +pub mod keys; +pub mod kv_ops; pub mod notifications; diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index 63cdf6176c..19fdd3280d 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -6,18 +6,14 @@ use pq_proto::CancelKeyData; use redis::aio::PubSub; use serde::{Deserialize, Serialize}; use tokio_util::sync::CancellationToken; -use tracing::Instrument; use uuid::Uuid; use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider; use crate::cache::project_info::ProjectInfoCache; -use crate::cancellation::{CancelMap, CancellationHandler}; -use crate::config::ProxyConfig; use crate::intern::{ProjectIdInt, RoleNameInt}; use crate::metrics::{Metrics, RedisErrors, RedisEventsCount}; const CPLANE_CHANNEL_NAME: &str = "neondb-proxy-ws-updates"; -pub(crate) const PROXY_CHANNEL_NAME: &str = "neondb-proxy-to-proxy-updates"; const RECONNECT_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(20); const INVALIDATION_LAG: std::time::Duration = std::time::Duration::from_secs(20); @@ -25,8 +21,6 @@ async fn try_connect(client: &ConnectionWithCredentialsProvider) -> anyhow::Resu let mut conn = client.get_async_pubsub().await?; tracing::info!("subscribing to a channel `{CPLANE_CHANNEL_NAME}`"); conn.subscribe(CPLANE_CHANNEL_NAME).await?; - tracing::info!("subscribing to a channel `{PROXY_CHANNEL_NAME}`"); - conn.subscribe(PROXY_CHANNEL_NAME).await?; Ok(conn) } @@ -71,8 +65,6 @@ pub(crate) enum Notification { deserialize_with = "deserialize_json_string" )] PasswordUpdate { password_update: PasswordUpdate }, - #[serde(rename = "/cancel_session")] - Cancel(CancelSession), #[serde( other, @@ -138,7 +130,6 @@ where struct MessageHandler { cache: Arc, - cancellation_handler: Arc>, region_id: String, } @@ -146,23 +137,14 @@ impl Clone for MessageHandler { fn clone(&self) -> Self { Self { cache: 
self.cache.clone(), - cancellation_handler: self.cancellation_handler.clone(), region_id: self.region_id.clone(), } } } impl MessageHandler { - pub(crate) fn new( - cache: Arc, - cancellation_handler: Arc>, - region_id: String, - ) -> Self { - Self { - cache, - cancellation_handler, - region_id, - } + pub(crate) fn new(cache: Arc, region_id: String) -> Self { + Self { cache, region_id } } pub(crate) async fn increment_active_listeners(&self) { @@ -207,46 +189,6 @@ impl MessageHandler { tracing::debug!(?msg, "received a message"); match msg { - Notification::Cancel(cancel_session) => { - tracing::Span::current().record( - "session_id", - tracing::field::display(cancel_session.session_id), - ); - Metrics::get() - .proxy - .redis_events_count - .inc(RedisEventsCount::CancelSession); - if let Some(cancel_region) = cancel_session.region_id { - // If the message is not for this region, ignore it. - if cancel_region != self.region_id { - return Ok(()); - } - } - - // TODO: Remove unspecified peer_addr after the complete migration to the new format - let peer_addr = cancel_session - .peer_addr - .unwrap_or(std::net::IpAddr::V4(std::net::Ipv4Addr::UNSPECIFIED)); - let cancel_span = tracing::span!(parent: None, tracing::Level::INFO, "cancel_session", session_id = ?cancel_session.session_id); - cancel_span.follows_from(tracing::Span::current()); - // This instance of cancellation_handler doesn't have a RedisPublisherClient so it can't publish the message. - match self - .cancellation_handler - .cancel_session( - cancel_session.cancel_key_data, - uuid::Uuid::nil(), - peer_addr, - cancel_session.peer_addr.is_some(), - ) - .instrument(cancel_span) - .await - { - Ok(()) => {} - Err(e) => { - tracing::warn!("failed to cancel session: {e}"); - } - } - } Notification::AllowedIpsUpdate { .. } | Notification::PasswordUpdate { .. } | Notification::BlockPublicOrVpcAccessUpdated { .. } @@ -293,7 +235,6 @@ fn invalidate_cache(cache: Arc, msg: Notification) { password_update.project_id, password_update.role_name, ), - Notification::Cancel(_) => unreachable!("cancel message should be handled separately"), Notification::BlockPublicOrVpcAccessUpdated { .. } => { // https://github.com/neondatabase/neon/pull/10073 } @@ -323,8 +264,8 @@ async fn handle_messages( } Err(e) => { tracing::error!( - "failed to connect to redis: {e}, will try to reconnect in {RECONNECT_TIMEOUT:#?}" - ); + "failed to connect to redis: {e}, will try to reconnect in {RECONNECT_TIMEOUT:#?}" + ); tokio::time::sleep(RECONNECT_TIMEOUT).await; continue; } @@ -350,21 +291,14 @@ async fn handle_messages( /// Handle console's invalidation messages. #[tracing::instrument(name = "redis_notifications", skip_all)] pub async fn task_main( - config: &'static ProxyConfig, redis: ConnectionWithCredentialsProvider, cache: Arc, - cancel_map: CancelMap, region_id: String, ) -> anyhow::Result where C: ProjectInfoCache + Send + Sync + 'static, { - let cancellation_handler = Arc::new(CancellationHandler::<()>::new( - &config.connect_to_compute, - cancel_map, - crate::metrics::CancellationSource::FromRedis, - )); - let handler = MessageHandler::new(cache, cancellation_handler, region_id); + let handler = MessageHandler::new(cache, region_id); // 6h - 1m. // There will be 1 minute overlap between two tasks. But at least we can be sure that no message is lost. 
let mut interval = tokio::time::interval(std::time::Duration::from_secs(6 * 60 * 60 - 60)); @@ -442,35 +376,6 @@ mod tests { Ok(()) } - #[test] - fn parse_cancel_session() -> anyhow::Result<()> { - let cancel_key_data = CancelKeyData { - backend_pid: 42, - cancel_key: 41, - }; - let uuid = uuid::Uuid::new_v4(); - let msg = Notification::Cancel(CancelSession { - cancel_key_data, - region_id: None, - session_id: uuid, - peer_addr: None, - }); - let text = serde_json::to_string(&msg)?; - let result: Notification = serde_json::from_str(&text)?; - assert_eq!(msg, result); - - let msg = Notification::Cancel(CancelSession { - cancel_key_data, - region_id: Some("region".to_string()), - session_id: uuid, - peer_addr: None, - }); - let text = serde_json::to_string(&msg)?; - let result: Notification = serde_json::from_str(&text)?; - assert_eq!(msg, result,); - - Ok(()) - } #[test] fn parse_unknown_topic() -> anyhow::Result<()> { diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs index c2623e0eca..6888772362 100644 --- a/proxy/src/serverless/mod.rs +++ b/proxy/src/serverless/mod.rs @@ -43,7 +43,7 @@ use tokio_util::task::TaskTracker; use tracing::{info, warn, Instrument}; use utils::http::error::ApiError; -use crate::cancellation::CancellationHandlerMain; +use crate::cancellation::CancellationHandler; use crate::config::{ProxyConfig, ProxyProtocolV2}; use crate::context::RequestContext; use crate::ext::TaskExt; @@ -61,7 +61,7 @@ pub async fn task_main( auth_backend: &'static crate::auth::Backend<'static, ()>, ws_listener: TcpListener, cancellation_token: CancellationToken, - cancellation_handler: Arc, + cancellation_handler: Arc, endpoint_rate_limiter: Arc, ) -> anyhow::Result<()> { scopeguard::defer! { @@ -318,7 +318,7 @@ async fn connection_handler( backend: Arc, connections: TaskTracker, cancellations: TaskTracker, - cancellation_handler: Arc, + cancellation_handler: Arc, endpoint_rate_limiter: Arc, cancellation_token: CancellationToken, conn: AsyncRW, @@ -412,7 +412,7 @@ async fn request_handler( config: &'static ProxyConfig, backend: Arc, ws_connections: TaskTracker, - cancellation_handler: Arc, + cancellation_handler: Arc, session_id: uuid::Uuid, conn_info: ConnectionInfo, // used to cancel in-flight HTTP requests. not used to cancel websockets diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index 47326c1181..585a7d63b2 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -12,7 +12,7 @@ use pin_project_lite::pin_project; use tokio::io::{self, AsyncBufRead, AsyncRead, AsyncWrite, ReadBuf}; use tracing::warn; -use crate::cancellation::CancellationHandlerMain; +use crate::cancellation::CancellationHandler; use crate::config::ProxyConfig; use crate::context::RequestContext; use crate::error::{io_error, ReportableError}; @@ -129,7 +129,7 @@ pub(crate) async fn serve_websocket( auth_backend: &'static crate::auth::Backend<'static, ()>, ctx: RequestContext, websocket: OnUpgrade, - cancellation_handler: Arc, + cancellation_handler: Arc, endpoint_rate_limiter: Arc, hostname: Option, cancellations: tokio_util::task::task_tracker::TaskTracker, From 2f82c21c638578332b61593bdf2fc83fe41de2db Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 29 Jan 2025 12:55:24 +0000 Subject: [PATCH 290/583] chore: update rust-postgres fork (#10557) I updated the fork to fix some lints. 
Cargo keeps getting confused by it so let's just update the lockfile here --- Cargo.lock | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c19fdc0941..f14f4cdb82 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4449,7 +4449,7 @@ dependencies = [ [[package]] name = "postgres" version = "0.19.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#511f998c00148ab7c847bd7e6cfd3a906d0e7473" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#547812b5e4972425fc3a9108cf9bae39e41ee000" dependencies = [ "bytes", "fallible-iterator", @@ -4462,7 +4462,7 @@ dependencies = [ [[package]] name = "postgres-protocol" version = "0.6.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#511f998c00148ab7c847bd7e6cfd3a906d0e7473" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#547812b5e4972425fc3a9108cf9bae39e41ee000" dependencies = [ "base64 0.20.0", "byteorder", @@ -4514,9 +4514,10 @@ dependencies = [ [[package]] name = "postgres-types" version = "0.2.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#511f998c00148ab7c847bd7e6cfd3a906d0e7473" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#547812b5e4972425fc3a9108cf9bae39e41ee000" dependencies = [ "bytes", + "chrono", "fallible-iterator", "postgres-protocol 0.6.4", ] @@ -6859,7 +6860,7 @@ dependencies = [ [[package]] name = "tokio-postgres" version = "0.7.7" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#511f998c00148ab7c847bd7e6cfd3a906d0e7473" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#547812b5e4972425fc3a9108cf9bae39e41ee000" dependencies = [ "async-trait", "byteorder", From 34d9e2d8e33ba9afe1a7f68241fe84a9055a4754 Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Wed, 29 Jan 2025 16:01:56 +0100 Subject: [PATCH 291/583] Add a test for GrapgQL (#10156) ## Problem We currently don't run the tests shipped with `pg_graphql`. ## Summary of changes The tests for `pg_graphql` are added. 
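For reviewers who want to reproduce this locally, a rough sketch (the script path, the `TEST_VERSION_ONLY` variable, and the v16+ gate all come from `docker-compose/docker_compose_test.sh` below; the assumption that it is run from the repository root with a working Docker daemon is mine):

```bash
# Run the extension test pass for a single Postgres version.
# pg_graphql is only exercised for v16 and newer, matching the version gate in the script.
TEST_VERSION_ONLY=16 ./docker-compose/docker_compose_test.sh
```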
--- compute/compute-node.Dockerfile | 4 +++- compute/patches/pg_graphql.patch | 19 +++++++++++++++++++ docker-compose/docker_compose_test.sh | 5 +++-- .../ext-src/pg_graphql-src/neon-test.sh | 13 +++++++++++++ docker-compose/run-tests.sh | 17 ++++++++++------- 5 files changed, 48 insertions(+), 10 deletions(-) create mode 100644 compute/patches/pg_graphql.patch create mode 100755 docker-compose/ext-src/pg_graphql-src/neon-test.sh diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 539135470e..9d7aeda590 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1330,7 +1330,8 @@ COPY --from=vector-pg-build /pgvector.patch /ext-src/ COPY --from=pgjwt-pg-build /pgjwt.tar.gz /ext-src #COPY --from=pgrag-pg-build /usr/local/pgsql/ /usr/local/pgsql/ #COPY --from=pg-jsonschema-pg-build /home/nonroot/pg_jsonschema.tar.gz /ext-src -#COPY --from=pg-graphql-pg-build /home/nonroot/pg_graphql.tar.gz /ext-src +COPY --from=pg-graphql-pg-build /home/nonroot/pg_graphql.tar.gz /ext-src +COPY compute/patches/pg_graphql.patch /ext-src #COPY --from=pg-tiktoken-pg-build /home/nonroot/pg_tiktoken.tar.gz /ext-src COPY --from=hypopg-pg-build /hypopg.tar.gz /ext-src COPY --from=pg-hashids-pg-build /pg_hashids.tar.gz /ext-src @@ -1364,6 +1365,7 @@ RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch RUN cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan_${PG_VERSION}.patch COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh RUN patch -p1 +Date: Tue Dec 17 10:25:00 2024 +0100 + + Changes required to run tests on Neon + +diff --git a/test/expected/permissions_functions.out b/test/expected/permissions_functions.out +index 1e9fbc2..94cbe25 100644 +--- a/test/expected/permissions_functions.out ++++ b/test/expected/permissions_functions.out +@@ -64,7 +64,7 @@ begin; + select current_user; + current_user + -------------- +- postgres ++ cloud_admin + (1 row) + + -- revoke default access from the public role for new functions diff --git a/docker-compose/docker_compose_test.sh b/docker-compose/docker_compose_test.sh index f42aca673b..a05d6c043d 100755 --- a/docker-compose/docker_compose_test.sh +++ b/docker-compose/docker_compose_test.sh @@ -31,7 +31,7 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do echo "clean up containers if exists" cleanup PG_TEST_VERSION=$((pg_version < 16 ? 16 : pg_version)) - PG_VERSION=$pg_version PG_TEST_VERSION=$PG_TEST_VERSION docker compose --profile test-extensions -f $COMPOSE_FILE up --build -d + PG_VERSION=$pg_version PG_TEST_VERSION=$PG_TEST_VERSION docker compose --profile test-extensions -f $COMPOSE_FILE up --quiet-pull --build -d echo "wait until the compute is ready. timeout after 60s. " cnt=0 @@ -51,6 +51,7 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do done if [ $pg_version -ge 16 ]; then + docker cp ext-src $TEST_CONTAINER_NAME:/ # This is required for the pg_hint_plan test, to prevent flaky log message causing the test to fail # It cannot be moved to Dockerfile now because the database directory is created after the start of the container echo Adding dummy config @@ -61,7 +62,7 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/ext-src/pg_hint_plan-src/ rm -rf $TMPDIR # We are running tests now - if ! docker exec -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,pg_graphql-src,kq_imcx-src,wal2json_2_5-src \ + if ! 
docker exec -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,kq_imcx-src,wal2json_2_5-src \ $TEST_CONTAINER_NAME /run-tests.sh | tee testout.txt then FAILED=$(tail -1 testout.txt) diff --git a/docker-compose/ext-src/pg_graphql-src/neon-test.sh b/docker-compose/ext-src/pg_graphql-src/neon-test.sh new file mode 100755 index 0000000000..38bcb4bfb6 --- /dev/null +++ b/docker-compose/ext-src/pg_graphql-src/neon-test.sh @@ -0,0 +1,13 @@ +#!/bin/bash +set -ex +cd "$(dirname "${0}")" +dropdb --if-exists contrib_regression +createdb contrib_regression +PGXS="$(dirname "$(pg_config --pgxs)" )" +REGRESS="${PGXS}/../test/regress/pg_regress" +TESTDIR="test" +TESTS=$(ls "${TESTDIR}/sql" | sort ) +TESTS=${TESTS//\.sql/} +psql -v ON_ERROR_STOP=1 -f test/fixtures.sql -d contrib_regression +${REGRESS} --use-existing --dbname=contrib_regression --inputdir=${TESTDIR} ${TESTS} + diff --git a/docker-compose/run-tests.sh b/docker-compose/run-tests.sh index 9873187b62..1e794a42a1 100644 --- a/docker-compose/run-tests.sh +++ b/docker-compose/run-tests.sh @@ -4,14 +4,17 @@ set -x cd /ext-src || exit 2 FAILED= LIST=$( (echo -e "${SKIP//","/"\n"}"; ls -d -- *-src) | sort | uniq -u) -for d in ${LIST} -do - [ -d "${d}" ] || continue - if ! psql -w -c "select 1" >/dev/null; then - FAILED="${d} ${FAILED}" - break - fi +for d in ${LIST}; do + [ -d "${d}" ] || continue + if ! psql -w -c "select 1" >/dev/null; then + FAILED="${d} ${FAILED}" + break + fi + if [ -f "${d}/neon-test.sh" ]; then + "${d}/neon-test.sh" || FAILED="${d} ${FAILED}" + else USE_PGXS=1 make -C "${d}" installcheck || FAILED="${d} ${FAILED}" + fi done [ -z "${FAILED}" ] && exit 0 echo "${FAILED}" From 7922458b98c3f8f7519a60b3261d69cf00575a06 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Wed, 29 Jan 2025 09:45:36 -0600 Subject: [PATCH 292/583] Use num_cpus from the workspace in pageserver (#10545) Luckily they were the same version, so we didn't spend time compiling two versions, which could have been the case in the future. 
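A quick way to double-check that only one copy of the crate ends up in the dependency graph (just a sketch using stock cargo tooling, not something wired into CI here):

```bash
# num_cpus should not appear in the duplicate report after this change.
cargo tree --duplicates | grep num_cpus || echo "no duplicate num_cpus versions"
```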
Signed-off-by: Tristan Partin --- pageserver/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 9c835c956b..6e4eaa0efd 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -36,7 +36,7 @@ itertools.workspace = true md5.workspace = true nix.workspace = true # hack to get the number of worker threads tokio uses -num_cpus = { version = "1.15" } +num_cpus.workspace = true num-traits.workspace = true once_cell.workspace = true pin-project-lite.workspace = true From 34e560fe373821114049dd97144d80d31171b13b Mon Sep 17 00:00:00 2001 From: Mikhail Kot Date: Wed, 29 Jan 2025 15:52:00 +0000 Subject: [PATCH 293/583] download exporters from releases rather than using docker images (#10551) Use releases for postgres-exporter, pgbouncer-exporter, and sql-exporter --- compute/compute-node.Dockerfile | 40 +++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 9d7aeda590..7ac6e9bc58 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -5,6 +5,7 @@ ARG TAG=pinned ARG BUILD_TAG ARG DEBIAN_VERSION=bookworm ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim +ARG ALPINE_CURL_VERSION=8.11.1 ######################################################################################### # @@ -1266,16 +1267,31 @@ RUN set -e \ ######################################################################################### # -# Layers "postgres-exporter", "pgbouncer-exporter", and "sql-exporter" +# Layer "exporters" # ######################################################################################### - -FROM quay.io/prometheuscommunity/postgres-exporter:v0.16.0 AS postgres-exporter -FROM quay.io/prometheuscommunity/pgbouncer-exporter:v0.10.2 AS pgbouncer-exporter - -# Keep the version the same as in build-tools.Dockerfile and -# test_runner/regress/test_compute_metrics.py. 
-FROM burningalchemist/sql_exporter:0.17.0 AS sql-exporter +FROM alpine/curl:${ALPINE_CURL_VERSION} AS exporters +ARG TARGETARCH +# Keep sql_exporter version same as in build-tools.Dockerfile and +# test_runner/regress/test_compute_metrics.py +RUN if [ "$TARGETARCH" = "amd64" ]; then\ + postgres_exporter_sha256='027e75dda7af621237ff8f5ac66b78a40b0093595f06768612b92b1374bd3105';\ + pgbouncer_exporter_sha256='c9f7cf8dcff44f0472057e9bf52613d93f3ffbc381ad7547a959daa63c5e84ac';\ + sql_exporter_sha256='38e439732bbf6e28ca4a94d7bc3686d3fa1abdb0050773d5617a9efdb9e64d08';\ + else\ + postgres_exporter_sha256='131a376d25778ff9701a4c81f703f179e0b58db5c2c496e66fa43f8179484786';\ + pgbouncer_exporter_sha256='217c4afd7e6492ae904055bc14fe603552cf9bac458c063407e991d68c519da3';\ + sql_exporter_sha256='11918b00be6e2c3a67564adfdb2414fdcbb15a5db76ea17d1d1a944237a893c6';\ + fi\ + && curl -sL https://github.com/prometheus-community/postgres_exporter/releases/download/v0.16.0/postgres_exporter-0.16.0.linux-${TARGETARCH}.tar.gz\ + | tar xzf - --strip-components=1 -C.\ + && curl -sL https://github.com/prometheus-community/pgbouncer_exporter/releases/download/v0.10.2/pgbouncer_exporter-0.10.2.linux-${TARGETARCH}.tar.gz\ + | tar xzf - --strip-components=1 -C.\ + && curl -sL https://github.com/burningalchemist/sql_exporter/releases/download/0.17.0/sql_exporter-0.17.0.linux-${TARGETARCH}.tar.gz\ + | tar xzf - --strip-components=1 -C.\ + && echo "${postgres_exporter_sha256} postgres_exporter" | sha256sum -c -\ + && echo "${pgbouncer_exporter_sha256} pgbouncer_exporter" | sha256sum -c -\ + && echo "${sql_exporter_sha256} sql_exporter" | sha256sum -c - ######################################################################################### # @@ -1403,10 +1419,10 @@ COPY --chmod=0666 --chown=postgres compute/etc/pgbouncer.ini /etc/pgbouncer.ini COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/local_proxy /usr/local/bin/local_proxy RUN mkdir -p /etc/local_proxy && chown postgres:postgres /etc/local_proxy -# Metrics exporter binaries and configuration files -COPY --from=postgres-exporter /bin/postgres_exporter /bin/postgres_exporter -COPY --from=pgbouncer-exporter /bin/pgbouncer_exporter /bin/pgbouncer_exporter -COPY --from=sql-exporter /bin/sql_exporter /bin/sql_exporter +# Metrics exporter binaries and configuration files +COPY --from=exporters ./postgres_exporter /bin/postgres_exporter +COPY --from=exporters ./pgbouncer_exporter /bin/pgbouncer_exporter +COPY --from=exporters ./sql_exporter /bin/sql_exporter COPY --chown=postgres compute/etc/postgres_exporter.yml /etc/postgres_exporter.yml From 190c19c0344b3720f62e80679634872d390aaa3a Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 29 Jan 2025 17:02:07 +0000 Subject: [PATCH 294/583] chore: update rust-postgres on rebase (#10561) I tried a full update of our tokio-postgres fork before. We hit some breaking change. This PR only pulls in ~50% of the changes from upstream: https://github.com/neondatabase/rust-postgres/pull/38. 
--- Cargo.lock | 58 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f14f4cdb82..9ba90355df 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1312,7 +1312,7 @@ dependencies = [ "tar", "thiserror 1.0.69", "tokio", - "tokio-postgres 0.7.7", + "tokio-postgres 0.7.9", "tokio-stream", "tokio-util", "tower 0.5.2", @@ -1421,7 +1421,7 @@ dependencies = [ "storage_broker", "thiserror 1.0.69", "tokio", - "tokio-postgres 0.7.7", + "tokio-postgres 0.7.9", "tokio-util", "toml", "toml_edit", @@ -4060,8 +4060,8 @@ dependencies = [ "pageserver_compaction", "pin-project-lite", "postgres", - "postgres-protocol 0.6.4", - "postgres-types 0.2.4", + "postgres-protocol 0.6.6", + "postgres-types 0.2.6", "postgres_backend", "postgres_connection", "postgres_ffi", @@ -4092,7 +4092,7 @@ dependencies = [ "tokio", "tokio-epoll-uring", "tokio-io-timeout", - "tokio-postgres 0.7.7", + "tokio-postgres 0.7.9", "tokio-stream", "tokio-tar", "tokio-util", @@ -4150,7 +4150,7 @@ dependencies = [ "serde", "thiserror 1.0.69", "tokio", - "tokio-postgres 0.7.7", + "tokio-postgres 0.7.9", "tokio-stream", "tokio-util", "utils", @@ -4448,23 +4448,23 @@ dependencies = [ [[package]] name = "postgres" -version = "0.19.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#547812b5e4972425fc3a9108cf9bae39e41ee000" +version = "0.19.6" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#8b44892f7851e705810b2cb54504325699966070" dependencies = [ "bytes", "fallible-iterator", "futures-util", "log", "tokio", - "tokio-postgres 0.7.7", + "tokio-postgres 0.7.9", ] [[package]] name = "postgres-protocol" -version = "0.6.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#547812b5e4972425fc3a9108cf9bae39e41ee000" +version = "0.6.6" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#8b44892f7851e705810b2cb54504325699966070" dependencies = [ - "base64 0.20.0", + "base64 0.21.1", "byteorder", "bytes", "fallible-iterator", @@ -4513,13 +4513,13 @@ dependencies = [ [[package]] name = "postgres-types" -version = "0.2.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#547812b5e4972425fc3a9108cf9bae39e41ee000" +version = "0.2.6" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#8b44892f7851e705810b2cb54504325699966070" dependencies = [ "bytes", "chrono", "fallible-iterator", - "postgres-protocol 0.6.4", + "postgres-protocol 0.6.6", ] [[package]] @@ -4555,7 +4555,7 @@ dependencies = [ "serde", "thiserror 1.0.69", "tokio", - "tokio-postgres 0.7.7", + "tokio-postgres 0.7.9", "tokio-postgres-rustls", "tokio-rustls 0.26.0", "tokio-util", @@ -4570,7 +4570,7 @@ dependencies = [ "itertools 0.10.5", "once_cell", "postgres", - "tokio-postgres 0.7.7", + "tokio-postgres 0.7.9", "url", ] @@ -4664,7 +4664,7 @@ dependencies = [ "byteorder", "bytes", "itertools 0.10.5", - "postgres-protocol 0.6.4", + "postgres-protocol 0.6.6", "rand 0.8.5", "serde", "thiserror 1.0.69", @@ -4912,7 +4912,7 @@ dependencies = [ "tikv-jemalloc-ctl", "tikv-jemallocator", "tokio", - "tokio-postgres 0.7.7", + "tokio-postgres 0.7.9", "tokio-postgres2", "tokio-rustls 0.26.0", "tokio-tungstenite 0.21.0", @@ -5700,7 +5700,7 @@ dependencies = [ "pageserver_api", "parking_lot 0.12.1", "postgres", - "postgres-protocol 0.6.4", + "postgres-protocol 0.6.6", "postgres_backend", "postgres_ffi", "pprof", @@ -5724,7 +5724,7 @@ dependencies = [ "tikv-jemallocator", 
"tokio", "tokio-io-timeout", - "tokio-postgres 0.7.7", + "tokio-postgres 0.7.9", "tokio-stream", "tokio-tar", "tokio-util", @@ -6394,7 +6394,7 @@ dependencies = [ "serde_json", "storage_controller_client", "tokio", - "tokio-postgres 0.7.7", + "tokio-postgres 0.7.9", "tokio-postgres-rustls", "tokio-stream", "tokio-util", @@ -6859,8 +6859,8 @@ dependencies = [ [[package]] name = "tokio-postgres" -version = "0.7.7" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#547812b5e4972425fc3a9108cf9bae39e41ee000" +version = "0.7.9" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#8b44892f7851e705810b2cb54504325699966070" dependencies = [ "async-trait", "byteorder", @@ -6873,11 +6873,13 @@ dependencies = [ "percent-encoding", "phf", "pin-project-lite", - "postgres-protocol 0.6.4", - "postgres-types 0.2.4", + "postgres-protocol 0.6.6", + "postgres-types 0.2.6", + "rand 0.8.5", "socket2", "tokio", "tokio-util", + "whoami", ] [[package]] @@ -6915,7 +6917,7 @@ dependencies = [ "ring", "rustls 0.23.18", "tokio", - "tokio-postgres 0.7.7", + "tokio-postgres 0.7.9", "tokio-rustls 0.26.0", "x509-certificate", ] @@ -7593,7 +7595,7 @@ dependencies = [ "serde_json", "sysinfo", "tokio", - "tokio-postgres 0.7.7", + "tokio-postgres 0.7.9", "tokio-util", "tracing", "tracing-subscriber", From fdfbc7b358585a6b0ee013f0fa66ada3a0979eb0 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 29 Jan 2025 17:08:25 +0000 Subject: [PATCH 295/583] pageserver: hold GC while reading from a timeline (#10559) ## Problem If we are GC-ing because a new image layer was added while traversing the timeline, then it will remove layers that are required for fulfilling the current get request (read-path cannot "look back" and notice the new image layer). ## Summary of Changes Prevent GC from progressing on the current timeline while it is being visited for a read. Epic: https://github.com/neondatabase/neon/issues/9376 --- pageserver/src/tenant/timeline.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index f3cdad82d9..24bc7890c6 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3468,6 +3468,13 @@ impl Timeline { let mut completed_keyspace = KeySpace::default(); let mut image_covered_keyspace = KeySpaceRandomAccum::new(); + // Prevent GC from progressing while visiting the current timeline. + // If we are GC-ing because a new image layer was added while traversing + // the timeline, then it will remove layers that are required for fulfilling + // the current get request (read-path cannot "look back" and notice the new + // image layer). + let _gc_cutoff_holder = timeline.get_latest_gc_cutoff_lsn(); + loop { if cancel.is_cancelled() { return Err(GetVectoredError::Cancelled); From 34322b2424e9f00a4d8f4b07ce23464012dda151 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Wed, 29 Jan 2025 19:09:25 +0100 Subject: [PATCH 296/583] chore(compute): Simplify new compute_ctl metrics and fix flaky test (#10560) ## Problem 1. d04d924 added separate metrics for total requests and failures separately, but it doesn't make much sense. We could just have a unified counter with `http_status`. 2. `test_compute_migrations_retry` had a race, i.e., it was waiting for the last successful migration, not an actual failure. This was revealed after adding an assert on failure metric in d04d924. ## Summary of changes 1. Switch to unified counters for `compute_ctl` requests. 2. 
Add a waiting loop into `test_compute_migrations_retry` to eliminate the race. Part of neondatabase/cloud#17590 --- compute_tools/src/extension_server.rs | 26 +++++----- compute_tools/src/metrics.rs | 26 ++-------- compute_tools/src/spec.rs | 50 +++++++++---------- .../regress/test_compute_migrations.py | 33 +++++++----- 4 files changed, 61 insertions(+), 74 deletions(-) diff --git a/compute_tools/src/extension_server.rs b/compute_tools/src/extension_server.rs index fa638c74b3..64c338f4d7 100644 --- a/compute_tools/src/extension_server.rs +++ b/compute_tools/src/extension_server.rs @@ -85,7 +85,7 @@ use tracing::info; use tracing::log::warn; use zstd::stream::read::Decoder; -use crate::metrics::{REMOTE_EXT_REQUESTS_FAILED, REMOTE_EXT_REQUESTS_TOTAL, UNKNOWN_HTTP_STATUS}; +use crate::metrics::{REMOTE_EXT_REQUESTS_TOTAL, UNKNOWN_HTTP_STATUS}; fn get_pg_config(argument: &str, pgbin: &str) -> String { // gives the result of `pg_config [argument]` @@ -260,22 +260,20 @@ async fn download_extension_tar(ext_remote_storage: &str, ext_path: &str) -> Res info!("Download extension {:?} from uri {:?}", ext_path, uri); - REMOTE_EXT_REQUESTS_TOTAL.with_label_values(&[]).inc(); - match do_extension_server_request(&uri).await { Ok(resp) => { info!( "Successfully downloaded remote extension data {:?}", ext_path ); + REMOTE_EXT_REQUESTS_TOTAL + .with_label_values(&[&StatusCode::OK.to_string()]) + .inc(); Ok(resp) } Err((msg, status)) => { - let status_str = status - .map(|s| s.to_string()) - .unwrap_or(UNKNOWN_HTTP_STATUS.to_string()); - REMOTE_EXT_REQUESTS_FAILED - .with_label_values(&[&status_str]) + REMOTE_EXT_REQUESTS_TOTAL + .with_label_values(&[&status]) .inc(); bail!(msg); } @@ -283,12 +281,12 @@ async fn download_extension_tar(ext_remote_storage: &str, ext_path: &str) -> Res } // Do a single remote extensions server request. -// Return result or (error message + status code) in case of any failures. -async fn do_extension_server_request(uri: &str) -> Result)> { +// Return result or (error message + stringified status code) in case of any failures. 
+async fn do_extension_server_request(uri: &str) -> Result { let resp = reqwest::get(uri).await.map_err(|e| { ( format!("could not perform remote extensions server request: {}", e), - None, + UNKNOWN_HTTP_STATUS.to_string(), ) })?; let status = resp.status(); @@ -300,19 +298,19 @@ async fn do_extension_server_request(uri: &str) -> Result Err(( "remote extensions server is temporarily unavailable".to_string(), - Some(status), + status.to_string(), )), _ => Err(( format!( "unexpected remote extensions server response status code: {}", status ), - Some(status), + status.to_string(), )), } } diff --git a/compute_tools/src/metrics.rs b/compute_tools/src/metrics.rs index 3684338571..870b294d08 100644 --- a/compute_tools/src/metrics.rs +++ b/compute_tools/src/metrics.rs @@ -32,16 +32,7 @@ pub const UNKNOWN_HTTP_STATUS: &str = "unknown"; pub(crate) static CPLANE_REQUESTS_TOTAL: Lazy = Lazy::new(|| { register_int_counter_vec!( "compute_ctl_cplane_requests_total", - "Total number of control plane requests made by compute_ctl", - &["rpc"] - ) - .expect("failed to define a metric") -}); - -pub(crate) static CPLANE_REQUESTS_FAILED: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "compute_ctl_cplane_requests_failed_total", - "Total number of failed control plane requests made by compute_ctl", + "Total number of control plane requests made by compute_ctl by status", &["rpc", "http_status"] ) .expect("failed to define a metric") @@ -62,18 +53,9 @@ pub(crate) static DB_MIGRATION_FAILED: Lazy = Lazy::new(|| { pub(crate) static REMOTE_EXT_REQUESTS_TOTAL: Lazy = Lazy::new(|| { register_int_counter_vec!( "compute_ctl_remote_ext_requests_total", - "Total number of requests made by compute_ctl to download extensions from S3 proxy", + "Total number of requests made by compute_ctl to download extensions from S3 proxy by status", // Do not use any labels like extension name yet. // We can add them later if needed. 
- &[] - ) - .expect("failed to define a metric") -}); - -pub(crate) static REMOTE_EXT_REQUESTS_FAILED: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "compute_ctl_remote_ext_requests_failed_total", - "Total number of failed requests to S3 proxy", &["http_status"] ) .expect("failed to define a metric") @@ -82,9 +64,7 @@ pub(crate) static REMOTE_EXT_REQUESTS_FAILED: Lazy = Lazy::new(|| pub fn collect() -> Vec { let mut metrics = INSTALLED_EXTENSIONS.collect(); metrics.extend(CPLANE_REQUESTS_TOTAL.collect()); - metrics.extend(CPLANE_REQUESTS_FAILED.collect()); - metrics.extend(DB_MIGRATION_FAILED.collect()); metrics.extend(REMOTE_EXT_REQUESTS_TOTAL.collect()); - metrics.extend(REMOTE_EXT_REQUESTS_FAILED.collect()); + metrics.extend(DB_MIGRATION_FAILED.collect()); metrics } diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 01de13811f..43a820885b 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -6,9 +6,7 @@ use std::path::Path; use tracing::{error, info, instrument, warn}; use crate::config; -use crate::metrics::{ - CPlaneRequestRPC, CPLANE_REQUESTS_FAILED, CPLANE_REQUESTS_TOTAL, UNKNOWN_HTTP_STATUS, -}; +use crate::metrics::{CPlaneRequestRPC, CPLANE_REQUESTS_TOTAL, UNKNOWN_HTTP_STATUS}; use crate::migration::MigrationRunner; use crate::params::PG_HBA_ALL_MD5; use crate::pg_helpers::*; @@ -22,7 +20,7 @@ use compute_api::spec::ComputeSpec; fn do_control_plane_request( uri: &str, jwt: &str, -) -> Result)> { +) -> Result { let resp = reqwest::blocking::Client::new() .get(uri) .header("Authorization", format!("Bearer {}", jwt)) @@ -31,7 +29,7 @@ fn do_control_plane_request( ( true, format!("could not perform spec request to control plane: {}", e), - None, + UNKNOWN_HTTP_STATUS.to_string(), ) })?; @@ -42,13 +40,13 @@ fn do_control_plane_request( Err(e) => Err(( true, format!("could not deserialize control plane response: {}", e), - Some(status), + status.to_string(), )), }, StatusCode::SERVICE_UNAVAILABLE => Err(( true, "control plane is temporarily unavailable".to_string(), - Some(status), + status.to_string(), )), StatusCode::BAD_GATEWAY => { // We have a problem with intermittent 502 errors now @@ -57,7 +55,7 @@ fn do_control_plane_request( Err(( true, "control plane request failed with 502".to_string(), - Some(status), + status.to_string(), )) } // Another code, likely 500 or 404, means that compute is unknown to the control plane @@ -65,7 +63,7 @@ fn do_control_plane_request( _ => Err(( false, format!("unexpected control plane response status code: {}", status), - Some(status), + status.to_string(), )), } } @@ -92,26 +90,28 @@ pub fn get_spec_from_control_plane( // - no spec for compute yet (Empty state) -> return Ok(None) // - got spec -> return Ok(Some(spec)) while attempt < 4 { - CPLANE_REQUESTS_TOTAL - .with_label_values(&[CPlaneRequestRPC::GetSpec.as_str()]) - .inc(); spec = match do_control_plane_request(&cp_uri, &jwt) { - Ok(spec_resp) => match spec_resp.status { - ControlPlaneComputeStatus::Empty => Ok(None), - ControlPlaneComputeStatus::Attached => { - if let Some(spec) = spec_resp.spec { - Ok(Some(spec)) - } else { - bail!("compute is attached, but spec is empty") + Ok(spec_resp) => { + CPLANE_REQUESTS_TOTAL + .with_label_values(&[ + CPlaneRequestRPC::GetSpec.as_str(), + &StatusCode::OK.to_string(), + ]) + .inc(); + match spec_resp.status { + ControlPlaneComputeStatus::Empty => Ok(None), + ControlPlaneComputeStatus::Attached => { + if let Some(spec) = spec_resp.spec { + Ok(Some(spec)) + } else { + bail!("compute is attached, but 
spec is empty") + } } } - }, + } Err((retry, msg, status)) => { - let status_str = status - .map(|s| s.to_string()) - .unwrap_or(UNKNOWN_HTTP_STATUS.to_string()); - CPLANE_REQUESTS_FAILED - .with_label_values(&[CPlaneRequestRPC::GetSpec.as_str(), &status_str]) + CPLANE_REQUESTS_TOTAL + .with_label_values(&[CPlaneRequestRPC::GetSpec.as_str(), &status]) .inc(); if retry { Err(anyhow!(msg)) diff --git a/test_runner/regress/test_compute_migrations.py b/test_runner/regress/test_compute_migrations.py index ec2e38f021..0dbb187c39 100644 --- a/test_runner/regress/test_compute_migrations.py +++ b/test_runner/regress/test_compute_migrations.py @@ -6,6 +6,7 @@ from typing import TYPE_CHECKING, cast import pytest from fixtures.compute_migrations import COMPUTE_MIGRATIONS, NUM_COMPUTE_MIGRATIONS from fixtures.metrics import parse_metrics +from fixtures.utils import wait_until if TYPE_CHECKING: from fixtures.neon_fixtures import NeonEnv @@ -24,7 +25,26 @@ def test_compute_migrations_retry(neon_simple_env: NeonEnv, compute_migrations_d for i in range(1, NUM_COMPUTE_MIGRATIONS + 1): endpoint.start(env={"FAILPOINTS": f"compute-migration=return({i})"}) - # Make sure that the migrations ran + # Check that migration failure is properly recorded in the metrics + # + # N.B. wait_for_migrations() only waits till the last successful + # migration is applied. It doesn't wait till the migration failure due + # to the failpoint. This opens a race for checking the metrics. To avoid + # this, we first wait until the migration failure metric is seen. + def check_migration_failure_metrics(): + client = endpoint.http_client() + raw_metrics = client.metrics() + metrics = parse_metrics(raw_metrics) + failed_migration = metrics.query_all( + "compute_ctl_db_migration_failed_total", + ) + assert len(failed_migration) == 1 + for sample in failed_migration: + assert sample.value == 1 + + wait_until(check_migration_failure_metrics) + + # Make sure that all migrations before the failed one are applied endpoint.wait_for_migrations(wait_for=i - 1) # Confirm that we correctly recorded that in the @@ -34,17 +54,6 @@ def test_compute_migrations_retry(neon_simple_env: NeonEnv, compute_migrations_d migration_id = cast("int", cur.fetchall()[0][0]) assert migration_id == i - 1 - # Check that migration failure is properly recorded in the metrics - client = endpoint.http_client() - raw_metrics = client.metrics() - metrics = parse_metrics(raw_metrics) - failed_migration = metrics.query_all( - "compute_ctl_db_migration_failed_total", - ) - assert len(failed_migration) == 1 - for sample in failed_migration: - assert sample.value == 1 - endpoint.stop() endpoint.start() From 5bcefb4ee13b0ac5c04747a8b61b56dacdb65984 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Wed, 29 Jan 2025 13:43:39 -0500 Subject: [PATCH 297/583] fix(pageserver): compaction perftest wrt upper limit (#10564) ## Problem The config is added in https://github.com/neondatabase/neon/pull/10550 causing behavior change for l0 compaction. close https://github.com/neondatabase/neon/issues/10562 ## Summary of changes Fix the test case to consider the effect of upper_limit. 
Signed-off-by: Alex Chi Z --- pageserver/src/tenant/timeline/compaction.rs | 2 +- test_runner/performance/test_compaction.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 76dcc159ea..589aea18b4 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -1112,7 +1112,7 @@ impl Timeline { // Accumulate the size of layers in `deltas_to_compact` let mut deltas_to_compact_bytes = 0; - // Under normal circumstances, we will accumulate up to compaction_interval L0s of size + // Under normal circumstances, we will accumulate up to compaction_upper_limit L0s of size // checkpoint_distance each. To avoid edge cases using extra system resources, bound our // work in this function to only operate on this much delta data at once. let delta_size_limit = self.get_compaction_upper_limit() as u64 diff --git a/test_runner/performance/test_compaction.py b/test_runner/performance/test_compaction.py index 0cd1080fa7..eaa89ae754 100644 --- a/test_runner/performance/test_compaction.py +++ b/test_runner/performance/test_compaction.py @@ -75,6 +75,7 @@ def test_compaction_l0_memory(neon_compare: NeonCompare): # Initially disable compaction so that we will build up a stack of L0s "compaction_period": "0s", "gc_period": "0s", + "compaction_upper_limit": 12, } ) neon_compare.tenant = tenant_id @@ -91,6 +92,7 @@ def test_compaction_l0_memory(neon_compare: NeonCompare): tenant_conf = pageserver_http.tenant_config(tenant_id) assert tenant_conf.effective_config["checkpoint_distance"] == 256 * 1024 * 1024 assert tenant_conf.effective_config["compaction_threshold"] == 10 + assert tenant_conf.effective_config["compaction_upper_limit"] == 12 # Aim to write about 20 L0s, so that we will hit the limit on how many # to compact at once From 707a9260573c58b908f789c5e92301f9e5ff77ab Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Wed, 29 Jan 2025 13:22:01 -0600 Subject: [PATCH 298/583] Remove unused compute_ctl HTTP routes (#10544) These are not used anywhere within the platform, so let's remove dead code. Signed-off-by: Tristan Partin --- compute_tools/src/http/openapi_spec.yaml | 29 ---- compute_tools/src/http/routes/info.rs | 11 -- .../src/http/routes/installed_extensions.rs | 33 ---- compute_tools/src/http/routes/mod.rs | 2 - compute_tools/src/http/server.rs | 8 +- libs/compute_api/src/responses.rs | 5 - test_runner/fixtures/endpoint/http.py | 5 - test_runner/regress/test_compute_metrics.py | 91 +++++++++++ .../regress/test_installed_extensions.py | 154 ------------------ 9 files changed, 92 insertions(+), 246 deletions(-) delete mode 100644 compute_tools/src/http/routes/info.rs delete mode 100644 compute_tools/src/http/routes/installed_extensions.rs delete mode 100644 test_runner/regress/test_installed_extensions.py diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml index 50319cdd85..bbdb7d0917 100644 --- a/compute_tools/src/http/openapi_spec.yaml +++ b/compute_tools/src/http/openapi_spec.yaml @@ -68,35 +68,6 @@ paths: schema: $ref: "#/components/schemas/ComputeInsights" - /installed_extensions: - get: - tags: - - Info - summary: Get installed extensions. 
- description: "" - operationId: getInstalledExtensions - responses: - 200: - description: List of installed extensions - content: - application/json: - schema: - $ref: "#/components/schemas/InstalledExtensions" - /info: - get: - tags: - - Info - summary: Get info about the compute pod / VM. - description: "" - operationId: getInfo - responses: - 200: - description: Info - content: - application/json: - schema: - $ref: "#/components/schemas/Info" - /dbs_and_roles: get: tags: diff --git a/compute_tools/src/http/routes/info.rs b/compute_tools/src/http/routes/info.rs deleted file mode 100644 index 32d6fea74c..0000000000 --- a/compute_tools/src/http/routes/info.rs +++ /dev/null @@ -1,11 +0,0 @@ -use axum::response::Response; -use compute_api::responses::InfoResponse; -use http::StatusCode; - -use crate::http::JsonResponse; - -/// Get information about the physical characteristics about the compute. -pub(in crate::http) async fn get_info() -> Response { - let num_cpus = num_cpus::get_physical(); - JsonResponse::success(StatusCode::OK, &InfoResponse { num_cpus }) -} diff --git a/compute_tools/src/http/routes/installed_extensions.rs b/compute_tools/src/http/routes/installed_extensions.rs deleted file mode 100644 index db74a6b195..0000000000 --- a/compute_tools/src/http/routes/installed_extensions.rs +++ /dev/null @@ -1,33 +0,0 @@ -use std::sync::Arc; - -use axum::{extract::State, response::Response}; -use compute_api::responses::ComputeStatus; -use http::StatusCode; -use tokio::task; - -use crate::{compute::ComputeNode, http::JsonResponse, installed_extensions}; - -/// Get a list of installed extensions. -pub(in crate::http) async fn get_installed_extensions( - State(compute): State>, -) -> Response { - let status = compute.get_status(); - if status != ComputeStatus::Running { - return JsonResponse::invalid_status(status); - } - - let conf = compute.get_conn_conf(None); - let res = task::spawn_blocking(move || installed_extensions::get_installed_extensions(conf)) - .await - .unwrap(); - - match res { - Ok(installed_extensions) => { - JsonResponse::success(StatusCode::OK, Some(installed_extensions)) - } - Err(e) => JsonResponse::error( - StatusCode::INTERNAL_SERVER_ERROR, - format!("failed to get list of installed extensions: {e}"), - ), - } -} diff --git a/compute_tools/src/http/routes/mod.rs b/compute_tools/src/http/routes/mod.rs index 3efa1153ad..a67be7fd5a 100644 --- a/compute_tools/src/http/routes/mod.rs +++ b/compute_tools/src/http/routes/mod.rs @@ -10,9 +10,7 @@ pub(in crate::http) mod extension_server; pub(in crate::http) mod extensions; pub(in crate::http) mod failpoints; pub(in crate::http) mod grants; -pub(in crate::http) mod info; pub(in crate::http) mod insights; -pub(in crate::http) mod installed_extensions; pub(in crate::http) mod metrics; pub(in crate::http) mod metrics_json; pub(in crate::http) mod status; diff --git a/compute_tools/src/http/server.rs b/compute_tools/src/http/server.rs index da650585fc..e41ed9df2d 100644 --- a/compute_tools/src/http/server.rs +++ b/compute_tools/src/http/server.rs @@ -22,8 +22,7 @@ use uuid::Uuid; use super::routes::{ check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions, - grants, info as info_route, insights, installed_extensions, metrics, metrics_json, status, - terminate, + grants, insights, metrics, metrics_json, status, terminate, }; use crate::compute::ComputeNode; @@ -60,12 +59,7 @@ async fn serve(port: u16, compute: Arc) { ) .route("/extensions", post(extensions::install_extension)) .route("/grants", 
post(grants::add_grant)) - .route("/info", get(info_route::get_info)) .route("/insights", get(insights::get_insights)) - .route( - "/installed_extensions", - get(installed_extensions::get_installed_extensions), - ) .route("/metrics", get(metrics::get_metrics)) .route("/metrics.json", get(metrics_json::get_metrics)) .route("/status", get(status::get_status)) diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index 9ce605089b..5286e0e61d 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -15,11 +15,6 @@ pub struct GenericAPIError { pub error: String, } -#[derive(Debug, Clone, Serialize)] -pub struct InfoResponse { - pub num_cpus: usize, -} - #[derive(Debug, Clone, Serialize)] pub struct ExtensionInstallResponse { pub extension: PgIdent, diff --git a/test_runner/fixtures/endpoint/http.py b/test_runner/fixtures/endpoint/http.py index aa0d95fe80..6e8210e978 100644 --- a/test_runner/fixtures/endpoint/http.py +++ b/test_runner/fixtures/endpoint/http.py @@ -28,11 +28,6 @@ class EndpointHttpClient(requests.Session): res.raise_for_status() return res.text - def installed_extensions(self): - res = self.get(f"http://localhost:{self.port}/installed_extensions") - res.raise_for_status() - return res.json() - def extensions(self, extension: str, version: str, database: str): body = { "extension": extension, diff --git a/test_runner/regress/test_compute_metrics.py b/test_runner/regress/test_compute_metrics.py index 5dcc93acff..99d41e410a 100644 --- a/test_runner/regress/test_compute_metrics.py +++ b/test_runner/regress/test_compute_metrics.py @@ -5,16 +5,22 @@ import os import shutil import sys from enum import StrEnum +from logging import debug from pathlib import Path from typing import TYPE_CHECKING, cast import pytest import requests import yaml +from fixtures.endpoint.http import EndpointHttpClient from fixtures.log_helper import log +from fixtures.metrics import parse_metrics from fixtures.paths import BASE_DIR, COMPUTE_CONFIG_DIR +from fixtures.utils import wait_until +from prometheus_client.samples import Sample if TYPE_CHECKING: + from collections.abc import Callable from types import TracebackType from typing import Self, TypedDict @@ -467,3 +473,88 @@ def test_perf_counters(neon_simple_env: NeonEnv): cur.execute("CREATE EXTENSION neon VERSION '1.5'") cur.execute("SELECT * FROM neon_perf_counters") cur.execute("SELECT * FROM neon_backend_perf_counters") + + +def collect_metric( + client: EndpointHttpClient, + name: str, + filter: dict[str, str], + predicate: Callable[[list[Sample]], bool], +) -> Callable[[], list[Sample]]: + """ + Call this function as the first argument to wait_until(). + """ + + def __collect_metric() -> list[Sample]: + resp = client.metrics() + debug("Metrics: %s", resp) + m = parse_metrics(resp) + samples = m.query_all(name, filter) + debug("Samples: %s", samples) + assert predicate(samples), "predicate failed" + return samples + + return __collect_metric + + +def test_compute_installed_extensions_metric(neon_simple_env: NeonEnv): + """ + Test that the compute_installed_extensions properly reports accurate + results. Important to note that currently this metric is only gathered on + compute start. 
+ """ + env = neon_simple_env + + endpoint = env.endpoints.create_start("main") + + client = endpoint.http_client() + + def __has_plpgsql(samples: list[Sample]) -> bool: + """ + Check that plpgsql is installed in the template1 and postgres databases + """ + return len(samples) == 1 and samples[0].value == 2 + + wait_until( + collect_metric( + client, + "compute_installed_extensions", + {"extension_name": "plpgsql", "version": "1.0", "owned_by_superuser": "1"}, + __has_plpgsql, + ), + name="compute_installed_extensions", + ) + + # Install the neon extension, so we can check for it on the restart + endpoint.safe_psql("CREATE EXTENSION neon VERSION '1.0'") + + # The metric is only gathered on compute start, so restart to check if the + # neon extension will now be there. + endpoint.stop() + endpoint.start() + + client = endpoint.http_client() + + def __has_neon(samples: list[Sample]) -> bool: + return len(samples) == 1 and samples[0].value == 1 + + wait_until( + collect_metric( + client, + "compute_installed_extensions", + {"extension_name": "neon", "version": "1.0", "owned_by_superuser": "1"}, + __has_neon, + ), + name="compute_installed_extensions", + ) + + # Double check that we also still have plpgsql + wait_until( + collect_metric( + client, + "compute_installed_extensions", + {"extension_name": "plpgsql", "version": "1.0", "owned_by_superuser": "1"}, + __has_plpgsql, + ), + name="compute_installed_extensions", + ) diff --git a/test_runner/regress/test_installed_extensions.py b/test_runner/regress/test_installed_extensions.py deleted file mode 100644 index 4e51e7e10c..0000000000 --- a/test_runner/regress/test_installed_extensions.py +++ /dev/null @@ -1,154 +0,0 @@ -from __future__ import annotations - -import time -from logging import info -from typing import TYPE_CHECKING - -from fixtures.log_helper import log -from fixtures.metrics import parse_metrics - -if TYPE_CHECKING: - from fixtures.neon_fixtures import NeonEnv - - -def test_installed_extensions(neon_simple_env: NeonEnv): - """basic test for the endpoint that returns the list of installed extensions""" - - env = neon_simple_env - - env.create_branch("test_installed_extensions") - - endpoint = env.endpoints.create_start("test_installed_extensions") - - endpoint.safe_psql("CREATE DATABASE test_installed_extensions") - endpoint.safe_psql("CREATE DATABASE test_installed_extensions_2") - - client = endpoint.http_client() - res = client.installed_extensions() - - info("Extensions list: %s", res) - info("Extensions: %s", res["extensions"]) - # 'plpgsql' is a default extension that is always installed. 
- assert any( - ext["extname"] == "plpgsql" and ext["version"] == "1.0" for ext in res["extensions"] - ), "The 'plpgsql' extension is missing" - - # check that the neon_test_utils extension is not installed - assert not any( - ext["extname"] == "neon_test_utils" for ext in res["extensions"] - ), "The 'neon_test_utils' extension is installed" - - pg_conn = endpoint.connect(dbname="test_installed_extensions") - with pg_conn.cursor() as cur: - cur.execute("CREATE EXTENSION neon_test_utils") - cur.execute( - "SELECT default_version FROM pg_available_extensions WHERE name = 'neon_test_utils'" - ) - res = cur.fetchone() - neon_test_utils_version = res[0] - - with pg_conn.cursor() as cur: - cur.execute("CREATE EXTENSION neon version '1.1'") - - pg_conn_2 = endpoint.connect(dbname="test_installed_extensions_2") - with pg_conn_2.cursor() as cur: - cur.execute("CREATE EXTENSION neon version '1.2'") - - res = client.installed_extensions() - - info("Extensions list: %s", res) - info("Extensions: %s", res["extensions"]) - - # check that the neon_test_utils extension is installed only in 1 database - # and has the expected version - assert any( - ext["extname"] == "neon_test_utils" - and ext["version"] == neon_test_utils_version - and ext["n_databases"] == 1 - for ext in res["extensions"] - ) - - # check that the plpgsql extension is installed in all databases - # this is a default extension that is always installed - assert any(ext["extname"] == "plpgsql" and ext["n_databases"] == 4 for ext in res["extensions"]) - - # check that the neon extension is installed and has expected versions - for ext in res["extensions"]: - if ext["extname"] == "neon": - assert ext["version"] in ["1.1", "1.2"] - assert ext["n_databases"] == 1 - - with pg_conn.cursor() as cur: - cur.execute("ALTER EXTENSION neon UPDATE TO '1.3'") - - res = client.installed_extensions() - - info("Extensions list: %s", res) - info("Extensions: %s", res["extensions"]) - - # check that the neon_test_utils extension is updated - for ext in res["extensions"]: - if ext["extname"] == "neon": - assert ext["version"] in ["1.2", "1.3"] - assert ext["n_databases"] == 1 - - # check that /metrics endpoint is available - # ensure that we see the metric before and after restart - res = client.metrics() - info("Metrics: %s", res) - m = parse_metrics(res) - neon_m = m.query_all( - "compute_installed_extensions", - {"extension_name": "neon", "version": "1.2", "owned_by_superuser": "1"}, - ) - assert len(neon_m) == 1 - for sample in neon_m: - assert sample.value == 1 - neon_m = m.query_all( - "compute_installed_extensions", - {"extension_name": "neon", "version": "1.3", "owned_by_superuser": "1"}, - ) - assert len(neon_m) == 1 - for sample in neon_m: - assert sample.value == 1 - - endpoint.stop() - endpoint.start() - - timeout = 10 - while timeout > 0: - try: - res = client.metrics() - timeout = -1 - if len(parse_metrics(res).query_all("compute_installed_extensions")) < 4: - # Assume that not all metrics that are collected yet - time.sleep(1) - timeout -= 1 - continue - except Exception: - log.exception("failed to get metrics, assume they are not collected yet") - time.sleep(1) - timeout -= 1 - continue - - assert ( - len(parse_metrics(res).query_all("compute_installed_extensions")) >= 4 - ), "Not all metrics are collected" - - info("After restart metrics: %s", res) - m = parse_metrics(res) - neon_m = m.query_all( - "compute_installed_extensions", - {"extension_name": "neon", "version": "1.2", "owned_by_superuser": "1"}, - ) - assert len(neon_m) == 1 - for 
sample in neon_m: - assert sample.value == 1 - - neon_m = m.query_all( - "compute_installed_extensions", - {"extension_name": "neon", "version": "1.3", "owned_by_superuser": "1"}, - ) - assert len(neon_m) == 1 - for sample in neon_m: - assert sample.value == 1 From 62819aca366b72409b9ae5033f739ef86c8795ff Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Wed, 29 Jan 2025 21:21:42 +0100 Subject: [PATCH 299/583] Add PostgreSQL version 17 benchmarks (#10536) ## Problem benchmarking.yml so far is only running benchmarks with PostgreSQL version 16. However neon recently changed the default for new customers to PostgreSQL version 17. See related [epic](https://github.com/neondatabase/cloud/issues/23295) ## Summary of changes We do not want to run every job step with both pg 16 and 17 because this would need excessive resources (runners, computes) and extend the benchmarking run wall clock time too much. So we select an opinionated subset of testcases that we also report in weekly reporting and add a postgres v17 job step. For re-use projects associated Neon projects have been created and connection strings have been added to neon database organization secrets. A follow up is to add the reporting for these new runs to some grafana dashboards. --- .../workflows/_benchmarking_preparation.yml | 5 +- .github/workflows/benchmarking.yml | 128 +++++++++++++----- 2 files changed, 95 insertions(+), 38 deletions(-) diff --git a/.github/workflows/_benchmarking_preparation.yml b/.github/workflows/_benchmarking_preparation.yml index fd328586b3..71aef1430e 100644 --- a/.github/workflows/_benchmarking_preparation.yml +++ b/.github/workflows/_benchmarking_preparation.yml @@ -17,7 +17,7 @@ jobs: strategy: fail-fast: false matrix: - platform: [ aws-rds-postgres, aws-aurora-serverless-v2-postgres, neon ] + platform: [ aws-rds-postgres, aws-aurora-serverless-v2-postgres, neon, neon_pg17 ] database: [ clickbench, tpch, userexample ] env: @@ -41,6 +41,9 @@ jobs: neon) CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }} ;; + neon_pg17) + CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR_PG17 }} + ;; aws-rds-postgres) CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CONNSTR }} ;; diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index ab0f2a6155..32747d825c 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -63,11 +63,15 @@ jobs: fail-fast: false matrix: include: - - DEFAULT_PG_VERSION: 16 + - PG_VERSION: 16 PLATFORM: "neon-staging" region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }} RUNNER: [ self-hosted, us-east-2, x64 ] - - DEFAULT_PG_VERSION: 16 + - PG_VERSION: 17 + PLATFORM: "neon-staging" + region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }} + RUNNER: [ self-hosted, us-east-2, x64 ] + - PG_VERSION: 16 PLATFORM: "azure-staging" region_id: 'azure-eastus2' RUNNER: [ self-hosted, eastus2, x64 ] @@ -75,7 +79,7 @@ jobs: TEST_PG_BENCH_DURATIONS_MATRIX: "300" TEST_PG_BENCH_SCALES_MATRIX: "10,100" POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install - DEFAULT_PG_VERSION: ${{ matrix.DEFAULT_PG_VERSION }} + PG_VERSION: ${{ matrix.PG_VERSION }} TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} @@ -112,7 +116,7 @@ jobs: uses: ./.github/actions/neon-project-create with: region_id: ${{ matrix.region_id }} - postgres_version: ${{ env.DEFAULT_PG_VERSION }} + postgres_version: ${{ env.PG_VERSION }} api_key: ${{ secrets.NEON_STAGING_API_KEY }} - name: Run 
benchmark @@ -122,7 +126,7 @@ jobs: test_selection: performance run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} - pg_version: ${{ env.DEFAULT_PG_VERSION }} + pg_version: ${{ env.PG_VERSION }} aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} # Set --sparse-ordering option of pytest-order plugin # to ensure tests are running in order of appears in the file. @@ -313,7 +317,11 @@ jobs: { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned-bookworm" }, { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb","runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned-bookworm" }, { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb","runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned-bookworm" }, - { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }] + { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, + { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_default"', "image": "'"$image_default"'" }, + { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, + { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new-many-tables","db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, + { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }] }' if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then @@ -329,12 +337,15 @@ jobs: matrix='{ "platform": [ "neonvm-captest-reuse" - ] + ], + "pg_version" : [ + 16,17 + ], }' if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then - matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres" }, - { "platform": "rds-aurora" }]') + matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 16, "platform": "rds-postgres" }, + { "pg_version": 16, "platform": "rds-aurora" }]') fi echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT @@ -346,14 +357,14 @@ jobs: "platform": [ "neonvm-captest-reuse" ], - "scale": [ - "10" + "pg_version" : [ + 16,17 ] }' if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then - matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "scale": "10" }, - { "platform": "rds-aurora", "scale": "10" }]') + matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 16, "platform": "rds-postgres" }, + { "pg_version": 16, "platform": "rds-aurora" }]') fi echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT @@ -378,7 +389,7 @@ jobs: TEST_PG_BENCH_DURATIONS_MATRIX: "60m" TEST_PG_BENCH_SCALES_MATRIX: ${{ matrix.db_size }} POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install - DEFAULT_PG_VERSION: ${{ matrix.pg_version }} + PG_VERSION: ${{ matrix.pg_version }} TEST_OUTPUT: 
/tmp/test_output BUILD_TYPE: remote SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} @@ -416,7 +427,7 @@ jobs: uses: ./.github/actions/neon-project-create with: region_id: ${{ matrix.region_id }} - postgres_version: ${{ env.DEFAULT_PG_VERSION }} + postgres_version: ${{ env.PG_VERSION }} api_key: ${{ secrets.NEON_STAGING_API_KEY }} compute_units: ${{ (contains(matrix.platform, 'captest-freetier') && '[0.25, 0.25]') || '[1, 1]' }} @@ -459,7 +470,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_perf_many_relations - pg_version: ${{ env.DEFAULT_PG_VERSION }} + pg_version: ${{ env.PG_VERSION }} aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} @@ -475,7 +486,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_init - pg_version: ${{ env.DEFAULT_PG_VERSION }} + pg_version: ${{ env.PG_VERSION }} aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} @@ -490,7 +501,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_simple_update - pg_version: ${{ env.DEFAULT_PG_VERSION }} + pg_version: ${{ env.PG_VERSION }} aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} @@ -505,7 +516,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_select_only - pg_version: ${{ env.DEFAULT_PG_VERSION }} + pg_version: ${{ env.PG_VERSION }} aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} @@ -549,14 +560,19 @@ jobs: include: - PLATFORM: "neonvm-captest-pgvector" RUNNER: [ self-hosted, us-east-2, x64 ] + postgres_version: 16 + - PLATFORM: "neonvm-captest-pgvector-pg17" + RUNNER: [ self-hosted, us-east-2, x64 ] + postgres_version: 17 - PLATFORM: "azure-captest-pgvector" RUNNER: [ self-hosted, eastus2, x64 ] + postgres_version: 16 env: TEST_PG_BENCH_DURATIONS_MATRIX: "15m" TEST_PG_BENCH_SCALES_MATRIX: "1" POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install - DEFAULT_PG_VERSION: 16 + PG_VERSION: ${{ matrix.postgres_version }} TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote @@ -590,9 +606,13 @@ jobs: dpkg -x postgresql-client-16_16.6-1.pgdg120+1_${arch}.deb pg mkdir -p /tmp/neon/pg_install/v16/bin + mkdir -p /tmp/neon/pg_install/v17/bin ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/psql /tmp/neon/pg_install/v16/bin/psql ln -s /home/nonroot/pg/usr/lib/$(uname -m)-linux-gnu /tmp/neon/pg_install/v16/lib + ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v17/bin/pgbench + ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/psql /tmp/neon/pg_install/v17/bin/psql + ln -s /home/nonroot/pg/usr/lib/$(uname -m)-linux-gnu /tmp/neon/pg_install/v17/lib LD_LIBRARY_PATH="/home/nonroot/pg/usr/lib/$(uname -m)-linux-gnu:${LD_LIBRARY_PATH:-}" export LD_LIBRARY_PATH @@ -608,6 +628,9 @@ jobs: neonvm-captest-pgvector) CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }} ;; + neonvm-captest-pgvector-pg17) + CONNSTR=${{ 
secrets.BENCHMARK_PGVECTOR_CONNSTR_PG17 }} + ;; azure-captest-pgvector) CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR_AZURE }} ;; @@ -634,7 +657,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgvector_indexing - pg_version: ${{ env.DEFAULT_PG_VERSION }} + pg_version: ${{ env.PG_VERSION }} aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -649,7 +672,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 - pg_version: ${{ env.DEFAULT_PG_VERSION }} + pg_version: ${{ env.PG_VERSION }} aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} @@ -696,7 +719,7 @@ jobs: env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install - DEFAULT_PG_VERSION: 16 + PG_VERSION: ${{ matrix.pg_version }} TEST_OUTPUT: /tmp/test_output TEST_OLAP_COLLECT_EXPLAIN: ${{ github.event.inputs.collect_olap_explain }} TEST_OLAP_COLLECT_PG_STAT_STATEMENTS: ${{ github.event.inputs.collect_pg_stat_statements }} @@ -739,7 +762,18 @@ jobs: run: | case "${PLATFORM}" in neonvm-captest-reuse) - CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CLICKBENCH_10M_CONNSTR }} + case "${PG_VERSION}" in + 16) + CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CLICKBENCH_10M_CONNSTR_V16 }} + ;; + 17) + CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR_PG17 }} + ;; + *) + echo >&2 "Unsupported PG_VERSION=${PG_VERSION} for PLATFORM=${PLATFORM}" + exit 1 + ;; + esac ;; rds-aurora) CONNSTR=${{ secrets.BENCHMARK_RDS_AURORA_CLICKBENCH_10M_CONNSTR }} @@ -763,7 +797,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 43200 -k test_clickbench - pg_version: ${{ env.DEFAULT_PG_VERSION }} + pg_version: ${{ env.PG_VERSION }} aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -812,12 +846,11 @@ jobs: env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install - DEFAULT_PG_VERSION: 16 + PG_VERSION: ${{ matrix.pg_version }} TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} PLATFORM: ${{ matrix.platform }} - TEST_OLAP_SCALE: ${{ matrix.scale }} runs-on: [ self-hosted, us-east-2, x64 ] container: @@ -849,21 +882,31 @@ jobs: run: | case "${PLATFORM}" in neonvm-captest-reuse) - ENV_PLATFORM=CAPTEST_TPCH + case "${PG_VERSION}" in + 16) + CONNSTR_SECRET_NAME="BENCHMARK_CAPTEST_TPCH_S10_CONNSTR" + ;; + 17) + CONNSTR_SECRET_NAME="BENCHMARK_CAPTEST_CONNSTR_PG17" + ;; + *) + echo >&2 "Unsupported PG_VERSION=${PG_VERSION} for PLATFORM=${PLATFORM}" + exit 1 + ;; + esac ;; rds-aurora) - ENV_PLATFORM=RDS_AURORA_TPCH + CONNSTR_SECRET_NAME="BENCHMARK_RDS_AURORA_TPCH_S10_CONNSTR" ;; rds-postgres) - ENV_PLATFORM=RDS_POSTGRES_TPCH + CONNSTR_SECRET_NAME="BENCHMARK_RDS_POSTGRES_TPCH_S10_CONNSTR" ;; *) echo >&2 "Unknown PLATFORM=${PLATFORM}. 
Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'" exit 1 ;; esac - - CONNSTR_SECRET_NAME="BENCHMARK_${ENV_PLATFORM}_S${TEST_OLAP_SCALE}_CONNSTR" + echo "CONNSTR_SECRET_NAME=${CONNSTR_SECRET_NAME}" >> $GITHUB_ENV - name: Set up Connection String @@ -881,13 +924,13 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_tpch - pg_version: ${{ env.DEFAULT_PG_VERSION }} + pg_version: ${{ env.PG_VERSION }} aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} - TEST_OLAP_SCALE: ${{ matrix.scale }} + TEST_OLAP_SCALE: 10 - name: Create Allure report id: create-allure-report @@ -922,7 +965,7 @@ jobs: env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install - DEFAULT_PG_VERSION: 16 + PG_VERSION: ${{ matrix.pg_version }} TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} @@ -959,7 +1002,18 @@ jobs: run: | case "${PLATFORM}" in neonvm-captest-reuse) - CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_CAPTEST_CONNSTR }} + case "${PG_VERSION}" in + 16) + CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_CAPTEST_CONNSTR }} + ;; + 17) + CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR_PG17 }} + ;; + *) + echo >&2 "Unsupported PG_VERSION=${PG_VERSION} for PLATFORM=${PLATFORM}" + exit 1 + ;; + esac ;; rds-aurora) CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_RDS_AURORA_CONNSTR }} @@ -983,7 +1037,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_user_examples - pg_version: ${{ env.DEFAULT_PG_VERSION }} + pg_version: ${{ env.PG_VERSION }} aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" From de1c35fab32717a114b89e007c490fcc01faa5d9 Mon Sep 17 00:00:00 2001 From: Fedor Dikarev Date: Wed, 29 Jan 2025 22:02:54 +0100 Subject: [PATCH 300/583] add retries for apt, wget and curl (#10553) Ref: https://github.com/neondatabase/cloud/issues/23461 ## Problem > recent CI failure due to apt-get: ``` 4.266 E: Failed to fetch http://deb.debian.org/debian/pool/main/g/gcc-10/libgfortran5_10.2.1-6_arm64.deb Error reading from server - read (104: Connection reset by peer) [IP: 146.75.122.132 80] ``` https://github.com/neondatabase/neon/actions/runs/11144974698/job/30973537767?pr=9186 thinking about if there should be a mirror-selector at the beginning of the dockerfile so that it uses a debian mirror closer to the build server? ## Summary of changes We could consider adding local mirror or proxy and keep it close to our self-hosted runners. 
For now lets just add retries for `apt`, `wget` and `curl` thanks to @skyzh for reporting that in October 2024, I just finally found time to take a look here :) --- Dockerfile | 2 ++ build-tools.Dockerfile | 10 ++++++++++ compute/compute-node.Dockerfile | 13 ++++++++++++- docker-compose/compute_wrapper/Dockerfile | 7 ++++--- 4 files changed, 28 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index a8f7ae0a62..7ba54c8ca5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -64,6 +64,7 @@ ARG DEFAULT_PG_VERSION WORKDIR /data RUN set -e \ + && echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries \ && apt update \ && apt install -y \ libreadline-dev \ @@ -72,6 +73,7 @@ RUN set -e \ # System postgres for use with client libraries (e.g. in storage controller) postgresql-15 \ openssl \ + && rm -f /etc/apt/apt.conf.d/80-retries \ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \ && useradd -d /data neon \ && chown -R neon:neon /data diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile index 7a2ec9c43e..9c13e480c1 100644 --- a/build-tools.Dockerfile +++ b/build-tools.Dockerfile @@ -3,6 +3,10 @@ ARG DEBIAN_VERSION=bookworm FROM debian:bookworm-slim AS pgcopydb_builder ARG DEBIAN_VERSION +RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \ + echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc \ + echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc + RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \ set -e && \ apt update && \ @@ -61,6 +65,10 @@ RUN mkdir -p /pgcopydb/bin && \ COPY --from=pgcopydb_builder /usr/lib/postgresql/16/bin/pgcopydb /pgcopydb/bin/pgcopydb COPY --from=pgcopydb_builder /pgcopydb/lib/libpq.so.5 /pgcopydb/lib/libpq.so.5 +RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \ + echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc \ + echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc + # System deps # # 'gdb' is included so that we get backtraces of core dumps produced in @@ -218,6 +226,8 @@ RUN wget -O /tmp/libicu-${ICU_VERSION}.tgz https://github.com/unicode-org/icu/re USER nonroot:nonroot WORKDIR /home/nonroot +RUN echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /home/nonroot/.curlrc + # Python ENV PYTHON_VERSION=3.11.10 \ PYENV_ROOT=/home/nonroot/.pyenv \ diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 7ac6e9bc58..a428c61f34 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -18,6 +18,10 @@ ARG DEBIAN_VERSION # Use strict mode for bash to catch errors early SHELL ["/bin/bash", "-euo", "pipefail", "-c"] +RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \ + echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc \ + echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc + RUN case $DEBIAN_VERSION in \ # Version-specific installs for Bullseye (PG14-PG16): # The h3_pg extension needs a cmake 3.20+, but Debian bullseye has 3.18. 
@@ -838,6 +842,8 @@ ENV PATH="/home/nonroot/.cargo/bin:$PATH" USER nonroot WORKDIR /home/nonroot +RUN echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /home/nonroot/.curlrc + RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \ chmod +x rustup-init && \ ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \ @@ -874,6 +880,8 @@ ENV PATH="/home/nonroot/.cargo/bin:$PATH" USER nonroot WORKDIR /home/nonroot +RUN echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /home/nonroot/.curlrc + RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \ chmod +x rustup-init && \ ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \ @@ -1243,6 +1251,7 @@ RUN mold -run cargo build --locked --profile release-line-debug-size-lto --bin c FROM debian:$DEBIAN_FLAVOR AS pgbouncer RUN set -e \ + && echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries \ && apt update \ && apt install --no-install-suggests --no-install-recommends -y \ build-essential \ @@ -1444,6 +1453,8 @@ RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/loca # libboost* for rdkit # ca-certificates for communicating with s3 by compute_ctl +RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \ + echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc RUN apt update && \ case $DEBIAN_VERSION in \ @@ -1500,7 +1511,7 @@ RUN set -ex; \ else \ echo "Unsupported architecture: ${TARGETARCH}"; exit 1; \ fi; \ - curl -L "https://awscli.amazonaws.com/awscli-exe-linux-${TARGETARCH_ALT}-2.17.5.zip" -o /tmp/awscliv2.zip; \ + curl --retry 5 -L "https://awscli.amazonaws.com/awscli-exe-linux-${TARGETARCH_ALT}-2.17.5.zip" -o /tmp/awscliv2.zip; \ echo "${CHECKSUM} /tmp/awscliv2.zip" | sha256sum -c -; \ unzip /tmp/awscliv2.zip -d /tmp/awscliv2; \ /tmp/awscliv2/aws/install; \ diff --git a/docker-compose/compute_wrapper/Dockerfile b/docker-compose/compute_wrapper/Dockerfile index e2e5bc7248..61f44681da 100644 --- a/docker-compose/compute_wrapper/Dockerfile +++ b/docker-compose/compute_wrapper/Dockerfile @@ -7,11 +7,12 @@ FROM $REPOSITORY/${COMPUTE_IMAGE}:$TAG ARG COMPUTE_IMAGE USER root -RUN apt-get update && \ +RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \ + apt-get update && \ apt-get install -y curl \ jq \ netcat-openbsd #This is required for the pg_hintplan test -RUN mkdir -p /ext-src/pg_hint_plan-src && chown postgres /ext-src/pg_hint_plan-src +RUN mkdir -p /ext-src/pg_hint_plan-src && chown postgres /ext-src/pg_hint_plan-src -USER postgres \ No newline at end of file +USER postgres From ff298afb97253654f7daa5ed3c5977645a249a4f Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Wed, 29 Jan 2025 22:10:56 +0100 Subject: [PATCH 301/583] pageserver: add `level` for timeline layer metrics (#10563) ## Problem We don't have good observability for per-timeline compaction debt, specifically the number of delta layers in the frozen, L0, and L1 levels. Touches https://github.com/neondatabase/cloud/issues/23283. ## Summary of changes * Add a `level` label for `pageserver_layer_{count,size}` with values `l0`, `l1`, and `frozen`. * Track metrics for frozen layers. There is already a `kind={delta,image}` label. `kind=image` is only possible for `level=l1`. We don't include the currently open ephemeral layer, only frozen layers. 
There is always exactly 1 ephemeral layer, with a dynamic size which is already tracked in `pageserver_timeline_ephemeral_bytes`. --- pageserver/src/metrics.rs | 236 +++++++++++------- pageserver/src/tenant/storage_layer/layer.rs | 16 +- pageserver/src/tenant/timeline.rs | 2 +- .../src/tenant/timeline/layer_manager.rs | 8 + 4 files changed, 154 insertions(+), 108 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index d2c778276d..77c0967afc 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1,4 +1,13 @@ +use std::collections::HashMap; +use std::num::NonZeroUsize; +use std::pin::Pin; +use std::sync::atomic::AtomicU64; +use std::sync::{Arc, Mutex}; +use std::task::{Context, Poll}; +use std::time::{Duration, Instant}; + use enum_map::EnumMap; +use futures::Future; use metrics::{ register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec, register_int_counter, register_int_counter_pair_vec, register_int_counter_vec, @@ -11,13 +20,26 @@ use pageserver_api::config::{ PageServicePipeliningConfig, PageServicePipeliningConfigPipelined, PageServiceProtocolPipelinedExecutionStrategy, }; +use pageserver_api::models::InMemoryLayerInfo; use pageserver_api::shard::TenantShardId; +use pin_project_lite::pin_project; use postgres_backend::{is_expected_io_error, QueryError}; use pq_proto::framed::ConnectionError; -use strum::{EnumCount, VariantNames}; + +use strum::{EnumCount, IntoEnumIterator as _, VariantNames}; use strum_macros::{IntoStaticStr, VariantNames}; use utils::id::TimelineId; +use crate::config::PageServerConf; +use crate::context::{PageContentKind, RequestContext}; +use crate::task_mgr::TaskKind; +use crate::tenant::layer_map::LayerMap; +use crate::tenant::mgr::TenantSlot; +use crate::tenant::storage_layer::{InMemoryLayer, PersistentLayerDesc}; +use crate::tenant::tasks::BackgroundLoopKind; +use crate::tenant::throttle::ThrottleResult; +use crate::tenant::Timeline; + /// Prometheus histogram buckets (in seconds) for operations in the critical /// path. In other words, operations that directly affect that latency of user /// queries. @@ -443,18 +465,38 @@ static PITR_HISTORY_SIZE: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -#[derive(strum_macros::EnumString, strum_macros::Display, strum_macros::IntoStaticStr)] +#[derive( + strum_macros::EnumIter, + strum_macros::EnumString, + strum_macros::Display, + strum_macros::IntoStaticStr, +)] #[strum(serialize_all = "kebab_case")] -pub(crate) enum MetricLayerKind { +pub(crate) enum LayerKind { Delta, Image, } +#[derive( + strum_macros::EnumIter, + strum_macros::EnumString, + strum_macros::Display, + strum_macros::IntoStaticStr, +)] +#[strum(serialize_all = "kebab_case")] +pub(crate) enum LayerLevel { + // We don't track the currently open ephemeral layer, since there's always exactly 1 and its + // size changes. See `TIMELINE_EPHEMERAL_BYTES`. 
+ Frozen, + L0, + L1, +} + static TIMELINE_LAYER_SIZE: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_layer_bytes", - "Sum of layer physical sizes in bytes", - &["tenant_id", "shard_id", "timeline_id", "kind"] + "Sum of frozen, L0, and L1 layer physical sizes in bytes (excluding the open ephemeral layer)", + &["tenant_id", "shard_id", "timeline_id", "level", "kind"] ) .expect("failed to define a metric") }); @@ -462,8 +504,8 @@ static TIMELINE_LAYER_SIZE: Lazy = Lazy::new(|| { static TIMELINE_LAYER_COUNT: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_layer_count", - "Number of layers that exist", - &["tenant_id", "shard_id", "timeline_id", "kind"] + "Number of frozen, L0, and L1 layers (excluding the open ephemeral layer)", + &["tenant_id", "shard_id", "timeline_id", "level", "kind"] ) .expect("failed to define a metric") }); @@ -2590,10 +2632,6 @@ pub(crate) struct TimelineMetrics { pub disk_consistent_lsn_gauge: IntGauge, pub pitr_history_size: UIntGauge, pub archival_size: UIntGauge, - pub(crate) layer_size_image: UIntGauge, - pub(crate) layer_count_image: UIntGauge, - pub(crate) layer_size_delta: UIntGauge, - pub(crate) layer_count_delta: UIntGauge, pub standby_horizon_gauge: IntGauge, pub resident_physical_size_gauge: UIntGauge, pub visible_physical_size_gauge: UIntGauge, @@ -2691,42 +2729,6 @@ impl TimelineMetrics { .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); - let layer_size_image = TIMELINE_LAYER_SIZE - .get_metric_with_label_values(&[ - &tenant_id, - &shard_id, - &timeline_id, - MetricLayerKind::Image.into(), - ]) - .unwrap(); - - let layer_count_image = TIMELINE_LAYER_COUNT - .get_metric_with_label_values(&[ - &tenant_id, - &shard_id, - &timeline_id, - MetricLayerKind::Image.into(), - ]) - .unwrap(); - - let layer_size_delta = TIMELINE_LAYER_SIZE - .get_metric_with_label_values(&[ - &tenant_id, - &shard_id, - &timeline_id, - MetricLayerKind::Delta.into(), - ]) - .unwrap(); - - let layer_count_delta = TIMELINE_LAYER_COUNT - .get_metric_with_label_values(&[ - &tenant_id, - &shard_id, - &timeline_id, - MetricLayerKind::Delta.into(), - ]) - .unwrap(); - let standby_horizon_gauge = STANDBY_HORIZON .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); @@ -2791,10 +2793,6 @@ impl TimelineMetrics { disk_consistent_lsn_gauge, pitr_history_size, archival_size, - layer_size_image, - layer_count_image, - layer_size_delta, - layer_count_delta, standby_horizon_gauge, resident_physical_size_gauge, visible_physical_size_gauge, @@ -2837,6 +2835,92 @@ impl TimelineMetrics { .add(duration); } + /// Generates TIMELINE_LAYER labels for a persistent layer. + fn make_layer_labels(&self, layer_desc: &PersistentLayerDesc) -> [&str; 5] { + let level = match LayerMap::is_l0(&layer_desc.key_range, layer_desc.is_delta()) { + true => LayerLevel::L0, + false => LayerLevel::L1, + }; + let kind = match layer_desc.is_delta() { + true => LayerKind::Delta, + false => LayerKind::Image, + }; + [ + &self.tenant_id, + &self.shard_id, + &self.timeline_id, + level.into(), + kind.into(), + ] + } + + /// Generates TIMELINE_LAYER labels for a frozen ephemeral layer. + fn make_frozen_layer_labels(&self, _layer: &InMemoryLayer) -> [&str; 5] { + [ + &self.tenant_id, + &self.shard_id, + &self.timeline_id, + LayerLevel::Frozen.into(), + LayerKind::Delta.into(), // by definition + ] + } + + /// Removes a frozen ephemeral layer to TIMELINE_LAYER metrics. 
+ pub fn dec_frozen_layer(&self, layer: &InMemoryLayer) { + assert!(matches!(layer.info(), InMemoryLayerInfo::Frozen { .. })); + let labels = self.make_frozen_layer_labels(layer); + let size = layer.try_len().expect("frozen layer should have no writer"); + TIMELINE_LAYER_COUNT + .get_metric_with_label_values(&labels) + .unwrap() + .dec(); + TIMELINE_LAYER_SIZE + .get_metric_with_label_values(&labels) + .unwrap() + .sub(size); + } + + /// Adds a frozen ephemeral layer to TIMELINE_LAYER metrics. + pub fn inc_frozen_layer(&self, layer: &InMemoryLayer) { + assert!(matches!(layer.info(), InMemoryLayerInfo::Frozen { .. })); + let labels = self.make_frozen_layer_labels(layer); + let size = layer.try_len().expect("frozen layer should have no writer"); + TIMELINE_LAYER_COUNT + .get_metric_with_label_values(&labels) + .unwrap() + .inc(); + TIMELINE_LAYER_SIZE + .get_metric_with_label_values(&labels) + .unwrap() + .add(size); + } + + /// Removes a persistent layer from TIMELINE_LAYER metrics. + pub fn dec_layer(&self, layer_desc: &PersistentLayerDesc) { + let labels = self.make_layer_labels(layer_desc); + TIMELINE_LAYER_COUNT + .get_metric_with_label_values(&labels) + .unwrap() + .dec(); + TIMELINE_LAYER_SIZE + .get_metric_with_label_values(&labels) + .unwrap() + .sub(layer_desc.file_size); + } + + /// Adds a persistent layer to TIMELINE_LAYER metrics. + pub fn inc_layer(&self, layer_desc: &PersistentLayerDesc) { + let labels = self.make_layer_labels(layer_desc); + TIMELINE_LAYER_COUNT + .get_metric_with_label_values(&labels) + .unwrap() + .inc(); + TIMELINE_LAYER_SIZE + .get_metric_with_label_values(&labels) + .unwrap() + .add(layer_desc.file_size); + } + pub(crate) fn shutdown(&self) { let was_shutdown = self .shutdown @@ -2869,30 +2953,14 @@ impl TimelineMetrics { let _ = TIMELINE_ARCHIVE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = PITR_HISTORY_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); - let _ = TIMELINE_LAYER_SIZE.remove_label_values(&[ - tenant_id, - shard_id, - timeline_id, - MetricLayerKind::Image.into(), - ]); - let _ = TIMELINE_LAYER_COUNT.remove_label_values(&[ - tenant_id, - shard_id, - timeline_id, - MetricLayerKind::Image.into(), - ]); - let _ = TIMELINE_LAYER_SIZE.remove_label_values(&[ - tenant_id, - shard_id, - timeline_id, - MetricLayerKind::Delta.into(), - ]); - let _ = TIMELINE_LAYER_COUNT.remove_label_values(&[ - tenant_id, - shard_id, - timeline_id, - MetricLayerKind::Delta.into(), - ]); + for ref level in LayerLevel::iter() { + for ref kind in LayerKind::iter() { + let labels: [&str; 5] = + [tenant_id, shard_id, timeline_id, level.into(), kind.into()]; + let _ = TIMELINE_LAYER_SIZE.remove_label_values(&labels); + let _ = TIMELINE_LAYER_COUNT.remove_label_values(&labels); + } + } let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); @@ -2974,24 +3042,6 @@ pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) { // we leave the BROKEN_TENANTS_SET entry if any } -use futures::Future; -use pin_project_lite::pin_project; -use std::collections::HashMap; -use std::num::NonZeroUsize; -use std::pin::Pin; -use std::sync::atomic::AtomicU64; -use std::sync::{Arc, Mutex}; -use std::task::{Context, Poll}; -use std::time::{Duration, Instant}; - -use crate::config::PageServerConf; -use crate::context::{PageContentKind, RequestContext}; -use crate::task_mgr::TaskKind; -use crate::tenant::mgr::TenantSlot; -use 
crate::tenant::tasks::BackgroundLoopKind; -use crate::tenant::throttle::ThrottleResult; -use crate::tenant::Timeline; - /// Maintain a per timeline gauge in addition to the global gauge. pub(crate) struct PerTimelineRemotePhysicalSizeGauge { last_set: AtomicU64, diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 2a86885f6b..99e0ff1aa5 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -701,13 +701,7 @@ impl Drop for LayerInner { if let Some(timeline) = timeline.as_ref() { // Only need to decrement metrics if the timeline still exists: otherwise // it will have already de-registered these metrics via TimelineMetrics::shutdown - if self.desc.is_delta() { - timeline.metrics.layer_count_delta.dec(); - timeline.metrics.layer_size_delta.sub(self.desc.file_size); - } else { - timeline.metrics.layer_count_image.dec(); - timeline.metrics.layer_size_image.sub(self.desc.file_size); - } + timeline.metrics.dec_layer(&self.desc); if matches!(self.access_stats.visibility(), LayerVisibilityHint::Visible) { debug_assert!( @@ -817,13 +811,7 @@ impl LayerInner { }; // This object acts as a RAII guard on these metrics: increment on construction - if desc.is_delta() { - timeline.metrics.layer_count_delta.inc(); - timeline.metrics.layer_size_delta.add(desc.file_size); - } else { - timeline.metrics.layer_count_image.inc(); - timeline.metrics.layer_size_image.add(desc.file_size); - } + timeline.metrics.inc_layer(&desc); // New layers are visible by default. This metric is later updated on drop or in set_visibility timeline diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 24bc7890c6..b4b30fcd23 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3703,7 +3703,7 @@ impl Timeline { let mut guard = self.layers.write().await; guard .open_mut()? - .try_freeze_in_memory_layer(at, &self.last_freeze_at, write_lock) + .try_freeze_in_memory_layer(at, &self.last_freeze_at, write_lock, &self.metrics) .await }; diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index f1cef7778c..cb7783d779 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -91,6 +91,7 @@ impl LayerManager { layer_map, layer_fmgr: LayerFileManager(hashmap), }) => { + // NB: no need to decrement layer metrics; metrics are removed on timeline shutdown. let open = layer_map.open_layer.take(); let frozen = layer_map.frozen_layers.len(); let taken_writer_state = writer_state.take(); @@ -234,6 +235,7 @@ impl OpenLayerManager { lsn: Lsn, last_freeze_at: &AtomicLsn, write_lock: &mut tokio::sync::MutexGuard<'_, Option>, + metrics: &TimelineMetrics, ) -> bool { let Lsn(last_record_lsn) = lsn; let end_lsn = Lsn(last_record_lsn + 1); @@ -242,6 +244,11 @@ impl OpenLayerManager { let open_layer_rc = Arc::clone(open_layer); open_layer.freeze(end_lsn).await; + // Increment the frozen layer metrics. This is decremented in `finish_flush_l0_layer()`. + // TODO: It would be nicer to do this via `InMemoryLayer::drop()`, but it requires a + // reference to the timeline metrics. Other methods use a metrics borrow as well. + metrics.inc_frozen_layer(open_layer); + // The layer is no longer open, update the layer map to reflect this. // We will replace it with on-disk historics below. 
self.layer_map.frozen_layers.push_back(open_layer_rc); @@ -298,6 +305,7 @@ impl OpenLayerManager { .frozen_layers .pop_front() .expect("there must be a inmem layer to flush"); + metrics.dec_frozen_layer(&inmem); // Only one task may call this function at a time (for this // timeline). If two tasks tried to flush the same frozen From 9dff6cc2a4d8c2ddb645df6872b3ac1985944264 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Wed, 29 Jan 2025 16:32:50 -0500 Subject: [PATCH 302/583] fix(pageserver): skip repartition if we need L0 compaction (#10547) ## Problem Repartition is slow, but it's only used in image layer creation. We can skip it if we have a lot of L0 layers to ingest. ## Summary of changes If L0 compaction is not complete, do not repartition and do not create image layers. --------- Signed-off-by: Alex Chi Z --- pageserver/src/tenant/timeline/compaction.rs | 107 ++++++++++--------- 1 file changed, 57 insertions(+), 50 deletions(-) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 589aea18b4..5f0e6dc3ec 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -624,7 +624,13 @@ impl Timeline { // High level strategy for compaction / image creation: // - // 1. First, calculate the desired "partitioning" of the + // 1. First, do a L0 compaction to ensure we move the L0 + // layers into the historic layer map get flat levels of + // layers. If we did not compact all L0 layers, we will + // prioritize compacting the timeline again and not do + // any of the compactions below. + // + // 2. Then, calculate the desired "partitioning" of the // currently in-use key space. The goal is to partition the // key space into roughly fixed-size chunks, but also take into // account any existing image layers, and try to align the @@ -638,7 +644,7 @@ impl Timeline { // identify a relation. This is just an optimization, // though. // - // 2. Once we know the partitioning, for each partition, + // 3. Once we know the partitioning, for each partition, // decide if it's time to create a new image layer. The // criteria is: there has been too much "churn" since the last // image layer? The "churn" is fuzzy concept, it's a @@ -646,15 +652,8 @@ impl Timeline { // total in the delta file. Or perhaps: if creating an image // file would allow to delete some older files. // - // 3. After that, we compact all level0 delta files if there - // are too many of them. While compacting, we also garbage - // collect any page versions that are no longer needed because - // of the new image layers we created in step 2. - // - // TODO: This high level strategy hasn't been implemented yet. - // Below are functions compact_level0() and create_image_layers() - // but they are a bit ad hoc and don't quite work like it's explained - // above. Rewrite it. + // 4. In the end, if the tenant gets auto-sharded, we will run + // a shard-ancestor compaction. // Is the timeline being deleted? if self.is_stopping() { @@ -666,10 +665,32 @@ impl Timeline { // Define partitioning schema if needed - // FIXME: the match should only cover repartitioning, not the next steps - let (partition_count, has_pending_tasks) = match self + // 1. 
L0 Compact + let fully_compacted = { + let timer = self.metrics.compact_time_histo.start_timer(); + let fully_compacted = self + .compact_level0( + target_file_size, + options.flags.contains(CompactFlags::ForceL0Compaction), + ctx, + ) + .await?; + timer.stop_and_record(); + fully_compacted + }; + + if !fully_compacted { + // Yield and do not do any other kind of compaction. True means + // that we have pending L0 compaction tasks and the compaction scheduler + // will prioritize compacting this tenant/timeline again. + info!("skipping image layer generation and shard ancestor compaction due to L0 compaction did not include all layers."); + return Ok(true); + } + + // 2. Repartition and create image layers if necessary + let partition_count = match self .repartition( - self.get_last_record_lsn(), + self.get_last_record_lsn(), // TODO: use L0-L1 boundary self.get_compaction_target_size(), options.flags, ctx, @@ -682,46 +703,30 @@ impl Timeline { .access_stats_behavior(AccessStatsBehavior::Skip) .build(); - // 2. Compact - let timer = self.metrics.compact_time_histo.start_timer(); - let fully_compacted = self - .compact_level0( - target_file_size, - options.flags.contains(CompactFlags::ForceL0Compaction), - ctx, - ) - .await?; - timer.stop_and_record(); - let mut partitioning = dense_partitioning; partitioning .parts .extend(sparse_partitioning.into_dense().parts); - // 3. Create new image layers for partitions that have been modified - // "enough". Skip image layer creation if L0 compaction cannot keep up. - if fully_compacted { - let image_layers = self - .create_image_layers( - &partitioning, - lsn, - if options - .flags - .contains(CompactFlags::ForceImageLayerCreation) - { - ImageLayerCreationMode::Force - } else { - ImageLayerCreationMode::Try - }, - &image_ctx, - ) - .await?; + // 3. Create new image layers for partitions that have been modified "enough". + let image_layers = self + .create_image_layers( + &partitioning, + lsn, + if options + .flags + .contains(CompactFlags::ForceImageLayerCreation) + { + ImageLayerCreationMode::Force + } else { + ImageLayerCreationMode::Try + }, + &image_ctx, + ) + .await?; - self.upload_new_image_layers(image_layers)?; - } else { - info!("skipping image layer generation due to L0 compaction did not include all layers."); - } - (partitioning.parts.len(), !fully_compacted) + self.upload_new_image_layers(image_layers)?; + partitioning.parts.len() } Err(err) => { // no partitioning? This is normal, if the timeline was just created @@ -733,10 +738,12 @@ impl Timeline { if !self.cancel.is_cancelled() && !err.is_cancelled() { tracing::error!("could not compact, repartitioning keyspace failed: {err:?}"); } - (1, false) + 1 } }; + // 4. Shard ancestor compaction + if self.shard_identity.count >= ShardCount::new(2) { // Limit the number of layer rewrites to the number of partitions: this means its // runtime should be comparable to a full round of image layer creations, rather than @@ -746,7 +753,7 @@ impl Timeline { self.compact_shard_ancestors(rewrite_max, ctx).await?; } - Ok(has_pending_tasks) + Ok(false) } /// Check for layers that are elegible to be rewritten: From 77ea9b16fe5ba35dea47358a36bdb22bf51b66ec Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Wed, 29 Jan 2025 19:05:40 -0500 Subject: [PATCH 303/583] fix(pageserver): use the larger one of upper limit and threshold (#10571) ## Problem Follow up of https://github.com/neondatabase/neon/pull/10550 in case the upper limit is set larger than threshold. 
It does not make sense for someone to enforce the behavior like "if there are >= 50 L0s, only compact 10 of them". ## Summary of changes Use the maximum of compaction threshold and upper limit when selecting L0 files to compact. Signed-off-by: Alex Chi Z --- pageserver/src/tenant/timeline/compaction.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 5f0e6dc3ec..5f7b5f1af5 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -1122,7 +1122,13 @@ impl Timeline { // Under normal circumstances, we will accumulate up to compaction_upper_limit L0s of size // checkpoint_distance each. To avoid edge cases using extra system resources, bound our // work in this function to only operate on this much delta data at once. - let delta_size_limit = self.get_compaction_upper_limit() as u64 + // + // In general, compaction_threshold should be <= compaction_upper_limit, but in case that + // the constraint is not respected, we use the larger of the two. + let delta_size_limit = std::cmp::max( + self.get_compaction_upper_limit(), + self.get_compaction_threshold(), + ) as u64 * std::cmp::max(self.get_checkpoint_distance(), DEFAULT_CHECKPOINT_DISTANCE); let mut fully_compacted = true; From a7a706cff7c9e11ac93d6ed487988c3dd0b74569 Mon Sep 17 00:00:00 2001 From: Alexander Lakhin Date: Thu, 30 Jan 2025 11:09:43 +0200 Subject: [PATCH 304/583] Fix submodule reference after #10473 (#10577) --- vendor/postgres-v17 | 2 +- vendor/revisions.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index b654fa88b6..f0ffc8279d 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit b654fa88b6fd2ad24a03a14a7cd417ec66e518f9 +Subproject commit f0ffc8279dbcbbc439981a4fd001a9687e5d665d diff --git a/vendor/revisions.json b/vendor/revisions.json index 982f537692..c3eaeac927 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,7 +1,7 @@ { "v17": [ "17.2", - "b654fa88b6fd2ad24a03a14a7cd417ec66e518f9" + "f0ffc8279dbcbbc439981a4fd001a9687e5d665d" ], "v16": [ "16.6", From b24727134c5eebd9965297c1a65618cfda5c4a52 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 30 Jan 2025 10:27:40 +0100 Subject: [PATCH 305/583] pageserver: improve read amp metric (#10573) ## Problem The current global `pageserver_layers_visited_per_vectored_read_global` metric does not appear to accurately measure read amplification. It divides the layer count by the number of reads in a batch, but this means that e.g. 10 reads with 100 L0 layers will only measure a read amp of 10 per read, while the actual read amp was 100. While the cost of layer visits are amortized across the batch, and some layers may not intersect with a given key, each visited layer contributes directly to the observed latency for every read in the batch, which is what we care about. Touches https://github.com/neondatabase/cloud/issues/23283. Extracted from #10566. ## Summary of changes * Count the number of layers visited towards each read in the batch, instead of the average across the batch. * Rename `pageserver_layers_visited_per_vectored_read_global` to `pageserver_layers_per_read_global`. * Reduce the read amp log warning threshold down from 512 to 100. 
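As a concrete illustration of the semantic change, here is a minimal standalone sketch (not the pageserver code; the plain values stand in for `Histogram::observe()` calls) contrasting the old per-batch average with the new per-read observations:

```rust
fn main() {
    // A batch of 10 reads that had to visit 100 layers (e.g. a deep L0 stack).
    let layers_visited = 100_u64;
    let reads_in_batch = 10_usize;

    // Old behavior: one sample per batch, averaged across the reads in it.
    // 10 reads visiting 100 layers recorded a read amp of only 10.
    let old_sample = layers_visited as f64 / reads_in_batch as f64;

    // New behavior: every read in the batch observes the full layer count,
    // since each visited layer adds latency to every read in the batch.
    let new_samples: Vec<f64> = (0..reads_in_batch)
        .map(|_| layers_visited as f64)
        .collect();

    println!("old averaged sample: {old_sample}");      // 10
    println!("new per-read samples: {new_samples:?}");  // ten samples of 100
}
```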
--- pageserver/src/metrics.rs | 16 ++++++++----- pageserver/src/tenant/timeline.rs | 31 ++++++++++++++------------ test_runner/regress/test_compaction.py | 10 ++++----- 3 files changed, 33 insertions(+), 24 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 77c0967afc..30f3a49a9d 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -116,11 +116,17 @@ pub(crate) static STORAGE_TIME_GLOBAL: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub(crate) static VEC_READ_NUM_LAYERS_VISITED: Lazy = Lazy::new(|| { +/// Measures layers visited per read (i.e. read amplification). +/// +/// NB: for a batch, we count all visited layers towards each read. While the cost of layer visits +/// are amortized across the batch, and some layers may not intersect with a given key, each visited +/// layer contributes directly to the observed latency for every read in the batch, which is what we +/// care about. +pub(crate) static LAYERS_PER_READ_GLOBAL: Lazy = Lazy::new(|| { register_histogram!( - "pageserver_layers_visited_per_vectored_read_global", - "Average number of layers visited to reconstruct one key", - vec![1.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0], + "pageserver_layers_per_read_global", + "Layers visited to serve a single read (read amplification). In a batch, all visited layers count towards every read.", + vec![1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0], ) .expect("failed to define a metric") }); @@ -3912,7 +3918,7 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) { // histograms [ - &VEC_READ_NUM_LAYERS_VISITED, + &LAYERS_PER_READ_GLOBAL, &WAIT_LSN_TIME, &WAL_REDO_TIME, &WAL_REDO_RECORDS_HISTOGRAM, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index b4b30fcd23..71f0b4c9bf 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -51,6 +51,7 @@ use tokio::{ }; use tokio_util::sync::CancellationToken; use tracing::*; +use utils::rate_limit::RateLimit; use utils::{ fs_ext, guard_arc_swap::GuardArcSwap, @@ -115,7 +116,7 @@ use pageserver_api::config::tenant_conf_defaults::DEFAULT_PITR_INTERVAL; use crate::config::PageServerConf; use crate::keyspace::{KeyPartitioning, KeySpace}; -use crate::metrics::TimelineMetrics; +use crate::metrics::{TimelineMetrics, LAYERS_PER_READ_GLOBAL}; use crate::pgdatadir_mapping::CalculateLogicalSizeError; use crate::tenant::config::TenantConfOpt; use pageserver_api::reltag::RelTag; @@ -1044,7 +1045,7 @@ impl Timeline { } pub(crate) const MAX_GET_VECTORED_KEYS: u64 = 32; - pub(crate) const VEC_GET_LAYERS_VISITED_WARN_THRESH: f64 = 512.0; + pub(crate) const LAYERS_VISITED_WARN_THRESHOLD: u32 = 100; /// Look up multiple page versions at a given LSN /// @@ -1221,25 +1222,27 @@ impl Timeline { // (this is a requirement, not a bug). Skip updating the metric in these cases // to avoid infinite results. if !results.is_empty() { - let avg = layers_visited as f64 / results.len() as f64; - if avg >= Self::VEC_GET_LAYERS_VISITED_WARN_THRESH { - use utils::rate_limit::RateLimit; - static LOGGED: Lazy> = + // Record the total number of layers visited towards each key in the batch. While some + // layers may not intersect with a given read, and the cost of layer visits are + // amortized across the batch, each visited layer contributes directly to the observed + // latency for every read in the batch, which is what we care about. 
+ if layers_visited >= Self::LAYERS_VISITED_WARN_THRESHOLD { + static LOG_PACER: Lazy> = Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(60)))); - let mut rate_limit = LOGGED.lock().unwrap(); - rate_limit.call(|| { + LOG_PACER.lock().unwrap().call(|| { + let num_keys = keyspace.total_raw_size(); + let num_pages = results.len(); tracing::info!( shard_id = %self.tenant_shard_id.shard_slug(), lsn = %lsn, - "Vectored read for {} visited {} layers on average per key and {} in total. {}/{} pages were returned", - keyspace, avg, layers_visited, results.len(), keyspace.total_raw_size()); + "Vectored read for {keyspace} visited {layers_visited} layers. Returned {num_pages}/{num_keys} pages.", + ); }); } - // Note that this is an approximation. Tracking the exact number of layers visited - // per key requires virtually unbounded memory usage and is inefficient - // (i.e. segment tree tracking each range queried from a layer) - crate::metrics::VEC_READ_NUM_LAYERS_VISITED.observe(avg); + for _ in &results { + LAYERS_PER_READ_GLOBAL.observe(layers_visited as f64); + } } Ok(results) diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index 2edfc884ad..763a63c2e5 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -86,9 +86,9 @@ page_cache_size=10 log.info("Checking layer access metrics ...") layer_access_metric_names = [ - "pageserver_layers_visited_per_vectored_read_global_sum", - "pageserver_layers_visited_per_vectored_read_global_count", - "pageserver_layers_visited_per_vectored_read_global_bucket", + "pageserver_layers_per_read_global_sum", + "pageserver_layers_per_read_global_count", + "pageserver_layers_per_read_global_bucket", ] metrics = env.pageserver.http_client().get_metrics() @@ -96,8 +96,8 @@ page_cache_size=10 layer_access_metrics = metrics.query_all(name) log.info(f"Got metrics: {layer_access_metrics}") - vectored_sum = metrics.query_one("pageserver_layers_visited_per_vectored_read_global_sum") - vectored_count = metrics.query_one("pageserver_layers_visited_per_vectored_read_global_count") + vectored_sum = metrics.query_one("pageserver_layers_per_read_global_sum") + vectored_count = metrics.query_one("pageserver_layers_per_read_global_count") if vectored_count.value > 0: assert vectored_sum.value > 0 vectored_average = vectored_sum.value / vectored_count.value From d3db96c2112381e7a0601719e2e7d55d11d946ed Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 30 Jan 2025 11:55:07 +0100 Subject: [PATCH 306/583] pageserver: add `pageserver_deltas_per_read_global` metric (#10570) ## Problem We suspect that Postgres checkpoints will limit the number of page deltas necessary to reconstruct a page, but don't know for certain. Touches https://github.com/neondatabase/cloud/issues/23283. ## Summary of changes Add `pageserver_deltas_per_read_global` metric. This pairs with `pageserver_layers_per_read_global` from #10573. 
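To spell out what one "delta per read" means here, a small self-contained sketch of the counting rule (it mirrors the `num_deltas()` helper added in this patch, but is otherwise not pageserver code):

```rust
// Count the WAL records that must be replayed to materialize one page.
fn deltas_applied(base_image: Option<&str>, records: &[&str]) -> usize {
    match base_image {
        // With a base image, every record on top of it is a delta.
        Some(_) => records.len(),
        // Without an image, the oldest record must be a will_init record that
        // creates the page, so it is not counted as a delta.
        None => records.len().saturating_sub(1),
    }
}

fn main() {
    assert_eq!(deltas_applied(Some("img@0/10"), &["rec@0/30", "rec@0/20"]), 2);
    assert_eq!(deltas_applied(None, &["rec@0/30", "will_init@0/20"]), 1);
    println!("ok");
}
```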
--- pageserver/src/metrics.rs | 11 +++++++++++ pageserver/src/tenant/storage_layer.rs | 10 ++++++++++ pageserver/src/tenant/timeline.rs | 3 ++- 3 files changed, 23 insertions(+), 1 deletion(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 30f3a49a9d..bf75ea3bc6 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -131,6 +131,16 @@ pub(crate) static LAYERS_PER_READ_GLOBAL: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +pub(crate) static DELTAS_PER_READ_GLOBAL: Lazy = Lazy::new(|| { + // We expect this to be low because of Postgres checkpoints. Let's see if that holds. + register_histogram!( + "pageserver_deltas_per_read_global", + "Number of delta pages applied to image page per read", + vec![0.0, 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0], + ) + .expect("failed to define a metric") +}); + pub(crate) static CONCURRENT_INITDBS: Lazy = Lazy::new(|| { register_uint_gauge!( "pageserver_concurrent_initdb", @@ -3919,6 +3929,7 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) { // histograms [ &LAYERS_PER_READ_GLOBAL, + &DELTAS_PER_READ_GLOBAL, &WAIT_LSN_TIME, &WAL_REDO_TIME, &WAL_REDO_RECORDS_HISTOGRAM, diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index c1fe67c87c..3800852ccc 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -80,6 +80,16 @@ pub(crate) struct ValueReconstructState { pub(crate) img: Option<(Lsn, Bytes)>, } +impl ValueReconstructState { + /// Returns the number of page deltas applied to the page image. + pub fn num_deltas(&self) -> usize { + match self.img { + Some(_) => self.records.len(), + None => self.records.len() - 1, // omit will_init record + } + } +} + #[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] pub(crate) enum ValueReconstructSituation { Complete, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 71f0b4c9bf..4a69376239 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -116,7 +116,7 @@ use pageserver_api::config::tenant_conf_defaults::DEFAULT_PITR_INTERVAL; use crate::config::PageServerConf; use crate::keyspace::{KeyPartitioning, KeySpace}; -use crate::metrics::{TimelineMetrics, LAYERS_PER_READ_GLOBAL}; +use crate::metrics::{TimelineMetrics, DELTAS_PER_READ_GLOBAL, LAYERS_PER_READ_GLOBAL}; use crate::pgdatadir_mapping::CalculateLogicalSizeError; use crate::tenant::config::TenantConfOpt; use pageserver_api::reltag::RelTag; @@ -1195,6 +1195,7 @@ impl Timeline { return (key, Err(err)); } }; + DELTAS_PER_READ_GLOBAL.observe(converted.num_deltas() as f64); // The walredo module expects the records to be descending in terms of Lsn. // And we submit the IOs in that order, so, there shuold be no need to sort here. 
From 8804d5894336762d077147e2dac9b780653f2a8c Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 30 Jan 2025 11:18:07 +0000 Subject: [PATCH 307/583] Nightly Benchmarks: use pgbench from artifacts (#10370) We don't use statically linked OpenSSL anymore (#10302), it's ok to switch to Neon's pgbench for pgvector benchmarks --- .github/workflows/benchmarking.yml | 55 +++++++++--------------------- 1 file changed, 16 insertions(+), 39 deletions(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 32747d825c..9446b4d17b 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -319,7 +319,7 @@ jobs: { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb","runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned-bookworm" }, { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_default"', "image": "'"$image_default"'" }, - { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, + { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new-many-tables","db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }] }' @@ -458,7 +458,7 @@ jobs: echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - # we want to compare Neon project OLTP throughput and latency at scale factor 10 GB + # we want to compare Neon project OLTP throughput and latency at scale factor 10 GB # without (neonvm-captest-new) # and with (neonvm-captest-new-many-tables) many relations in the database - name: Create many relations before the run @@ -590,36 +590,20 @@ jobs: steps: - uses: actions/checkout@v4 - # until https://github.com/neondatabase/neon/issues/8275 is fixed we temporarily install postgresql-16 - # instead of using Neon artifacts containing pgbench - - name: Install postgresql-16 where pytest expects it - run: | - # Just to make it easier to test things locally on macOS (with arm64) - arch=$(uname -m | sed 's/x86_64/amd64/g' | sed 's/aarch64/arm64/g') + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 18000 # 5 hours - cd /home/nonroot - wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-17/libpq5_17.2-1.pgdg120+1_${arch}.deb" - wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.6-1.pgdg120+1_${arch}.deb" - wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.6-1.pgdg120+1_${arch}.deb" - dpkg -x libpq5_17.2-1.pgdg120+1_${arch}.deb pg - dpkg -x postgresql-16_16.6-1.pgdg120+1_${arch}.deb pg - dpkg -x postgresql-client-16_16.6-1.pgdg120+1_${arch}.deb pg - - 
mkdir -p /tmp/neon/pg_install/v16/bin - mkdir -p /tmp/neon/pg_install/v17/bin - ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench - ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/psql /tmp/neon/pg_install/v16/bin/psql - ln -s /home/nonroot/pg/usr/lib/$(uname -m)-linux-gnu /tmp/neon/pg_install/v16/lib - ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v17/bin/pgbench - ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/psql /tmp/neon/pg_install/v17/bin/psql - ln -s /home/nonroot/pg/usr/lib/$(uname -m)-linux-gnu /tmp/neon/pg_install/v17/lib - - LD_LIBRARY_PATH="/home/nonroot/pg/usr/lib/$(uname -m)-linux-gnu:${LD_LIBRARY_PATH:-}" - export LD_LIBRARY_PATH - echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" >> ${GITHUB_ENV} - - /tmp/neon/pg_install/v16/bin/pgbench --version - /tmp/neon/pg_install/v16/bin/psql --version + - name: Download Neon artifact + uses: ./.github/actions/download + with: + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact + path: /tmp/neon/ + prefix: latest + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Set up Connection String id: set-up-connstr @@ -642,13 +626,6 @@ jobs: echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@v4 - with: - aws-region: eu-central-1 - role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - role-duration-seconds: 18000 # 5 hours - - name: Benchmark pgvector hnsw indexing uses: ./.github/actions/run-python-test-set with: @@ -906,7 +883,7 @@ jobs: exit 1 ;; esac - + echo "CONNSTR_SECRET_NAME=${CONNSTR_SECRET_NAME}" >> $GITHUB_ENV - name: Set up Connection String From 6a2afa0c0216c183e51d68d2730ed67862247de2 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 30 Jan 2025 12:24:49 +0100 Subject: [PATCH 308/583] pageserver: add per-timeline read amp histogram (#10566) ## Problem We don't have per-timeline observability for read amplification. Touches https://github.com/neondatabase/cloud/issues/23283. ## Summary of changes Add a per-timeline `pageserver_layers_per_read` histogram. NB: per-timeline histograms are expensive, but probably worth it in this case. --- pageserver/src/metrics.rs | 19 +++++++++++++++++++ pageserver/src/tenant/timeline.rs | 1 + test_runner/fixtures/metrics.py | 3 +++ 3 files changed, 23 insertions(+) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index bf75ea3bc6..f9edf88553 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -122,6 +122,17 @@ pub(crate) static STORAGE_TIME_GLOBAL: Lazy = Lazy::new(|| { /// are amortized across the batch, and some layers may not intersect with a given key, each visited /// layer contributes directly to the observed latency for every read in the batch, which is what we /// care about. +pub(crate) static LAYERS_PER_READ: Lazy = Lazy::new(|| { + register_histogram_vec!( + "pageserver_layers_per_read", + "Layers visited to serve a single read (read amplification). In a batch, all visited layers count towards every read.", + &["tenant_id", "shard_id", "timeline_id"], + // Low resolution to reduce cardinality. 
+ vec![1.0, 5.0, 10.0, 25.0, 50.0, 100.0], + ) + .expect("failed to define a metric") +}); + pub(crate) static LAYERS_PER_READ_GLOBAL: Lazy = Lazy::new(|| { register_histogram!( "pageserver_layers_per_read_global", @@ -2648,6 +2659,7 @@ pub(crate) struct TimelineMetrics { pub disk_consistent_lsn_gauge: IntGauge, pub pitr_history_size: UIntGauge, pub archival_size: UIntGauge, + pub layers_per_read: Histogram, pub standby_horizon_gauge: IntGauge, pub resident_physical_size_gauge: UIntGauge, pub visible_physical_size_gauge: UIntGauge, @@ -2745,6 +2757,10 @@ impl TimelineMetrics { .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); + let layers_per_read = LAYERS_PER_READ + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) + .unwrap(); + let standby_horizon_gauge = STANDBY_HORIZON .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); @@ -2809,6 +2825,7 @@ impl TimelineMetrics { disk_consistent_lsn_gauge, pitr_history_size, archival_size, + layers_per_read, standby_horizon_gauge, resident_physical_size_gauge, visible_physical_size_gauge, @@ -2978,6 +2995,8 @@ impl TimelineMetrics { } } + let _ = LAYERS_PER_READ.remove_label_values(&[tenant_id, shard_id, timeline_id]); + let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = VALID_LSN_LEASE_COUNT.remove_label_values(&[tenant_id, shard_id, timeline_id]); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 4a69376239..ab4b3cac63 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1242,6 +1242,7 @@ impl Timeline { } for _ in &results { + self.metrics.layers_per_read.observe(layers_visited as f64); LAYERS_PER_READ_GLOBAL.observe(layers_visited as f64); } } diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index fd7e193778..83a1a87611 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -158,6 +158,9 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = ( "pageserver_pitr_history_size", "pageserver_layer_bytes", "pageserver_layer_count", + "pageserver_layers_per_read_bucket", + "pageserver_layers_per_read_count", + "pageserver_layers_per_read_sum", "pageserver_visible_physical_size", "pageserver_storage_operations_seconds_count_total", "pageserver_storage_operations_seconds_sum_total", From ab627ad9fd355e6a84a7403accee307284f8e017 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 30 Jan 2025 11:54:02 +0000 Subject: [PATCH 309/583] storcon_cli: fix spurious error setting preferred AZ (#10568) ## Problem The client code for `tenant-set-preferred-az` declared response type `()`, so printed a spurious error on each use: ``` Error: receive body: error decoding response body: invalid type: map, expected unit at line 1 column 0 ``` The requests were successful anyway. ## Summary of changes - Declare the proper return type, so that the command succeeds quietly. 
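The failure mode is easy to reproduce in isolation. A minimal sketch, assuming `serde_json` is available as a dependency; the JSON body below is made up and not the actual `ShardsPreferredAzsResponse` shape:

```rust
fn main() {
    // The endpoint replies with a JSON object (a map).
    let body = r#"{"updated": []}"#;

    // Declaring the response type as `()` makes serde expect JSON `null`,
    // so decoding fails even though the HTTP request itself succeeded:
    let as_unit: Result<(), serde_json::Error> = serde_json::from_str(body);
    println!("{:?}", as_unit.err());
    // Some(Error("invalid type: map, expected unit", ...))

    // Decoding into a type that matches the body succeeds quietly.
    let as_value: serde_json::Value = serde_json::from_str(body).unwrap();
    println!("{}", as_value);
}
```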
--- control_plane/storcon_cli/src/main.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index d9b76b9600..985fe6b3b1 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -10,8 +10,8 @@ use pageserver_api::{ controller_api::{ AvailabilityZone, NodeAvailabilityWrapper, NodeDescribeResponse, NodeShardResponse, SafekeeperDescribeResponse, SafekeeperSchedulingPolicyRequest, ShardSchedulingPolicy, - ShardsPreferredAzsRequest, SkSchedulingPolicy, TenantCreateRequest, TenantDescribeResponse, - TenantPolicyRequest, + ShardsPreferredAzsRequest, ShardsPreferredAzsResponse, SkSchedulingPolicy, + TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest, }, models::{ EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary, @@ -800,7 +800,7 @@ async fn main() -> anyhow::Result<()> { .collect(), }; storcon_client - .dispatch::( + .dispatch::( Method::PUT, "control/v1/preferred_azs".to_string(), Some(req), From 93714c4c7bc472a80eb0e13c6e33ea3bf19389d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 30 Jan 2025 13:03:36 +0100 Subject: [PATCH 310/583] secondary downloader: load metadata on loading of timeline (#10539) Related to #10308, we might have legitimate changes in file size or generation. Those changes should not cause warn log lines. In order to detect changes of the generation number while the file size stayed the same, load the metadata that we store on disk on loading of the timeline. Still do a comparison with the on-disk layer sizes to find any discrepancies that might occur due to race conditions (new metadata file gets written but layer file has not been updated yet, and PS shuts down). However, as it's possible to hit it in a race conditon, downgrade it to a warning. Also fix a mistake in #10529: we want to compare the old with the new metadata, not the old metadata with itself. --- pageserver/src/tenant/secondary/downloader.rs | 101 ++++++++++++++---- pageserver/src/virtual_file.rs | 53 +++++---- 2 files changed, 114 insertions(+), 40 deletions(-) diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index cf524fcb25..2e8c3946bd 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -673,12 +673,30 @@ impl<'a> TenantDownloader<'a> { HeatMapDownload::Modified(m) => m, }; - let heatmap = serde_json::from_slice::(&heatmap_bytes)?; - - // Save the heatmap: this will be useful on restart, allowing us to reconstruct - // layer metadata without having to re-download it. 
+ // Heatmap storage location let heatmap_path = self.conf.tenant_heatmap_path(tenant_shard_id); + let last_heatmap = if last_download.is_none() { + match load_heatmap(&heatmap_path, ctx).await { + Ok(htm) => htm, + Err(e) => { + tracing::warn!("Couldn't load heatmap from {heatmap_path}: {e:?}"); + None + } + } + } else { + None + }; + + let last_heatmap_timelines = last_heatmap.as_ref().map(|htm| { + htm.timelines + .iter() + .map(|tl| (tl.timeline_id, tl)) + .collect::>() + }); + + let heatmap = serde_json::from_slice::(&heatmap_bytes)?; + let temp_path = path_with_suffix_extension(&heatmap_path, TEMP_FILE_SUFFIX); let context_msg = format!("write tenant {tenant_shard_id} heatmap to {heatmap_path}"); let heatmap_path_bg = heatmap_path.clone(); @@ -707,10 +725,17 @@ impl<'a> TenantDownloader<'a> { let timeline_state = match timeline_state { Some(t) => t, None => { + let last_heatmap = + last_heatmap_timelines + .as_ref() + .and_then(|last_heatmap_timelines| { + last_heatmap_timelines.get(&timeline.timeline_id).copied() + }); // We have no existing state: need to scan local disk for layers first. let timeline_state = init_timeline_state( self.conf, tenant_shard_id, + last_heatmap, timeline, &self.secondary_state.resident_size_metric, ) @@ -1079,12 +1104,12 @@ impl<'a> TenantDownloader<'a> { } } - if on_disk.metadata.generation_file_size() != on_disk.metadata.generation_file_size() { + if on_disk.metadata.generation_file_size() != layer.metadata.generation_file_size() { tracing::info!( "Re-downloading layer {} with changed size or generation: {:?}->{:?}", layer.name, on_disk.metadata.generation_file_size(), - on_disk.metadata.generation_file_size() + layer.metadata.generation_file_size() ); return LayerAction::Download; } @@ -1277,6 +1302,7 @@ impl<'a> TenantDownloader<'a> { async fn init_timeline_state( conf: &'static PageServerConf, tenant_shard_id: &TenantShardId, + last_heatmap: Option<&HeatMapTimeline>, heatmap: &HeatMapTimeline, resident_metric: &UIntGauge, ) -> SecondaryDetailTimeline { @@ -1306,6 +1332,13 @@ async fn init_timeline_state( let heatmap_metadata: HashMap<&LayerName, &HeatMapLayer> = heatmap.layers.iter().map(|l| (&l.name, l)).collect(); + let last_heatmap_metadata: HashMap<&LayerName, &HeatMapLayer> = + if let Some(last_heatmap) = last_heatmap { + last_heatmap.layers.iter().map(|l| (&l.name, l)).collect() + } else { + HashMap::new() + }; + while let Some(dentry) = dir .next_entry() .await @@ -1339,18 +1372,32 @@ async fn init_timeline_state( match LayerName::from_str(file_name) { Ok(name) => { let remote_meta = heatmap_metadata.get(&name); + let last_meta = last_heatmap_metadata.get(&name); + let mut remove = false; match remote_meta { Some(remote_meta) => { + let last_meta_generation_file_size = last_meta + .map(|m| m.metadata.generation_file_size()) + .unwrap_or(remote_meta.metadata.generation_file_size()); // TODO: checksums for layers (https://github.com/neondatabase/neon/issues/2784) - if local_meta.len() != remote_meta.metadata.file_size { - // This should not happen, because we do crashsafe write-then-rename when downloading - // layers, and layers in remote storage are immutable. Remove the local file because - // we cannot trust it. 
- tracing::warn!( + if remote_meta.metadata.generation_file_size() + != last_meta_generation_file_size + { + tracing::info!( + "Removing local layer {name} as on-disk json metadata has different generation or file size from remote: {:?} -> {:?}", + last_meta_generation_file_size, + remote_meta.metadata.generation_file_size() + ); + remove = true; + } else if local_meta.len() != remote_meta.metadata.file_size { + // This can happen in the presence of race conditions: the remote and on-disk metadata have changed, but we haven't had + // the chance yet to download the new layer to disk, before the process restarted. + tracing::info!( "Removing local layer {name} with unexpected local size {} != {}", local_meta.len(), remote_meta.metadata.file_size ); + remove = true; } else { // We expect the access time to be initialized immediately afterwards, when // the latest heatmap is applied to the state. @@ -1372,15 +1419,18 @@ async fn init_timeline_state( "Removing secondary local layer {} because it's absent in heatmap", name ); - tokio::fs::remove_file(&dentry.path()) - .await - .or_else(fs_ext::ignore_not_found) - .fatal_err(&format!( - "Removing layer {}", - dentry.path().to_string_lossy() - )); + remove = true; } } + if remove { + tokio::fs::remove_file(&dentry.path()) + .await + .or_else(fs_ext::ignore_not_found) + .fatal_err(&format!( + "Removing layer {}", + dentry.path().to_string_lossy() + )); + } } Err(_) => { // Ignore it. @@ -1391,3 +1441,18 @@ async fn init_timeline_state( detail } + +/// Loads a json-encoded heatmap file from the provided on-disk path +async fn load_heatmap( + path: &Utf8PathBuf, + ctx: &RequestContext, +) -> Result, anyhow::Error> { + let mut file = match VirtualFile::open(path, ctx).await { + Ok(file) => file, + Err(e) if e.kind() == std::io::ErrorKind::NotFound => return Ok(None), + Err(e) => Err(e)?, + }; + let st = file.read_to_string(ctx).await?; + let htm = serde_json::from_str(&st)?; + Ok(Some(htm)) +} diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 8a7f4a4bf5..9d539198c7 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -234,6 +234,19 @@ impl VirtualFile { ) -> (FullSlice, Result) { self.inner.write_all(buf, ctx).await } + + async fn read_to_end(&mut self, buf: &mut Vec, ctx: &RequestContext) -> Result<(), Error> { + self.inner.read_to_end(buf, ctx).await + } + + pub(crate) async fn read_to_string( + &mut self, + ctx: &RequestContext, + ) -> Result { + let mut buf = Vec::new(); + self.read_to_end(&mut buf, ctx).await?; + Ok(String::from_utf8(buf)?) 
+ } } /// Indicates whether to enable fsync, fdatasync, or O_SYNC/O_DSYNC when writing @@ -993,6 +1006,24 @@ impl VirtualFileInner { (buf, result) }) } + + async fn read_to_end(&mut self, buf: &mut Vec, ctx: &RequestContext) -> Result<(), Error> { + let mut tmp = vec![0; 128]; + loop { + let slice = tmp.slice(..128); + let (slice, res) = self.read_at(slice, self.pos, ctx).await; + match res { + Ok(0) => return Ok(()), + Ok(n) => { + self.pos += n as u64; + buf.extend_from_slice(&slice[..n]); + } + Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {} + Err(e) => return Err(e), + } + tmp = slice.into_inner(); + } + } } // Adapted from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#117-135 @@ -1237,10 +1268,6 @@ impl VirtualFile { ) -> Result, std::io::Error> { self.inner.read_blk(blknum, ctx).await } - - async fn read_to_end(&mut self, buf: &mut Vec, ctx: &RequestContext) -> Result<(), Error> { - self.inner.read_to_end(buf, ctx).await - } } #[cfg(test)] @@ -1260,24 +1287,6 @@ impl VirtualFileInner { slice.into_inner(), )) } - - async fn read_to_end(&mut self, buf: &mut Vec, ctx: &RequestContext) -> Result<(), Error> { - let mut tmp = vec![0; 128]; - loop { - let slice = tmp.slice(..128); - let (slice, res) = self.read_at(slice, self.pos, ctx).await; - match res { - Ok(0) => return Ok(()), - Ok(n) => { - self.pos += n as u64; - buf.extend_from_slice(&slice[..n]); - } - Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {} - Err(e) => return Err(e), - } - tmp = slice.into_inner(); - } - } } impl Drop for VirtualFileInner { From be51b10da73cf0e7c3109e8b6b2f5f3ad9f219b9 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Thu, 30 Jan 2025 15:31:49 +0100 Subject: [PATCH 311/583] chore(compute): Print some compute_ctl errors in debug mode (#10586) ## Problem In some cases, we were returning a very shallow error like `error sending request for url (XXX)`, which made it very hard to figure out the actual error. ## Summary of changes Use `{:?}` in a few places, and remove it from places where we were printing a string anyway. 
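For context, a self-contained sketch (not compute_ctl code; the wrapped error type and URL are invented for illustration) of why `{:?}` is more informative for layered errors such as `reqwest`'s: `Display` usually shows only the outermost message, while `Debug` also surfaces the underlying cause:

```rust
use std::error::Error;
use std::fmt;

// Stand-in for an error type that wraps a lower-level cause.
#[derive(Debug)]
struct RequestError {
    url: String,
    source: std::io::Error,
}

impl fmt::Display for RequestError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Shallow message, like "error sending request for url (...)".
        write!(f, "error sending request for url ({})", self.url)
    }
}

impl Error for RequestError {
    fn source(&self) -> Option<&(dyn Error + 'static)> {
        Some(&self.source)
    }
}

fn main() {
    let err = RequestError {
        url: "http://example.invalid/ext.tar.zst".into(), // illustrative URL
        source: std::io::Error::new(std::io::ErrorKind::ConnectionRefused, "connection refused"),
    };
    println!("with {{}}:   {}", err);   // only the shallow message
    println!("with {{:?}}: {:?}", err); // includes the ConnectionRefused cause
}
```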
--- compute_tools/src/extension_server.rs | 14 +++++++------- compute_tools/src/migration.rs | 2 +- compute_tools/src/spec.rs | 4 ++-- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/compute_tools/src/extension_server.rs b/compute_tools/src/extension_server.rs index 64c338f4d7..00f46386e7 100644 --- a/compute_tools/src/extension_server.rs +++ b/compute_tools/src/extension_server.rs @@ -258,14 +258,11 @@ pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) { async fn download_extension_tar(ext_remote_storage: &str, ext_path: &str) -> Result { let uri = format!("{}/{}", ext_remote_storage, ext_path); - info!("Download extension {:?} from uri {:?}", ext_path, uri); + info!("Download extension {} from uri {}", ext_path, uri); match do_extension_server_request(&uri).await { Ok(resp) => { - info!( - "Successfully downloaded remote extension data {:?}", - ext_path - ); + info!("Successfully downloaded remote extension data {}", ext_path); REMOTE_EXT_REQUESTS_TOTAL .with_label_values(&[&StatusCode::OK.to_string()]) .inc(); @@ -285,7 +282,10 @@ async fn download_extension_tar(ext_remote_storage: &str, ext_path: &str) -> Res async fn do_extension_server_request(uri: &str) -> Result { let resp = reqwest::get(uri).await.map_err(|e| { ( - format!("could not perform remote extensions server request: {}", e), + format!( + "could not perform remote extensions server request: {:?}", + e + ), UNKNOWN_HTTP_STATUS.to_string(), ) })?; @@ -295,7 +295,7 @@ async fn do_extension_server_request(uri: &str) -> Result match resp.bytes().await { Ok(resp) => Ok(resp), Err(e) => Err(( - format!("could not read remote extensions server response: {}", e), + format!("could not read remote extensions server response: {:?}", e), // It's fine to return and report error with status as 200 OK, // because we still failed to read the response. status.to_string(), diff --git a/compute_tools/src/migration.rs b/compute_tools/src/migration.rs index aa3c6b01f0..7b7b042d84 100644 --- a/compute_tools/src/migration.rs +++ b/compute_tools/src/migration.rs @@ -125,7 +125,7 @@ impl<'m> MigrationRunner<'m> { info!("Finished migration id={}", migration_id); } Err(e) => { - error!("Failed to run migration id={}: {}", migration_id, e); + error!("Failed to run migration id={}: {:?}", migration_id, e); DB_MIGRATION_FAILED .with_label_values(&[migration_id.to_string().as_str()]) .inc(); diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 43a820885b..37d5d3a1a6 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -28,7 +28,7 @@ fn do_control_plane_request( .map_err(|e| { ( true, - format!("could not perform spec request to control plane: {}", e), + format!("could not perform spec request to control plane: {:?}", e), UNKNOWN_HTTP_STATUS.to_string(), ) })?; @@ -39,7 +39,7 @@ fn do_control_plane_request( Ok(spec_resp) => Ok(spec_resp), Err(e) => Err(( true, - format!("could not deserialize control plane response: {}", e), + format!("could not deserialize control plane response: {:?}", e), status.to_string(), )), }, From cf6dee946efa212078cbc6781549f4d5e27e8255 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Thu, 30 Jan 2025 10:25:29 -0500 Subject: [PATCH 312/583] fix(pageserver): gc-compaction race with read (#10543) ## Problem close https://github.com/neondatabase/neon/issues/10482 ## Summary of changes Add an extra lock on the read path to protect against races. 
The read path has an implication that only certain kind of compactions can be performed. Garbage keys must first have an image layer covering the range, and then being gc-ed -- they cannot be done in one operation. An alternative to fix this is to move the layers read guard to be acquired at the beginning of `get_vectored_reconstruct_data_timeline`, but that was intentionally optimized out and I don't want to regress. The race is not limited to image layers. Gc-compaction will consolidate deltas automatically and produce a flat delta layer (i.e., when we have retain_lsns below the gc-horizon). The same race would also cause behaviors like getting an un-replayable key history as in https://github.com/neondatabase/neon/issues/10049. Signed-off-by: Alex Chi Z --- pageserver/src/tenant/timeline.rs | 6 ++++ pageserver/src/tenant/timeline/compaction.rs | 37 +++++++++++++++++++- 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index ab4b3cac63..f387c81c29 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -341,6 +341,8 @@ pub struct Timeline { // Needed to ensure that we can't create a branch at a point that was already garbage collected pub latest_gc_cutoff_lsn: Rcu, + pub(crate) gc_compaction_layer_update_lock: tokio::sync::RwLock<()>, + // List of child timelines and their branch points. This is needed to avoid // garbage collecting data that is still needed by the child timelines. pub(crate) gc_info: std::sync::RwLock, @@ -2437,6 +2439,7 @@ impl Timeline { shard_identity, pg_version, layers: Default::default(), + gc_compaction_layer_update_lock: tokio::sync::RwLock::new(()), walredo_mgr, walreceiver: Mutex::new(None), @@ -3480,6 +3483,9 @@ impl Timeline { // image layer). let _gc_cutoff_holder = timeline.get_latest_gc_cutoff_lsn(); + // See `compaction::compact_with_gc` for why we need this. + let _guard = timeline.gc_compaction_layer_update_lock.read().await; + loop { if cancel.is_cancelled() { return Err(GetVectoredError::Cancelled); diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 5f7b5f1af5..7242f73a82 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -2919,10 +2919,45 @@ impl Timeline { // Between the sanity check and this compaction update, there could be new layers being flushed, but it should be fine because we only // operate on L1 layers. { + // Gc-compaction will rewrite the history of a key. This could happen in two ways: + // + // 1. We create an image layer to replace all the deltas below the compact LSN. In this case, assume + // we have 2 delta layers A and B, both below the compact LSN. We create an image layer I to replace + // A and B at the compact LSN. If the read path finishes reading A, yields, and now we update the layer + // map, the read path then cannot find any keys below A, reporting a missing key error, while the key + // now gets stored in I at the compact LSN. + // + // --------------- --------------- + // delta1@LSN20 image1@LSN20 + // --------------- (read path collects delta@LSN20, => --------------- (read path cannot find anything + // delta1@LSN10 yields) below LSN 20) + // --------------- + // + // 2. We create a delta layer to replace all the deltas below the compact LSN, and in the delta layers, + // we combines the history of a key into a single image. 
For example, we have deltas at LSN 1, 2, 3, 4, + // Assume one delta layer contains LSN 1, 2, 3 and the other contains LSN 4. + // + // We let gc-compaction combine delta 2, 3, 4 into an image at LSN 4, which produces a delta layer that + // contains the delta at LSN 1, the image at LSN 4. If the read path finishes reading the original delta + // layer containing 4, yields, and we update the layer map to put the delta layer. + // + // --------------- --------------- + // delta1@LSN4 image1@LSN4 + // --------------- (read path collects delta@LSN4, => --------------- (read path collects LSN4 and LSN1, + // delta1@LSN1-3 yields) delta1@LSN1 which is an invalid history) + // --------------- --------------- + // + // Therefore, the gc-compaction layer update operation should wait for all ongoing reads, block all pending reads, + // and only allow reads to continue after the update is finished. + + let update_guard = self.gc_compaction_layer_update_lock.write().await; + // Acquiring the update guard ensures current read operations end and new read operations are blocked. + // TODO: can we use `latest_gc_cutoff` Rcu to achieve the same effect? let mut guard = self.layers.write().await; guard .open_mut()? - .finish_gc_compaction(&layer_selection, &compact_to, &self.metrics) + .finish_gc_compaction(&layer_selection, &compact_to, &self.metrics); + drop(update_guard); // Allow new reads to start ONLY after we finished updating the layer map. }; // Schedule an index-only upload to update the `latest_gc_cutoff` in the index_part.json. From efe42db264fc00e9ea13d16851452a2c7c1e58a5 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 30 Jan 2025 18:11:26 +0200 Subject: [PATCH 313/583] tests: test_pgdata_import_smoke requires the 'testing' cargo feature (#10569) It took me ages to figure out why it was failing on my laptop. What I saw was that when the test makes the 'import_pgdata' in the pageserver, the pageserver actually performs a regular 'bootstrap' timeline creation by running initdb, with no importing. It boiled down to the json request that the test uses: ``` { "new_timeline_id": str(timeline_id), "import_pgdata": { "idempotency_key": str(idempotency), "location": {"LocalFs": {"path": str(importbucket.absolute())}}, }, }, ``` and how serde deserializes into rust structs. The 'LocalFs' enum variant in `models.rs` is gated on the 'testing' cargo feature. On a non-testing build, that got deserialized into the default Bootstrap enum variant, as a valid TimelineCreateRequestModeImportPgdata variant could not be formed. PS. IMHO we should get rid of the testing feature, compile in all the functionality, and have a runtime flag to disable anything dangerous. With that, you would've gotten a nice "feature only enabled in testing mode" error in this case, or the test would've simply worked. But that's another story. --- test_runner/regress/test_import_pgdata.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test_runner/regress/test_import_pgdata.py b/test_runner/regress/test_import_pgdata.py index 182f715b0e..086d4b67c9 100644 --- a/test_runner/regress/test_import_pgdata.py +++ b/test_runner/regress/test_import_pgdata.py @@ -59,6 +59,9 @@ def test_pgdata_import_smoke( neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) env = neon_env_builder.init_start() + # The test needs LocalFs support, which is only built in testing mode.
+ env.pageserver.is_testing_enabled_or_skip() + env.pageserver.patch_config_toml_nonrecursive( { "import_pgdata_upcall_api": f"http://{cplane_mgmt_api_server.host}:{cplane_mgmt_api_server.port}/path/to/mgmt/api" From 6c8fc909d68f1e541c42f00162526ff1e59f6237 Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Thu, 30 Jan 2025 17:41:46 +0100 Subject: [PATCH 314/583] Benchmarking PostgreSQL17: for OLAP need specific connstr secrets (#10587) ## Problem for OLAP benchmarks we need specific connstr secrets with different database names for each job step This is a follow-up for https://github.com/neondatabase/neon/pull/10536 In previous PR we used a common GitHub secret for a shared re-use project that has 4 databases: neondb, tpch, clickbench and userexamples. [Failure example](https://neon-github-public-dev.s3.amazonaws.com/reports/main/13044872855/index.html#suites/54d0af6f403f1d8611e8894c2e07d023/fc029330265e9f6e/): ```log # /tmp/neon/pg_install/v17/bin/psql user=neondb_owner dbname=neondb host=ep-broad-brook-w2luwzzv.us-east-2.aws.neon.build sslmode=require options='-cstatement_timeout=0 ' -c -- $ID$ -- TPC-H/TPC-R Pricing Summary Report Query (Q1) -- Functional Query Definition -- Approved February 1998 ... ERROR: relation "lineitem" does not exist ``` ## Summary of changes We need dedicated GitHub secrets and dedicated connection strings for each of the use cases. ## Test run https://github.com/neondatabase/neon/actions/runs/13053968231 --- .github/workflows/benchmarking.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 9446b4d17b..49f23e895b 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -741,10 +741,10 @@ jobs: neonvm-captest-reuse) case "${PG_VERSION}" in 16) - CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CLICKBENCH_10M_CONNSTR_V16 }} + CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CLICKBENCH_10M_CONNSTR }} ;; 17) - CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR_PG17 }} + CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CLICKBENCH_CONNSTR_PG17 }} ;; *) echo >&2 "Unsupported PG_VERSION=${PG_VERSION} for PLATFORM=${PLATFORM}" @@ -864,7 +864,7 @@ jobs: CONNSTR_SECRET_NAME="BENCHMARK_CAPTEST_TPCH_S10_CONNSTR" ;; 17) - CONNSTR_SECRET_NAME="BENCHMARK_CAPTEST_CONNSTR_PG17" + CONNSTR_SECRET_NAME="BENCHMARK_CAPTEST_TPCH_CONNSTR_PG17" ;; *) echo >&2 "Unsupported PG_VERSION=${PG_VERSION} for PLATFORM=${PLATFORM}" @@ -984,7 +984,7 @@ jobs: CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_CAPTEST_CONNSTR }} ;; 17) - CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR_PG17 }} + CONNSTR=${{ secrets.BENCHMARK_CAPTEST_USER_EXAMPLE_CONNSTR_PG17 }} ;; *) echo >&2 "Unsupported PG_VERSION=${PG_VERSION} for PLATFORM=${PLATFORM}" From 8293b252b2a7aaf82f0e108997483387ff2106be Mon Sep 17 00:00:00 2001 From: Cheng Chen Date: Thu, 30 Jan 2025 10:33:25 -0800 Subject: [PATCH 315/583] chore(compute): pg_mooncake v0.1.1 (#10578) ## Problem Upgrade pg_mooncake to v0.1.1 ## Summary of changes https://github.com/Mooncake-Labs/pg_mooncake/blob/main/CHANGELOG.md#011-2025-01-29 --- compute/compute-node.Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index a428c61f34..e9f6c03768 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1140,8 +1140,8 @@ RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.1.0.tar.gz FROM rust-extensions-build AS pg-mooncake-build ARG 
PG_VERSION -RUN wget https://github.com/Mooncake-Labs/pg_mooncake/releases/download/v0.1.0/pg_mooncake-0.1.0.tar.gz -O pg_mooncake.tar.gz && \ - echo "eafd059b77f541f11525eb8affcd66a176968cbd8fe7c0d436e733f2aa4da59f pg_mooncake.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/Mooncake-Labs/pg_mooncake/releases/download/v0.1.1/pg_mooncake-0.1.1.tar.gz -O pg_mooncake.tar.gz && \ + echo "a2d16eff7948dde64f072609ca5d2962d6b4d07cb89d45952add473529c55f55 pg_mooncake.tar.gz" | sha256sum --check && \ mkdir pg_mooncake-src && cd pg_mooncake-src && tar xzf ../pg_mooncake.tar.gz --strip-components=1 -C . && \ make release -j $(getconf _NPROCESSORS_ONLN) && \ make install -j $(getconf _NPROCESSORS_ONLN) && \ From bae0de643e75bc4e9154bc6e6e62368ba5c41d9d Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 30 Jan 2025 19:22:59 +0000 Subject: [PATCH 316/583] tests: relax constraints on test_timeline_archival_chaos (#10595) ## Problem The test asserts that it completes at least 10 full timeline lifecycles, but the noisy CI environment sometimes doesn't meet that goal. Related: https://github.com/neondatabase/neon/issues/10389 ## Summary of changes - Sleep for longer between pageserver restarts, so that the timeline workers have more chance to make progress - Sleep for shorter between retries from timeline worker, so that they have better chance to get in while a pageserver is up between restarts - Relax the success condition to complete at least 5 iterations instead of 10 --- test_runner/regress/test_timeline_archive.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index bec8270582..306e971657 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -582,12 +582,12 @@ def test_timeline_archival_chaos(neon_env_builder: NeonEnvBuilder): # This is expected: we are injecting chaos, API calls will sometimes fail. # TODO: can we narrow this to assert we are getting friendly 503s? 
log.info(f"Iteration error, will retry: {e}") - shutdown.wait(random.random()) + shutdown.wait(random.random() * 0.5) except requests.exceptions.RetryError as e: # Retryable error repeated more times than `requests` is configured to tolerate, this # is expected when a pageserver remains unavailable for a couple seconds log.info(f"Iteration error, will retry: {e}") - shutdown.wait(random.random()) + shutdown.wait(random.random() * 0.5) except Exception as e: log.warning( f"Unexpected worker exception (current timeline {state.timeline_id}): {e}" @@ -632,7 +632,7 @@ def test_timeline_archival_chaos(neon_env_builder: NeonEnvBuilder): # Make sure we're up for as long as we spent restarting, to ensure operations can make progress log.info(f"Staying alive for {restart_duration}s") - time.sleep(restart_duration) + time.sleep(restart_duration * 2) else: # Migrate our tenant between pageservers origin_ps = env.get_tenant_pageserver(tenant_shard_id) @@ -651,7 +651,7 @@ def test_timeline_archival_chaos(neon_env_builder: NeonEnvBuilder): # Sanity check that during our run we did exercise some full timeline lifecycles, in case # one of our workers got stuck - assert len(timelines_deleted) > 10 + assert len(timelines_deleted) > 5 # That no invariant-violations were reported by workers assert violations == [] From 4d2c2e946007444b8b4bbeb5348d20986174ee16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 30 Jan 2025 20:23:25 +0100 Subject: [PATCH 317/583] Revert "storcon: switch to diesel-async and tokio-postgres (#10280)" (#10592) There was a regression of #10280, tracked in [#23583](https://github.com/neondatabase/cloud/issues/23583). I have ideas how to fix the issue, but we are too close to the release cutoff, so revert #10280 for now. We can revert the revert later :). 
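For context, this revert takes the storage controller back to synchronous `diesel` connections pooled by `r2d2` and executed on the tokio blocking thread pool, as the restored `with_conn` in the diff below shows. The following is only a rough sketch of that pattern under simplifying assumptions: `PersistenceSketch`, the pool size, and the use of `anyhow` for errors are placeholders, not the actual storage controller code.

```rust
use diesel::pg::PgConnection;
use diesel::r2d2::{ConnectionManager, Pool};

/// Simplified form of the pattern being restored: a synchronous r2d2 pool of
/// diesel PgConnections, with each query closure run via spawn_blocking so it
/// does not stall the async runtime.
struct PersistenceSketch {
    pool: Pool<ConnectionManager<PgConnection>>,
}

impl PersistenceSketch {
    fn new(database_url: &str) -> Self {
        let manager = ConnectionManager::<PgConnection>::new(database_url);
        let pool = Pool::builder()
            .max_size(8) // placeholder limit
            .build(manager)
            .expect("could not build connection pool");
        Self { pool }
    }

    /// Check a connection out of the pool and run a blocking closure against
    /// it on the blocking thread pool, returning its result to async code.
    async fn with_conn<F, R>(&self, func: F) -> anyhow::Result<R>
    where
        F: FnOnce(&mut PgConnection) -> anyhow::Result<R> + Send + 'static,
        R: Send + 'static,
    {
        let mut conn = self.pool.get()?;
        tokio::task::spawn_blocking(move || func(&mut *conn))
            .await
            .expect("blocking task panicked")
    }
}
```

The trade-off relative to the reverted diesel-async version is that each query ties up a blocking thread, but connection checkout and query execution stay on well-trodden synchronous diesel code paths.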
--- .github/workflows/_build-and-test-locally.yml | 4 + .github/workflows/build-macos.yml | 2 +- .github/workflows/neon_extra_builds.yml | 2 +- Cargo.lock | 159 ++-- Dockerfile | 2 +- Makefile | 2 + storage_controller/Cargo.toml | 5 +- storage_controller/src/main.rs | 2 +- storage_controller/src/persistence.rs | 783 ++++++++---------- 9 files changed, 397 insertions(+), 564 deletions(-) diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index f97402a90b..2daed90386 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -158,6 +158,8 @@ jobs: - name: Run cargo build run: | + PQ_LIB_DIR=$(pwd)/pg_install/v16/lib + export PQ_LIB_DIR ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests # Do install *before* running rust tests because they might recompile the @@ -215,6 +217,8 @@ jobs: env: NEXTEST_RETRIES: 3 run: | + PQ_LIB_DIR=$(pwd)/pg_install/v16/lib + export PQ_LIB_DIR LD_LIBRARY_PATH=$(pwd)/pg_install/v17/lib export LD_LIBRARY_PATH diff --git a/.github/workflows/build-macos.yml b/.github/workflows/build-macos.yml index 347a511e98..01d82a1ed2 100644 --- a/.github/workflows/build-macos.yml +++ b/.github/workflows/build-macos.yml @@ -235,7 +235,7 @@ jobs: echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV - name: Run cargo build (only for v17) - run: cargo build --all --release -j$(sysctl -n hw.ncpu) + run: PQ_LIB_DIR=$(pwd)/pg_install/v17/lib cargo build --all --release -j$(sysctl -n hw.ncpu) - name: Check that no warnings are produced (only for v17) run: ./run_clippy.sh diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index f077e04d1c..5b5910badf 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -114,7 +114,7 @@ jobs: run: make walproposer-lib -j$(nproc) - name: Produce the build stats - run: cargo build --all --release --timings -j$(nproc) + run: PQ_LIB_DIR=$(pwd)/pg_install/v17/lib cargo build --all --release --timings -j$(nproc) - name: Configure AWS credentials uses: aws-actions/configure-aws-credentials@v4 diff --git a/Cargo.lock b/Cargo.lock index 9ba90355df..359f989a76 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -941,18 +941,6 @@ version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b" -[[package]] -name = "bb8" -version = "0.8.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d89aabfae550a5c44b43ab941844ffcd2e993cb6900b342debf59e9ea74acdb8" -dependencies = [ - "async-trait", - "futures-util", - "parking_lot 0.12.1", - "tokio", -] - [[package]] name = "bcder" version = "0.7.4" @@ -1312,7 +1300,7 @@ dependencies = [ "tar", "thiserror 1.0.69", "tokio", - "tokio-postgres 0.7.9", + "tokio-postgres", "tokio-stream", "tokio-util", "tower 0.5.2", @@ -1421,7 +1409,7 @@ dependencies = [ "storage_broker", "thiserror 1.0.69", "tokio", - "tokio-postgres 0.7.9", + "tokio-postgres", "tokio-util", "toml", "toml_edit", @@ -1797,24 +1785,11 @@ dependencies = [ "chrono", "diesel_derives", "itoa", + "pq-sys", + "r2d2", "serde_json", ] -[[package]] -name = "diesel-async" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51a307ac00f7c23f526a04a77761a0519b9f0eb2838ebf5b905a58580095bdcb" -dependencies = [ - "async-trait", - "bb8", - "diesel", - "futures-util", - 
"scoped-futures", - "tokio", - "tokio-postgres 0.7.12", -] - [[package]] name = "diesel_derives" version = "2.2.1" @@ -4060,8 +4035,8 @@ dependencies = [ "pageserver_compaction", "pin-project-lite", "postgres", - "postgres-protocol 0.6.6", - "postgres-types 0.2.6", + "postgres-protocol", + "postgres-types", "postgres_backend", "postgres_connection", "postgres_ffi", @@ -4092,7 +4067,7 @@ dependencies = [ "tokio", "tokio-epoll-uring", "tokio-io-timeout", - "tokio-postgres 0.7.9", + "tokio-postgres", "tokio-stream", "tokio-tar", "tokio-util", @@ -4150,7 +4125,7 @@ dependencies = [ "serde", "thiserror 1.0.69", "tokio", - "tokio-postgres 0.7.9", + "tokio-postgres", "tokio-stream", "tokio-util", "utils", @@ -4456,7 +4431,7 @@ dependencies = [ "futures-util", "log", "tokio", - "tokio-postgres 0.7.9", + "tokio-postgres", ] [[package]] @@ -4477,24 +4452,6 @@ dependencies = [ "stringprep", ] -[[package]] -name = "postgres-protocol" -version = "0.6.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "acda0ebdebc28befa84bee35e651e4c5f09073d668c7aed4cf7e23c3cda84b23" -dependencies = [ - "base64 0.22.1", - "byteorder", - "bytes", - "fallible-iterator", - "hmac", - "md-5", - "memchr", - "rand 0.8.5", - "sha2", - "stringprep", -] - [[package]] name = "postgres-protocol2" version = "0.1.0" @@ -4519,18 +4476,7 @@ dependencies = [ "bytes", "chrono", "fallible-iterator", - "postgres-protocol 0.6.6", -] - -[[package]] -name = "postgres-types" -version = "0.2.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f66ea23a2d0e5734297357705193335e0a957696f34bed2f2faefacb2fec336f" -dependencies = [ - "bytes", - "fallible-iterator", - "postgres-protocol 0.6.7", + "postgres-protocol", ] [[package]] @@ -4555,7 +4501,7 @@ dependencies = [ "serde", "thiserror 1.0.69", "tokio", - "tokio-postgres 0.7.9", + "tokio-postgres", "tokio-postgres-rustls", "tokio-rustls 0.26.0", "tokio-util", @@ -4570,7 +4516,7 @@ dependencies = [ "itertools 0.10.5", "once_cell", "postgres", - "tokio-postgres 0.7.9", + "tokio-postgres", "url", ] @@ -4657,6 +4603,15 @@ version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" +[[package]] +name = "pq-sys" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6cc05d7ea95200187117196eee9edd0644424911821aeb28a18ce60ea0b8793" +dependencies = [ + "vcpkg", +] + [[package]] name = "pq_proto" version = "0.1.0" @@ -4664,7 +4619,7 @@ dependencies = [ "byteorder", "bytes", "itertools 0.10.5", - "postgres-protocol 0.6.6", + "postgres-protocol", "rand 0.8.5", "serde", "thiserror 1.0.69", @@ -4912,7 +4867,7 @@ dependencies = [ "tikv-jemalloc-ctl", "tikv-jemallocator", "tokio", - "tokio-postgres 0.7.9", + "tokio-postgres", "tokio-postgres2", "tokio-rustls 0.26.0", "tokio-tungstenite 0.21.0", @@ -4969,6 +4924,17 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "r2d2" +version = "0.8.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51de85fb3fb6524929c8a2eb85e6b6d363de4e8c48f9e2c2eac4944abc181c93" +dependencies = [ + "log", + "parking_lot 0.12.1", + "scheduled-thread-pool", +] + [[package]] name = "rand" version = "0.7.3" @@ -5700,7 +5666,7 @@ dependencies = [ "pageserver_api", "parking_lot 0.12.1", "postgres", - "postgres-protocol 0.6.6", + "postgres-protocol", "postgres_backend", "postgres_ffi", "pprof", @@ -5724,7 +5690,7 @@ dependencies = [ "tikv-jemallocator", 
"tokio", "tokio-io-timeout", - "tokio-postgres 0.7.9", + "tokio-postgres", "tokio-stream", "tokio-tar", "tokio-util", @@ -5783,12 +5749,12 @@ dependencies = [ ] [[package]] -name = "scoped-futures" -version = "0.1.4" +name = "scheduled-thread-pool" +version = "0.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b24aae2d0636530f359e9d5ef0c04669d11c5e756699b27a6a6d845d8329091" +checksum = "3cbc66816425a074528352f5789333ecff06ca41b36b0b0efdfbb29edc391a19" dependencies = [ - "pin-project-lite", + "parking_lot 0.12.1", ] [[package]] @@ -6323,7 +6289,6 @@ dependencies = [ "clap", "control_plane", "diesel", - "diesel-async", "diesel_migrations", "fail", "futures", @@ -6338,10 +6303,10 @@ dependencies = [ "pageserver_api", "pageserver_client", "postgres_connection", + "r2d2", "rand 0.8.5", "reqwest", "routerify", - "scoped-futures", "scopeguard", "serde", "serde_json", @@ -6394,7 +6359,7 @@ dependencies = [ "serde_json", "storage_controller_client", "tokio", - "tokio-postgres 0.7.9", + "tokio-postgres", "tokio-postgres-rustls", "tokio-stream", "tokio-util", @@ -6873,34 +6838,8 @@ dependencies = [ "percent-encoding", "phf", "pin-project-lite", - "postgres-protocol 0.6.6", - "postgres-types 0.2.6", - "rand 0.8.5", - "socket2", - "tokio", - "tokio-util", - "whoami", -] - -[[package]] -name = "tokio-postgres" -version = "0.7.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b5d3742945bc7d7f210693b0c58ae542c6fd47b17adbbda0885f3dcb34a6bdb" -dependencies = [ - "async-trait", - "byteorder", - "bytes", - "fallible-iterator", - "futures-channel", - "futures-util", - "log", - "parking_lot 0.12.1", - "percent-encoding", - "phf", - "pin-project-lite", - "postgres-protocol 0.6.7", - "postgres-types 0.2.8", + "postgres-protocol", + "postgres-types", "rand 0.8.5", "socket2", "tokio", @@ -6917,7 +6856,7 @@ dependencies = [ "ring", "rustls 0.23.18", "tokio", - "tokio-postgres 0.7.9", + "tokio-postgres", "tokio-rustls 0.26.0", "x509-certificate", ] @@ -7576,6 +7515,12 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + [[package]] name = "version_check" version = "0.9.4" @@ -7595,7 +7540,7 @@ dependencies = [ "serde_json", "sysinfo", "tokio", - "tokio-postgres 0.7.9", + "tokio-postgres", "tokio-util", "tracing", "tracing-subscriber", diff --git a/Dockerfile b/Dockerfile index 7ba54c8ca5..f80666529b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -45,7 +45,7 @@ COPY --chown=nonroot . . 
ARG ADDITIONAL_RUSTFLAGS RUN set -e \ - && RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo build \ + && PQ_LIB_DIR=$(pwd)/pg_install/v${STABLE_PG_VERSION}/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo build \ --bin pg_sni_router \ --bin pageserver \ --bin pagectl \ diff --git a/Makefile b/Makefile index d1238caebf..22ebfea7d5 100644 --- a/Makefile +++ b/Makefile @@ -64,6 +64,8 @@ CARGO_BUILD_FLAGS += $(filter -j1,$(MAKEFLAGS)) CARGO_CMD_PREFIX += $(if $(filter n,$(MAKEFLAGS)),,+) # Force cargo not to print progress bar CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1 +# Set PQ_LIB_DIR to make sure `storage_controller` get linked with bundled libpq (through diesel) +CARGO_CMD_PREFIX += PQ_LIB_DIR=$(POSTGRES_INSTALL_DIR)/v16/lib CACHEDIR_TAG_CONTENTS := "Signature: 8a477f597d28d172789f06886806bc55" diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index 9860bd5d0e..caaa22d0a5 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -45,11 +45,12 @@ strum_macros.workspace = true diesel = { version = "2.2.6", features = [ "serde_json", + "postgres", + "r2d2", "chrono", ] } -diesel-async = { version = "0.5.2", features = ["postgres", "bb8", "async-connection-wrapper"] } diesel_migrations = { version = "2.2.0" } -scoped-futures = "0.1.4" +r2d2 = { version = "0.8.10" } utils = { path = "../libs/utils/" } metrics = { path = "../libs/metrics/" } diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 659c088d51..801409d612 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -308,7 +308,7 @@ async fn async_main() -> anyhow::Result<()> { // Validate that we can connect to the database Persistence::await_connection(&secrets.database_url, args.db_connect_timeout.into()).await?; - let persistence = Arc::new(Persistence::new(secrets.database_url).await); + let persistence = Arc::new(Persistence::new(secrets.database_url)); let service = Service::spawn(config, persistence.clone()).await?; diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 35eb15b297..37bfaf1139 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -5,12 +5,9 @@ use std::time::Duration; use std::time::Instant; use self::split_state::SplitState; +use diesel::pg::PgConnection; use diesel::prelude::*; -use diesel_async::async_connection_wrapper::AsyncConnectionWrapper; -use diesel_async::pooled_connection::bb8::Pool; -use diesel_async::pooled_connection::AsyncDieselConnectionManager; -use diesel_async::RunQueryDsl; -use diesel_async::{AsyncConnection, AsyncPgConnection}; +use diesel::Connection; use itertools::Itertools; use pageserver_api::controller_api::AvailabilityZone; use pageserver_api::controller_api::MetadataHealthRecord; @@ -23,7 +20,6 @@ use pageserver_api::shard::ShardConfigError; use pageserver_api::shard::ShardIdentity; use pageserver_api::shard::ShardStripeSize; use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId}; -use scoped_futures::ScopedBoxFuture; use serde::{Deserialize, Serialize}; use utils::generation::Generation; use utils::id::{NodeId, TenantId}; @@ -64,7 +60,7 @@ const MIGRATIONS: EmbeddedMigrations = embed_migrations!("./migrations"); /// updated, and reads of nodes are always from memory, not the database. 
We only require that /// we can UPDATE a node's scheduling mode reasonably quickly to mark a bad node offline. pub struct Persistence { - connection_pool: Pool, + connection_pool: diesel::r2d2::Pool>, } /// Legacy format, for use in JSON compat objects in test environment @@ -80,7 +76,7 @@ pub(crate) enum DatabaseError { #[error(transparent)] Connection(#[from] diesel::result::ConnectionError), #[error(transparent)] - ConnectionPool(#[from] diesel_async::pooled_connection::bb8::RunError), + ConnectionPool(#[from] r2d2::Error), #[error("Logical error: {0}")] Logical(String), #[error("Migration error: {0}")] @@ -128,7 +124,6 @@ pub(crate) enum AbortShardSplitStatus { pub(crate) type DatabaseResult = Result; /// Some methods can operate on either a whole tenant or a single shard -#[derive(Clone)] pub(crate) enum TenantFilter { Tenant(TenantId), Shard(TenantShardId), @@ -141,11 +136,6 @@ pub(crate) struct ShardGenerationState { pub(crate) generation_pageserver: Option, } -// A generous allowance for how many times we may retry serializable transactions -// before giving up. This is not expected to be hit: it is a defensive measure in case we -// somehow engineer a situation where duelling transactions might otherwise live-lock. -const MAX_RETRIES: usize = 128; - impl Persistence { // The default postgres connection limit is 100. We use up to 99, to leave one free for a human admin under // normal circumstances. This assumes we have exclusive use of the database cluster to which we connect. @@ -155,12 +145,12 @@ impl Persistence { const IDLE_CONNECTION_TIMEOUT: Duration = Duration::from_secs(10); const MAX_CONNECTION_LIFETIME: Duration = Duration::from_secs(60); - pub async fn new(database_url: String) -> Self { - let manager = AsyncDieselConnectionManager::::new(database_url); + pub fn new(database_url: String) -> Self { + let manager = diesel::r2d2::ConnectionManager::::new(database_url); // We will use a connection pool: this is primarily to _limit_ our connection count, rather than to optimize time // to execute queries (database queries are not generally on latency-sensitive paths). - let connection_pool = Pool::builder() + let connection_pool = diesel::r2d2::Pool::builder() .max_size(Self::MAX_CONNECTIONS) .max_lifetime(Some(Self::MAX_CONNECTION_LIFETIME)) .idle_timeout(Some(Self::IDLE_CONNECTION_TIMEOUT)) @@ -168,7 +158,6 @@ impl Persistence { .min_idle(Some(1)) .test_on_check_out(true) .build(manager) - .await .expect("Could not build connection pool"); Self { connection_pool } @@ -182,7 +171,7 @@ impl Persistence { ) -> Result<(), diesel::ConnectionError> { let started_at = Instant::now(); loop { - match AsyncPgConnection::establish(database_url).await { + match PgConnection::establish(database_url) { Ok(_) => { tracing::info!("Connected to database."); return Ok(()); @@ -203,22 +192,57 @@ impl Persistence { pub(crate) async fn migration_run(&self) -> DatabaseResult<()> { use diesel_migrations::{HarnessWithOutput, MigrationHarness}; - // Can't use self.with_conn here as we do spawn_blocking which requires static. 
- let conn = self - .connection_pool - .dedicated_connection() - .await - .map_err(|e| DatabaseError::Migration(e.to_string()))?; - let mut async_wrapper: AsyncConnectionWrapper = - AsyncConnectionWrapper::from(conn); - tokio::task::spawn_blocking(move || { + self.with_conn(move |conn| -> DatabaseResult<()> { + HarnessWithOutput::write_to_stdout(conn) + .run_pending_migrations(MIGRATIONS) + .map(|_| ()) + .map_err(|e| DatabaseError::Migration(e.to_string())) + }) + .await + } + + /// Wraps `with_conn` in order to collect latency and error metrics + async fn with_measured_conn(&self, op: DatabaseOperation, func: F) -> DatabaseResult + where + F: Fn(&mut PgConnection) -> DatabaseResult + Send + 'static, + R: Send + 'static, + { + let latency = &METRICS_REGISTRY + .metrics_group + .storage_controller_database_query_latency; + let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup { operation: op }); + + let res = self.with_conn(func).await; + + if let Err(err) = &res { + let error_counter = &METRICS_REGISTRY + .metrics_group + .storage_controller_database_query_error; + error_counter.inc(DatabaseQueryErrorLabelGroup { + error_type: err.error_label(), + operation: op, + }) + } + + res + } + + /// Call the provided function in a tokio blocking thread, with a Diesel database connection. + async fn with_conn(&self, func: F) -> DatabaseResult + where + F: Fn(&mut PgConnection) -> DatabaseResult + Send + 'static, + R: Send + 'static, + { + // A generous allowance for how many times we may retry serializable transactions + // before giving up. This is not expected to be hit: it is a defensive measure in case we + // somehow engineer a situation where duelling transactions might otherwise live-lock. + const MAX_RETRIES: usize = 128; + + let mut conn = self.connection_pool.get()?; + tokio::task::spawn_blocking(move || -> DatabaseResult { let mut retry_count = 0; loop { - let result = HarnessWithOutput::write_to_stdout(&mut async_wrapper) - .run_pending_migrations(MIGRATIONS) - .map(|_| ()) - .map_err(|e| DatabaseError::Migration(e.to_string())); - match result { + match conn.build_transaction().serializable().run(|c| func(c)) { Ok(r) => break Ok(r), Err( err @ DatabaseError::Query(diesel::result::Error::DatabaseError( @@ -247,112 +271,33 @@ impl Persistence { } }) .await - .map_err(|e| DatabaseError::Migration(e.to_string()))??; - Ok(()) - } - - /// Wraps `with_conn` in order to collect latency and error metrics - async fn with_measured_conn<'a, 'b, F, R>( - &self, - op: DatabaseOperation, - func: F, - ) -> DatabaseResult - where - F: for<'r> Fn(&'r mut AsyncPgConnection) -> ScopedBoxFuture<'b, 'r, DatabaseResult> - + Send - + std::marker::Sync - + 'a, - R: Send + 'b, - { - let latency = &METRICS_REGISTRY - .metrics_group - .storage_controller_database_query_latency; - let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup { operation: op }); - - let res = self.with_conn(func).await; - - if let Err(err) = &res { - let error_counter = &METRICS_REGISTRY - .metrics_group - .storage_controller_database_query_error; - error_counter.inc(DatabaseQueryErrorLabelGroup { - error_type: err.error_label(), - operation: op, - }) - } - - res - } - - /// Call the provided function with a Diesel database connection in a retry loop - async fn with_conn<'a, 'b, F, R>(&self, func: F) -> DatabaseResult - where - F: for<'r> Fn(&'r mut AsyncPgConnection) -> ScopedBoxFuture<'b, 'r, DatabaseResult> - + Send - + std::marker::Sync - + 'a, - R: Send + 'b, - { - let mut retry_count = 0; - loop { - let mut conn = 
self.connection_pool.get().await?; - match conn - .build_transaction() - .serializable() - .run(|c| func(c)) - .await - { - Ok(r) => break Ok(r), - Err( - err @ DatabaseError::Query(diesel::result::Error::DatabaseError( - diesel::result::DatabaseErrorKind::SerializationFailure, - _, - )), - ) => { - retry_count += 1; - if retry_count > MAX_RETRIES { - tracing::error!( - "Exceeded max retries on SerializationFailure errors: {err:?}" - ); - break Err(err); - } else { - // Retry on serialization errors: these are expected, because even though our - // transactions don't fight for the same rows, they will occasionally collide - // on index pages (e.g. increment_generation for unrelated shards can collide) - tracing::debug!("Retrying transaction on serialization failure {err:?}"); - continue; - } - } - Err(e) => break Err(e), - } - } + .expect("Task panic") } /// When a node is first registered, persist it before using it for anything pub(crate) async fn insert_node(&self, node: &Node) -> DatabaseResult<()> { - let np = &node.to_persistent(); - self.with_measured_conn(DatabaseOperation::InsertNode, move |conn| { - Box::pin(async move { + let np = node.to_persistent(); + self.with_measured_conn( + DatabaseOperation::InsertNode, + move |conn| -> DatabaseResult<()> { diesel::insert_into(crate::schema::nodes::table) - .values(np) - .execute(conn) - .await?; + .values(&np) + .execute(conn)?; Ok(()) - }) - }) + }, + ) .await } /// At startup, populate the list of nodes which our shards may be placed on pub(crate) async fn list_nodes(&self) -> DatabaseResult> { let nodes: Vec = self - .with_measured_conn(DatabaseOperation::ListNodes, move |conn| { - Box::pin(async move { - Ok(crate::schema::nodes::table - .load::(conn) - .await?) - }) - }) + .with_measured_conn( + DatabaseOperation::ListNodes, + move |conn| -> DatabaseResult<_> { + Ok(crate::schema::nodes::table.load::(conn)?) 
+ }, + ) .await?; tracing::info!("list_nodes: loaded {} nodes", nodes.len()); @@ -368,14 +313,11 @@ impl Persistence { use crate::schema::nodes::dsl::*; let updated = self .with_measured_conn(DatabaseOperation::UpdateNode, move |conn| { - Box::pin(async move { - let updated = diesel::update(nodes) - .filter(node_id.eq(input_node_id.0 as i64)) - .set((scheduling_policy.eq(String::from(input_scheduling)),)) - .execute(conn) - .await?; - Ok(updated) - }) + let updated = diesel::update(nodes) + .filter(node_id.eq(input_node_id.0 as i64)) + .set((scheduling_policy.eq(String::from(input_scheduling)),)) + .execute(conn)?; + Ok(updated) }) .await?; @@ -397,16 +339,17 @@ impl Persistence { &self, ) -> DatabaseResult> { use crate::schema::tenant_shards::dsl::*; - self.with_measured_conn(DatabaseOperation::ListTenantShards, move |conn| { - Box::pin(async move { + self.with_measured_conn( + DatabaseOperation::ListTenantShards, + move |conn| -> DatabaseResult<_> { let query = tenant_shards.filter( placement_policy.ne(serde_json::to_string(&PlacementPolicy::Detached).unwrap()), ); - let result = query.load::(conn).await?; + let result = query.load::(conn)?; Ok(result) - }) - }) + }, + ) .await } @@ -416,14 +359,15 @@ impl Persistence { filter_tenant_id: TenantId, ) -> DatabaseResult> { use crate::schema::tenant_shards::dsl::*; - self.with_measured_conn(DatabaseOperation::LoadTenant, move |conn| { - Box::pin(async move { + self.with_measured_conn( + DatabaseOperation::LoadTenant, + move |conn| -> DatabaseResult<_> { let query = tenant_shards.filter(tenant_id.eq(filter_tenant_id.to_string())); - let result = query.load::(conn).await?; + let result = query.load::(conn)?; Ok(result) - }) - }) + }, + ) .await } @@ -449,22 +393,19 @@ impl Persistence { }) .collect::>(); - let shards = &shards; - let metadata_health_records = &metadata_health_records; - self.with_measured_conn(DatabaseOperation::InsertTenantShards, move |conn| { - Box::pin(async move { + self.with_measured_conn( + DatabaseOperation::InsertTenantShards, + move |conn| -> DatabaseResult<()> { diesel::insert_into(tenant_shards::table) - .values(shards) - .execute(conn) - .await?; + .values(&shards) + .execute(conn)?; diesel::insert_into(metadata_health::table) - .values(metadata_health_records) - .execute(conn) - .await?; + .values(&metadata_health_records) + .execute(conn)?; Ok(()) - }) - }) + }, + ) .await } @@ -472,31 +413,31 @@ impl Persistence { /// the tenant from memory on this server. pub(crate) async fn delete_tenant(&self, del_tenant_id: TenantId) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; - self.with_measured_conn(DatabaseOperation::DeleteTenant, move |conn| { - Box::pin(async move { + self.with_measured_conn( + DatabaseOperation::DeleteTenant, + move |conn| -> DatabaseResult<()> { // `metadata_health` status (if exists) is also deleted based on the cascade behavior. 
diesel::delete(tenant_shards) .filter(tenant_id.eq(del_tenant_id.to_string())) - .execute(conn) - .await?; + .execute(conn)?; Ok(()) - }) - }) + }, + ) .await } pub(crate) async fn delete_node(&self, del_node_id: NodeId) -> DatabaseResult<()> { use crate::schema::nodes::dsl::*; - self.with_measured_conn(DatabaseOperation::DeleteNode, move |conn| { - Box::pin(async move { + self.with_measured_conn( + DatabaseOperation::DeleteNode, + move |conn| -> DatabaseResult<()> { diesel::delete(nodes) .filter(node_id.eq(del_node_id.0 as i64)) - .execute(conn) - .await?; + .execute(conn)?; Ok(()) - }) - }) + }, + ) .await } @@ -513,41 +454,34 @@ impl Persistence { use crate::schema::tenant_shards::dsl::*; let updated = self .with_measured_conn(DatabaseOperation::ReAttach, move |conn| { - Box::pin(async move { - let rows_updated = diesel::update(tenant_shards) - .filter(generation_pageserver.eq(input_node_id.0 as i64)) - .set(generation.eq(generation + 1)) - .execute(conn) - .await?; + let rows_updated = diesel::update(tenant_shards) + .filter(generation_pageserver.eq(input_node_id.0 as i64)) + .set(generation.eq(generation + 1)) + .execute(conn)?; - tracing::info!("Incremented {} tenants' generations", rows_updated); + tracing::info!("Incremented {} tenants' generations", rows_updated); - // TODO: UPDATE+SELECT in one query + // TODO: UPDATE+SELECT in one query - let updated = tenant_shards - .filter(generation_pageserver.eq(input_node_id.0 as i64)) - .select(TenantShardPersistence::as_select()) - .load(conn) - .await?; + let updated = tenant_shards + .filter(generation_pageserver.eq(input_node_id.0 as i64)) + .select(TenantShardPersistence::as_select()) + .load(conn)?; - // If the node went through a drain and restart phase before re-attaching, - // then reset it's node scheduling policy to active. - diesel::update(nodes) - .filter(node_id.eq(input_node_id.0 as i64)) - .filter( - scheduling_policy - .eq(String::from(NodeSchedulingPolicy::PauseForRestart)) - .or(scheduling_policy - .eq(String::from(NodeSchedulingPolicy::Draining))) - .or(scheduling_policy - .eq(String::from(NodeSchedulingPolicy::Filling))), - ) - .set(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Active))) - .execute(conn) - .await?; + // If the node went through a drain and restart phase before re-attaching, + // then reset it's node scheduling policy to active. 
+ diesel::update(nodes) + .filter(node_id.eq(input_node_id.0 as i64)) + .filter( + scheduling_policy + .eq(String::from(NodeSchedulingPolicy::PauseForRestart)) + .or(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Draining))) + .or(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Filling))), + ) + .set(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Active))) + .execute(conn)?; - Ok(updated) - }) + Ok(updated) }) .await?; @@ -584,22 +518,19 @@ impl Persistence { use crate::schema::tenant_shards::dsl::*; let updated = self .with_measured_conn(DatabaseOperation::IncrementGeneration, move |conn| { - Box::pin(async move { - let updated = diesel::update(tenant_shards) - .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) - .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) - .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) - .set(( - generation.eq(generation + 1), - generation_pageserver.eq(node_id.0 as i64), - )) - // TODO: only returning() the generation column - .returning(TenantShardPersistence::as_returning()) - .get_result(conn) - .await?; + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) + .set(( + generation.eq(generation + 1), + generation_pageserver.eq(node_id.0 as i64), + )) + // TODO: only returning() the generation column + .returning(TenantShardPersistence::as_returning()) + .get_result(conn)?; - Ok(updated) - }) + Ok(updated) }) .await?; @@ -631,15 +562,12 @@ impl Persistence { use crate::schema::tenant_shards::dsl::*; let rows = self .with_measured_conn(DatabaseOperation::TenantGenerations, move |conn| { - Box::pin(async move { - let result = tenant_shards - .filter(tenant_id.eq(filter_tenant_id.to_string())) - .select(TenantShardPersistence::as_select()) - .order(shard_number) - .load(conn) - .await?; - Ok(result) - }) + let result = tenant_shards + .filter(tenant_id.eq(filter_tenant_id.to_string())) + .select(TenantShardPersistence::as_select()) + .order(shard_number) + .load(conn)?; + Ok(result) }) .await?; @@ -687,18 +615,15 @@ impl Persistence { break; } - let in_clause = &in_clause; let chunk_rows = self .with_measured_conn(DatabaseOperation::ShardGenerations, move |conn| { - Box::pin(async move { - // diesel doesn't support multi-column IN queries, so we compose raw SQL. No escaping is required because - // the inputs are strongly typed and cannot carry any user-supplied raw string content. - let result : Vec = diesel::sql_query( - format!("SELECT * from tenant_shards where (tenant_id, shard_number, shard_count) in ({in_clause});").as_str() - ).load(conn).await?; + // diesel doesn't support multi-column IN queries, so we compose raw SQL. No escaping is required because + // the inputs are strongly typed and cannot carry any user-supplied raw string content. 
+ let result : Vec = diesel::sql_query( + format!("SELECT * from tenant_shards where (tenant_id, shard_number, shard_count) in ({in_clause});").as_str() + ).load(conn)?; - Ok(result) - }) + Ok(result) }) .await?; rows.extend(chunk_rows.into_iter()) @@ -732,58 +657,51 @@ impl Persistence { ) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; - let tenant = &tenant; - let input_placement_policy = &input_placement_policy; - let input_config = &input_config; - let input_generation = &input_generation; - let input_scheduling_policy = &input_scheduling_policy; self.with_measured_conn(DatabaseOperation::UpdateTenantShard, move |conn| { - Box::pin(async move { - let query = match tenant { - TenantFilter::Shard(tenant_shard_id) => diesel::update(tenant_shards) - .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) - .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) - .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) - .into_boxed(), - TenantFilter::Tenant(input_tenant_id) => diesel::update(tenant_shards) - .filter(tenant_id.eq(input_tenant_id.to_string())) - .into_boxed(), - }; + let query = match tenant { + TenantFilter::Shard(tenant_shard_id) => diesel::update(tenant_shards) + .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) + .into_boxed(), + TenantFilter::Tenant(input_tenant_id) => diesel::update(tenant_shards) + .filter(tenant_id.eq(input_tenant_id.to_string())) + .into_boxed(), + }; - // Clear generation_pageserver if we are moving into a state where we won't have - // any attached pageservers. - let input_generation_pageserver = match input_placement_policy { - None | Some(PlacementPolicy::Attached(_)) => None, - Some(PlacementPolicy::Detached | PlacementPolicy::Secondary) => Some(None), - }; + // Clear generation_pageserver if we are moving into a state where we won't have + // any attached pageservers. 
+ let input_generation_pageserver = match input_placement_policy { + None | Some(PlacementPolicy::Attached(_)) => None, + Some(PlacementPolicy::Detached | PlacementPolicy::Secondary) => Some(None), + }; - #[derive(AsChangeset)] - #[diesel(table_name = crate::schema::tenant_shards)] - struct ShardUpdate { - generation: Option, - placement_policy: Option, - config: Option, - scheduling_policy: Option, - generation_pageserver: Option>, - } + #[derive(AsChangeset)] + #[diesel(table_name = crate::schema::tenant_shards)] + struct ShardUpdate { + generation: Option, + placement_policy: Option, + config: Option, + scheduling_policy: Option, + generation_pageserver: Option>, + } - let update = ShardUpdate { - generation: input_generation.map(|g| g.into().unwrap() as i32), - placement_policy: input_placement_policy - .as_ref() - .map(|p| serde_json::to_string(&p).unwrap()), - config: input_config - .as_ref() - .map(|c| serde_json::to_string(&c).unwrap()), - scheduling_policy: input_scheduling_policy - .map(|p| serde_json::to_string(&p).unwrap()), - generation_pageserver: input_generation_pageserver, - }; + let update = ShardUpdate { + generation: input_generation.map(|g| g.into().unwrap() as i32), + placement_policy: input_placement_policy + .as_ref() + .map(|p| serde_json::to_string(&p).unwrap()), + config: input_config + .as_ref() + .map(|c| serde_json::to_string(&c).unwrap()), + scheduling_policy: input_scheduling_policy + .map(|p| serde_json::to_string(&p).unwrap()), + generation_pageserver: input_generation_pageserver, + }; - query.set(update).execute(conn).await?; + query.set(update).execute(conn)?; - Ok(()) - }) + Ok(()) }) .await?; @@ -797,27 +715,23 @@ impl Persistence { ) -> DatabaseResult)>> { use crate::schema::tenant_shards::dsl::*; - let preferred_azs = preferred_azs.as_slice(); self.with_measured_conn(DatabaseOperation::SetPreferredAzs, move |conn| { - Box::pin(async move { - let mut shards_updated = Vec::default(); + let mut shards_updated = Vec::default(); - for (tenant_shard_id, preferred_az) in preferred_azs.iter() { - let updated = diesel::update(tenant_shards) - .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) - .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) - .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) - .set(preferred_az_id.eq(preferred_az.as_ref().map(|az| az.0.clone()))) - .execute(conn) - .await?; + for (tenant_shard_id, preferred_az) in preferred_azs.iter() { + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) + .set(preferred_az_id.eq(preferred_az.as_ref().map(|az| az.0.clone()))) + .execute(conn)?; - if updated == 1 { - shards_updated.push((*tenant_shard_id, preferred_az.clone())); - } + if updated == 1 { + shards_updated.push((*tenant_shard_id, preferred_az.clone())); } + } - Ok(shards_updated) - }) + Ok(shards_updated) }) .await } @@ -825,21 +739,17 @@ impl Persistence { pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> { use crate::schema::tenant_shards::dsl::*; self.with_measured_conn(DatabaseOperation::Detach, move |conn| { - Box::pin(async move { - let updated = diesel::update(tenant_shards) - .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) - .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) - .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as 
i32)) - .set(( - generation_pageserver.eq(Option::::None), - placement_policy - .eq(serde_json::to_string(&PlacementPolicy::Detached).unwrap()), - )) - .execute(conn) - .await?; + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) + .set(( + generation_pageserver.eq(Option::::None), + placement_policy.eq(serde_json::to_string(&PlacementPolicy::Detached).unwrap()), + )) + .execute(conn)?; - Ok(updated) - }) + Ok(updated) }) .await?; @@ -858,16 +768,14 @@ impl Persistence { parent_to_children: Vec<(TenantShardId, Vec)>, ) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; - let parent_to_children = parent_to_children.as_slice(); - self.with_measured_conn(DatabaseOperation::BeginShardSplit, move |conn| { - Box::pin(async move { + self.with_measured_conn(DatabaseOperation::BeginShardSplit, move |conn| -> DatabaseResult<()> { // Mark parent shards as splitting let updated = diesel::update(tenant_shards) .filter(tenant_id.eq(split_tenant_id.to_string())) .filter(shard_count.eq(old_shard_count.literal() as i32)) .set((splitting.eq(1),)) - .execute(conn).await?; + .execute(conn)?; if u8::try_from(updated) .map_err(|_| DatabaseError::Logical( format!("Overflow existing shard count {} while splitting", updated)) @@ -880,7 +788,7 @@ impl Persistence { } // FIXME: spurious clone to sidestep closure move rules - let parent_to_children = parent_to_children.to_vec(); + let parent_to_children = parent_to_children.clone(); // Insert child shards for (parent_shard_id, children) in parent_to_children { @@ -888,7 +796,7 @@ impl Persistence { .filter(tenant_id.eq(parent_shard_id.tenant_id.to_string())) .filter(shard_number.eq(parent_shard_id.shard_number.0 as i32)) .filter(shard_count.eq(parent_shard_id.shard_count.literal() as i32)) - .load::(conn).await?; + .load::(conn)?; let parent = if parent.len() != 1 { return Err(DatabaseError::Logical(format!( "Parent shard {parent_shard_id} not found" @@ -903,13 +811,12 @@ impl Persistence { debug_assert!(shard.splitting == SplitState::Splitting); diesel::insert_into(tenant_shards) .values(shard) - .execute(conn).await?; + .execute(conn)?; } } Ok(()) }) - }) .await } @@ -921,26 +828,25 @@ impl Persistence { old_shard_count: ShardCount, ) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; - self.with_measured_conn(DatabaseOperation::CompleteShardSplit, move |conn| { - Box::pin(async move { + self.with_measured_conn( + DatabaseOperation::CompleteShardSplit, + move |conn| -> DatabaseResult<()> { // Drop parent shards diesel::delete(tenant_shards) .filter(tenant_id.eq(split_tenant_id.to_string())) .filter(shard_count.eq(old_shard_count.literal() as i32)) - .execute(conn) - .await?; + .execute(conn)?; // Clear sharding flag let updated = diesel::update(tenant_shards) .filter(tenant_id.eq(split_tenant_id.to_string())) .set((splitting.eq(0),)) - .execute(conn) - .await?; + .execute(conn)?; debug_assert!(updated > 0); Ok(()) - }) - }) + }, + ) .await } @@ -952,15 +858,15 @@ impl Persistence { new_shard_count: ShardCount, ) -> DatabaseResult { use crate::schema::tenant_shards::dsl::*; - self.with_measured_conn(DatabaseOperation::AbortShardSplit, move |conn| { - Box::pin(async move { + self.with_measured_conn( + DatabaseOperation::AbortShardSplit, + move |conn| -> DatabaseResult { // Clear the splitting state on parent shards let updated = 
diesel::update(tenant_shards) .filter(tenant_id.eq(split_tenant_id.to_string())) .filter(shard_count.ne(new_shard_count.literal() as i32)) .set((splitting.eq(0),)) - .execute(conn) - .await?; + .execute(conn)?; // Parent shards are already gone: we cannot abort. if updated == 0 { @@ -980,12 +886,11 @@ impl Persistence { diesel::delete(tenant_shards) .filter(tenant_id.eq(split_tenant_id.to_string())) .filter(shard_count.eq(new_shard_count.literal() as i32)) - .execute(conn) - .await?; + .execute(conn)?; Ok(AbortShardSplitStatus::Aborted) - }) - }) + }, + ) .await } @@ -1001,28 +906,25 @@ impl Persistence { ) -> DatabaseResult<()> { use crate::schema::metadata_health::dsl::*; - let healthy_records = healthy_records.as_slice(); - let unhealthy_records = unhealthy_records.as_slice(); - self.with_measured_conn(DatabaseOperation::UpdateMetadataHealth, move |conn| { - Box::pin(async move { + self.with_measured_conn( + DatabaseOperation::UpdateMetadataHealth, + move |conn| -> DatabaseResult<_> { diesel::insert_into(metadata_health) - .values(healthy_records) + .values(&healthy_records) .on_conflict((tenant_id, shard_number, shard_count)) .do_update() .set((healthy.eq(true), last_scrubbed_at.eq(now))) - .execute(conn) - .await?; + .execute(conn)?; diesel::insert_into(metadata_health) - .values(unhealthy_records) + .values(&unhealthy_records) .on_conflict((tenant_id, shard_number, shard_count)) .do_update() .set((healthy.eq(false), last_scrubbed_at.eq(now))) - .execute(conn) - .await?; + .execute(conn)?; Ok(()) - }) - }) + }, + ) .await } @@ -1031,13 +933,15 @@ impl Persistence { pub(crate) async fn list_metadata_health_records( &self, ) -> DatabaseResult> { - self.with_measured_conn(DatabaseOperation::ListMetadataHealth, move |conn| { - Box::pin(async { - Ok(crate::schema::metadata_health::table - .load::(conn) - .await?) - }) - }) + self.with_measured_conn( + DatabaseOperation::ListMetadataHealth, + move |conn| -> DatabaseResult<_> { + Ok( + crate::schema::metadata_health::table + .load::(conn)?, + ) + }, + ) .await } @@ -1049,15 +953,10 @@ impl Persistence { use crate::schema::metadata_health::dsl::*; self.with_measured_conn( DatabaseOperation::ListMetadataHealthUnhealthy, - move |conn| { - Box::pin(async { - DatabaseResult::Ok( - crate::schema::metadata_health::table - .filter(healthy.eq(false)) - .load::(conn) - .await?, - ) - }) + move |conn| -> DatabaseResult<_> { + Ok(crate::schema::metadata_health::table + .filter(healthy.eq(false)) + .load::(conn)?) }, ) .await @@ -1071,14 +970,15 @@ impl Persistence { ) -> DatabaseResult> { use crate::schema::metadata_health::dsl::*; - self.with_measured_conn(DatabaseOperation::ListMetadataHealthOutdated, move |conn| { - Box::pin(async move { + self.with_measured_conn( + DatabaseOperation::ListMetadataHealthOutdated, + move |conn| -> DatabaseResult<_> { let query = metadata_health.filter(last_scrubbed_at.lt(earlier)); - let res = query.load::(conn).await?; + let res = query.load::(conn)?; Ok(res) - }) - }) + }, + ) .await } @@ -1086,13 +986,12 @@ impl Persistence { /// It is an error for the table to contain more than one entry. pub(crate) async fn get_leader(&self) -> DatabaseResult> { let mut leader: Vec = self - .with_measured_conn(DatabaseOperation::GetLeader, move |conn| { - Box::pin(async move { - Ok(crate::schema::controllers::table - .load::(conn) - .await?) - }) - }) + .with_measured_conn( + DatabaseOperation::GetLeader, + move |conn| -> DatabaseResult<_> { + Ok(crate::schema::controllers::table.load::(conn)?) 
+ }, + ) .await?; if leader.len() > 1 { @@ -1115,33 +1014,26 @@ impl Persistence { use crate::schema::controllers::dsl::*; let updated = self - .with_measured_conn(DatabaseOperation::UpdateLeader, move |conn| { - let prev = prev.clone(); - let new = new.clone(); - Box::pin(async move { + .with_measured_conn( + DatabaseOperation::UpdateLeader, + move |conn| -> DatabaseResult { let updated = match &prev { - Some(prev) => { - diesel::update(controllers) - .filter(address.eq(prev.address.clone())) - .filter(started_at.eq(prev.started_at)) - .set(( - address.eq(new.address.clone()), - started_at.eq(new.started_at), - )) - .execute(conn) - .await? - } - None => { - diesel::insert_into(controllers) - .values(new.clone()) - .execute(conn) - .await? - } + Some(prev) => diesel::update(controllers) + .filter(address.eq(prev.address.clone())) + .filter(started_at.eq(prev.started_at)) + .set(( + address.eq(new.address.clone()), + started_at.eq(new.started_at), + )) + .execute(conn)?, + None => diesel::insert_into(controllers) + .values(new.clone()) + .execute(conn)?, }; Ok(updated) - }) - }) + }, + ) .await?; if updated == 0 { @@ -1156,13 +1048,12 @@ impl Persistence { /// At startup, populate the list of nodes which our shards may be placed on pub(crate) async fn list_safekeepers(&self) -> DatabaseResult> { let safekeepers: Vec = self - .with_measured_conn(DatabaseOperation::ListNodes, move |conn| { - Box::pin(async move { - Ok(crate::schema::safekeepers::table - .load::(conn) - .await?) - }) - }) + .with_measured_conn( + DatabaseOperation::ListNodes, + move |conn| -> DatabaseResult<_> { + Ok(crate::schema::safekeepers::table.load::(conn)?) + }, + ) .await?; tracing::info!("list_safekeepers: loaded {} nodes", safekeepers.len()); @@ -1175,14 +1066,11 @@ impl Persistence { id: i64, ) -> Result { use crate::schema::safekeepers::dsl::{id as id_column, safekeepers}; - self.with_conn(move |conn| { - Box::pin(async move { - Ok(safekeepers - .filter(id_column.eq(&id)) - .select(SafekeeperPersistence::as_select()) - .get_result(conn) - .await?) - }) + self.with_conn(move |conn| -> DatabaseResult { + Ok(safekeepers + .filter(id_column.eq(&id)) + .select(SafekeeperPersistence::as_select()) + .get_result(conn)?) 
}) .await } @@ -1193,30 +1081,26 @@ impl Persistence { ) -> Result<(), DatabaseError> { use crate::schema::safekeepers::dsl::*; - self.with_conn(move |conn| { - let record = record.clone(); - Box::pin(async move { - let bind = record - .as_insert_or_update() - .map_err(|e| DatabaseError::Logical(format!("{e}")))?; + self.with_conn(move |conn| -> DatabaseResult<()> { + let bind = record + .as_insert_or_update() + .map_err(|e| DatabaseError::Logical(format!("{e}")))?; - let inserted_updated = diesel::insert_into(safekeepers) - .values(&bind) - .on_conflict(id) - .do_update() - .set(&bind) - .execute(conn) - .await?; + let inserted_updated = diesel::insert_into(safekeepers) + .values(&bind) + .on_conflict(id) + .do_update() + .set(&bind) + .execute(conn)?; - if inserted_updated != 1 { - return Err(DatabaseError::Logical(format!( - "unexpected number of rows ({})", - inserted_updated - ))); - } + if inserted_updated != 1 { + return Err(DatabaseError::Logical(format!( + "unexpected number of rows ({})", + inserted_updated + ))); + } - Ok(()) - }) + Ok(()) }) .await } @@ -1228,29 +1112,26 @@ impl Persistence { ) -> Result<(), DatabaseError> { use crate::schema::safekeepers::dsl::*; - self.with_conn(move |conn| { - Box::pin(async move { - #[derive(Insertable, AsChangeset)] - #[diesel(table_name = crate::schema::safekeepers)] - struct UpdateSkSchedulingPolicy<'a> { - id: i64, - scheduling_policy: &'a str, - } - let scheduling_policy_ = String::from(scheduling_policy_); + self.with_conn(move |conn| -> DatabaseResult<()> { + #[derive(Insertable, AsChangeset)] + #[diesel(table_name = crate::schema::safekeepers)] + struct UpdateSkSchedulingPolicy<'a> { + id: i64, + scheduling_policy: &'a str, + } + let scheduling_policy_ = String::from(scheduling_policy_); - let rows_affected = diesel::update(safekeepers.filter(id.eq(id_))) - .set(scheduling_policy.eq(scheduling_policy_)) - .execute(conn) - .await?; + let rows_affected = diesel::update(safekeepers.filter(id.eq(id_))) + .set(scheduling_policy.eq(scheduling_policy_)) + .execute(conn)?; - if rows_affected != 1 { - return Err(DatabaseError::Logical(format!( - "unexpected number of rows ({rows_affected})", - ))); - } + if rows_affected != 1 { + return Err(DatabaseError::Logical(format!( + "unexpected number of rows ({rows_affected})", + ))); + } - Ok(()) - }) + Ok(()) }) .await } From bf6d5e93baa176f1f0015833beb108f9995a5142 Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Thu, 30 Jan 2025 20:32:35 +0100 Subject: [PATCH 318/583] Run tests of the contrib extensions (#10392) ## Problem We don't test the extensions, shipped with contrib ## Summary of changes The tests are now running --- compute/compute-node.Dockerfile | 1 + compute/patches/contrib_pg16.patch | 242 ++++++++++++++++++++++ compute/patches/contrib_pg17.patch | 196 ++++++++++++++++++ docker-compose/compute_wrapper/Dockerfile | 2 +- docker-compose/docker_compose_test.sh | 35 +++- docker-compose/run-tests.sh | 6 +- 6 files changed, 469 insertions(+), 13 deletions(-) create mode 100644 compute/patches/contrib_pg16.patch create mode 100644 compute/patches/contrib_pg17.patch diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index e9f6c03768..1ef449f0b0 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1345,6 +1345,7 @@ FROM neon-pg-ext-build AS neon-pg-ext-test ARG PG_VERSION RUN mkdir /ext-src +COPY --from=pg-build /postgres /postgres #COPY --from=postgis-build /postgis.tar.gz /ext-src/ 
#COPY --from=postgis-build /sfcgal/* /usr COPY --from=plv8-build /plv8.tar.gz /ext-src/ diff --git a/compute/patches/contrib_pg16.patch b/compute/patches/contrib_pg16.patch new file mode 100644 index 0000000000..71adaabe7d --- /dev/null +++ b/compute/patches/contrib_pg16.patch @@ -0,0 +1,242 @@ +diff --git a/contrib/amcheck/expected/check_heap.out b/contrib/amcheck/expected/check_heap.out +index 979e5e8..2375b45 100644 +--- a/contrib/amcheck/expected/check_heap.out ++++ b/contrib/amcheck/expected/check_heap.out +@@ -80,12 +80,9 @@ INSERT INTO heaptest (a, b) + -- same transaction. The heaptest table is smaller than the default + -- wal_skip_threshold, so a wal_level=minimal commit reads the table into + -- shared_buffers. A transaction delays that and excludes any autovacuum. +-SET allow_in_place_tablespaces = true; +-CREATE TABLESPACE regress_test_stats_tblspc LOCATION ''; + SELECT sum(reads) AS stats_bulkreads_before + FROM pg_stat_io WHERE context = 'bulkread' \gset + BEGIN; +-ALTER TABLE heaptest SET TABLESPACE regress_test_stats_tblspc; + -- Check that valid options are not rejected nor corruption reported + -- for a non-empty table + SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'none'); +@@ -118,14 +115,6 @@ SELECT pg_stat_force_next_flush(); + + (1 row) + +-SELECT sum(reads) AS stats_bulkreads_after +- FROM pg_stat_io WHERE context = 'bulkread' \gset +-SELECT :stats_bulkreads_after > :stats_bulkreads_before; +- ?column? +----------- +- t +-(1 row) +- + CREATE ROLE regress_heaptest_role; + -- verify permissions are checked (error due to function not callable) + SET ROLE regress_heaptest_role; +@@ -233,7 +222,6 @@ ERROR: cannot check relation "test_foreign_table" + DETAIL: This operation is not supported for foreign tables. + -- cleanup + DROP TABLE heaptest; +-DROP TABLESPACE regress_test_stats_tblspc; + DROP TABLE test_partition; + DROP TABLE test_partitioned; + DROP OWNED BY regress_heaptest_role; -- permissions +diff --git a/contrib/amcheck/sql/check_heap.sql b/contrib/amcheck/sql/check_heap.sql +index 1745bae..3b429c3 100644 +--- a/contrib/amcheck/sql/check_heap.sql ++++ b/contrib/amcheck/sql/check_heap.sql +@@ -40,12 +40,9 @@ INSERT INTO heaptest (a, b) + -- same transaction. The heaptest table is smaller than the default + -- wal_skip_threshold, so a wal_level=minimal commit reads the table into + -- shared_buffers. A transaction delays that and excludes any autovacuum. +-SET allow_in_place_tablespaces = true; +-CREATE TABLESPACE regress_test_stats_tblspc LOCATION ''; + SELECT sum(reads) AS stats_bulkreads_before + FROM pg_stat_io WHERE context = 'bulkread' \gset + BEGIN; +-ALTER TABLE heaptest SET TABLESPACE regress_test_stats_tblspc; + -- Check that valid options are not rejected nor corruption reported + -- for a non-empty table + SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'none'); +@@ -58,9 +55,6 @@ COMMIT; + -- ALTER TABLE ... SET TABLESPACE ... + -- causing an additional bulkread, which should be reflected in pg_stat_io. 
+ SELECT pg_stat_force_next_flush(); +-SELECT sum(reads) AS stats_bulkreads_after +- FROM pg_stat_io WHERE context = 'bulkread' \gset +-SELECT :stats_bulkreads_after > :stats_bulkreads_before; + + CREATE ROLE regress_heaptest_role; + +@@ -140,7 +134,6 @@ SELECT * FROM verify_heapam('test_foreign_table', + + -- cleanup + DROP TABLE heaptest; +-DROP TABLESPACE regress_test_stats_tblspc; + DROP TABLE test_partition; + DROP TABLE test_partitioned; + DROP OWNED BY regress_heaptest_role; -- permissions +diff --git a/contrib/citext/expected/create_index_acl.out b/contrib/citext/expected/create_index_acl.out +index 33be13a..70a406c 100644 +--- a/contrib/citext/expected/create_index_acl.out ++++ b/contrib/citext/expected/create_index_acl.out +@@ -5,9 +5,6 @@ + -- owner having as few applicable privileges as possible. (The privileges.sql + -- regress_sro_user tests look for the opposite defect; they confirm that + -- DefineIndex() uses the table owner userid where necessary.) +-SET allow_in_place_tablespaces = true; +-CREATE TABLESPACE regress_create_idx_tblspace LOCATION ''; +-RESET allow_in_place_tablespaces; + BEGIN; + CREATE ROLE regress_minimal; + CREATE SCHEMA s; +@@ -49,11 +46,9 @@ ALTER TABLE s.x OWNER TO regress_minimal; + -- Empty-table DefineIndex() + CREATE UNIQUE INDEX u0rows ON s.x USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops) +- TABLESPACE regress_create_idx_tblspace + WHERE s.index_row_if(y); + ALTER TABLE s.x ADD CONSTRAINT e0rows EXCLUDE USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=) +- USING INDEX TABLESPACE regress_create_idx_tblspace + WHERE (s.index_row_if(y)); + -- Make the table nonempty. + INSERT INTO s.x VALUES ('foo'), ('bar'); +@@ -66,11 +61,9 @@ RESET search_path; + GRANT EXECUTE ON FUNCTION s.index_this_expr TO regress_minimal; + CREATE UNIQUE INDEX u2rows ON s.x USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops) +- TABLESPACE regress_create_idx_tblspace + WHERE s.index_row_if(y); + ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=) +- USING INDEX TABLESPACE regress_create_idx_tblspace + WHERE (s.index_row_if(y)); + -- Shall not find s.coll via search_path, despite the s.const->public.setter + -- call having set search_path=s during expression planning. Suppress the +@@ -78,9 +71,7 @@ ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree + \set VERBOSITY sqlstate + ALTER TABLE s.x ADD CONSTRAINT underqualified EXCLUDE USING btree + ((s.index_this_expr(y, s.const())) COLLATE coll WITH s.=) +- USING INDEX TABLESPACE regress_create_idx_tblspace + WHERE (s.index_row_if(y)); + ERROR: 42704 + \set VERBOSITY default + ROLLBACK; +-DROP TABLESPACE regress_create_idx_tblspace; +diff --git a/contrib/citext/sql/create_index_acl.sql b/contrib/citext/sql/create_index_acl.sql +index 10b5225..ae442e1 100644 +--- a/contrib/citext/sql/create_index_acl.sql ++++ b/contrib/citext/sql/create_index_acl.sql +@@ -6,10 +6,6 @@ + -- regress_sro_user tests look for the opposite defect; they confirm that + -- DefineIndex() uses the table owner userid where necessary.) 
+ +-SET allow_in_place_tablespaces = true; +-CREATE TABLESPACE regress_create_idx_tblspace LOCATION ''; +-RESET allow_in_place_tablespaces; +- + BEGIN; + CREATE ROLE regress_minimal; + CREATE SCHEMA s; +@@ -51,11 +47,9 @@ ALTER TABLE s.x OWNER TO regress_minimal; + -- Empty-table DefineIndex() + CREATE UNIQUE INDEX u0rows ON s.x USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops) +- TABLESPACE regress_create_idx_tblspace + WHERE s.index_row_if(y); + ALTER TABLE s.x ADD CONSTRAINT e0rows EXCLUDE USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=) +- USING INDEX TABLESPACE regress_create_idx_tblspace + WHERE (s.index_row_if(y)); + -- Make the table nonempty. + INSERT INTO s.x VALUES ('foo'), ('bar'); +@@ -68,11 +62,9 @@ RESET search_path; + GRANT EXECUTE ON FUNCTION s.index_this_expr TO regress_minimal; + CREATE UNIQUE INDEX u2rows ON s.x USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops) +- TABLESPACE regress_create_idx_tblspace + WHERE s.index_row_if(y); + ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=) +- USING INDEX TABLESPACE regress_create_idx_tblspace + WHERE (s.index_row_if(y)); + -- Shall not find s.coll via search_path, despite the s.const->public.setter + -- call having set search_path=s during expression planning. Suppress the +@@ -80,9 +72,7 @@ ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree + \set VERBOSITY sqlstate + ALTER TABLE s.x ADD CONSTRAINT underqualified EXCLUDE USING btree + ((s.index_this_expr(y, s.const())) COLLATE coll WITH s.=) +- USING INDEX TABLESPACE regress_create_idx_tblspace + WHERE (s.index_row_if(y)); + \set VERBOSITY default + ROLLBACK; + +-DROP TABLESPACE regress_create_idx_tblspace; +diff --git a/contrib/file_fdw/expected/file_fdw.out b/contrib/file_fdw/expected/file_fdw.out +index 72304e0..ebe131b 100644 +--- a/contrib/file_fdw/expected/file_fdw.out ++++ b/contrib/file_fdw/expected/file_fdw.out +@@ -4,6 +4,7 @@ + -- directory paths are passed to us in environment variables + \getenv abs_srcdir PG_ABS_SRCDIR + -- Clean up in case a prior regression run failed ++SET compute_query_id TO 'off'; + SET client_min_messages TO 'warning'; + DROP ROLE IF EXISTS regress_file_fdw_superuser, regress_file_fdw_user, regress_no_priv_user; + RESET client_min_messages; +diff --git a/contrib/file_fdw/sql/file_fdw.sql b/contrib/file_fdw/sql/file_fdw.sql +index f0548e1..848a08c 100644 +--- a/contrib/file_fdw/sql/file_fdw.sql ++++ b/contrib/file_fdw/sql/file_fdw.sql +@@ -6,6 +6,7 @@ + \getenv abs_srcdir PG_ABS_SRCDIR + + -- Clean up in case a prior regression run failed ++SET compute_query_id TO 'off'; + SET client_min_messages TO 'warning'; + DROP ROLE IF EXISTS regress_file_fdw_superuser, regress_file_fdw_user, regress_no_priv_user; + RESET client_min_messages; +diff --git a/contrib/pageinspect/expected/gist.out b/contrib/pageinspect/expected/gist.out +index d1adbab..38b52ac 100644 +--- a/contrib/pageinspect/expected/gist.out ++++ b/contrib/pageinspect/expected/gist.out +@@ -10,25 +10,6 @@ BEGIN; + CREATE TABLE test_gist AS SELECT point(i,i) p, i::text t FROM + generate_series(1,1000) i; + CREATE INDEX test_gist_idx ON test_gist USING gist (p); +--- Page 0 is the root, the rest are leaf pages +-SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 0)); +- lsn | nsn | rightlink | flags +------+-----+------------+------- +- 0/1 | 0/0 | 4294967295 | {} +-(1 row) +- +-SELECT * FROM 
gist_page_opaque_info(get_raw_page('test_gist_idx', 1)); +- lsn | nsn | rightlink | flags +------+-----+------------+-------- +- 0/1 | 0/0 | 4294967295 | {leaf} +-(1 row) +- +-SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 2)); +- lsn | nsn | rightlink | flags +------+-----+-----------+-------- +- 0/1 | 0/0 | 1 | {leaf} +-(1 row) +- + COMMIT; + SELECT * FROM gist_page_items(get_raw_page('test_gist_idx', 0), 'test_gist_idx'); + itemoffset | ctid | itemlen | dead | keys +diff --git a/contrib/pageinspect/sql/gist.sql b/contrib/pageinspect/sql/gist.sql +index d263542..607992f 100644 +--- a/contrib/pageinspect/sql/gist.sql ++++ b/contrib/pageinspect/sql/gist.sql +@@ -12,11 +12,6 @@ CREATE TABLE test_gist AS SELECT point(i,i) p, i::text t FROM + generate_series(1,1000) i; + CREATE INDEX test_gist_idx ON test_gist USING gist (p); + +--- Page 0 is the root, the rest are leaf pages +-SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 0)); +-SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 1)); +-SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 2)); +- + COMMIT; + + SELECT * FROM gist_page_items(get_raw_page('test_gist_idx', 0), 'test_gist_idx'); diff --git a/compute/patches/contrib_pg17.patch b/compute/patches/contrib_pg17.patch new file mode 100644 index 0000000000..0d6c1203b0 --- /dev/null +++ b/compute/patches/contrib_pg17.patch @@ -0,0 +1,196 @@ +diff --git a/contrib/amcheck/expected/check_heap.out b/contrib/amcheck/expected/check_heap.out +index 979e5e8..2375b45 100644 +--- a/contrib/amcheck/expected/check_heap.out ++++ b/contrib/amcheck/expected/check_heap.out +@@ -80,12 +80,9 @@ INSERT INTO heaptest (a, b) + -- same transaction. The heaptest table is smaller than the default + -- wal_skip_threshold, so a wal_level=minimal commit reads the table into + -- shared_buffers. A transaction delays that and excludes any autovacuum. +-SET allow_in_place_tablespaces = true; +-CREATE TABLESPACE regress_test_stats_tblspc LOCATION ''; + SELECT sum(reads) AS stats_bulkreads_before + FROM pg_stat_io WHERE context = 'bulkread' \gset + BEGIN; +-ALTER TABLE heaptest SET TABLESPACE regress_test_stats_tblspc; + -- Check that valid options are not rejected nor corruption reported + -- for a non-empty table + SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'none'); +@@ -118,14 +115,6 @@ SELECT pg_stat_force_next_flush(); + + (1 row) + +-SELECT sum(reads) AS stats_bulkreads_after +- FROM pg_stat_io WHERE context = 'bulkread' \gset +-SELECT :stats_bulkreads_after > :stats_bulkreads_before; +- ?column? +----------- +- t +-(1 row) +- + CREATE ROLE regress_heaptest_role; + -- verify permissions are checked (error due to function not callable) + SET ROLE regress_heaptest_role; +@@ -233,7 +222,6 @@ ERROR: cannot check relation "test_foreign_table" + DETAIL: This operation is not supported for foreign tables. + -- cleanup + DROP TABLE heaptest; +-DROP TABLESPACE regress_test_stats_tblspc; + DROP TABLE test_partition; + DROP TABLE test_partitioned; + DROP OWNED BY regress_heaptest_role; -- permissions +diff --git a/contrib/amcheck/sql/check_heap.sql b/contrib/amcheck/sql/check_heap.sql +index 1745bae..3b429c3 100644 +--- a/contrib/amcheck/sql/check_heap.sql ++++ b/contrib/amcheck/sql/check_heap.sql +@@ -40,12 +40,9 @@ INSERT INTO heaptest (a, b) + -- same transaction. The heaptest table is smaller than the default + -- wal_skip_threshold, so a wal_level=minimal commit reads the table into + -- shared_buffers. 
A transaction delays that and excludes any autovacuum. +-SET allow_in_place_tablespaces = true; +-CREATE TABLESPACE regress_test_stats_tblspc LOCATION ''; + SELECT sum(reads) AS stats_bulkreads_before + FROM pg_stat_io WHERE context = 'bulkread' \gset + BEGIN; +-ALTER TABLE heaptest SET TABLESPACE regress_test_stats_tblspc; + -- Check that valid options are not rejected nor corruption reported + -- for a non-empty table + SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'none'); +@@ -58,9 +55,6 @@ COMMIT; + -- ALTER TABLE ... SET TABLESPACE ... + -- causing an additional bulkread, which should be reflected in pg_stat_io. + SELECT pg_stat_force_next_flush(); +-SELECT sum(reads) AS stats_bulkreads_after +- FROM pg_stat_io WHERE context = 'bulkread' \gset +-SELECT :stats_bulkreads_after > :stats_bulkreads_before; + + CREATE ROLE regress_heaptest_role; + +@@ -140,7 +134,6 @@ SELECT * FROM verify_heapam('test_foreign_table', + + -- cleanup + DROP TABLE heaptest; +-DROP TABLESPACE regress_test_stats_tblspc; + DROP TABLE test_partition; + DROP TABLE test_partitioned; + DROP OWNED BY regress_heaptest_role; -- permissions +diff --git a/contrib/citext/expected/create_index_acl.out b/contrib/citext/expected/create_index_acl.out +index 33be13a..70a406c 100644 +--- a/contrib/citext/expected/create_index_acl.out ++++ b/contrib/citext/expected/create_index_acl.out +@@ -5,9 +5,6 @@ + -- owner having as few applicable privileges as possible. (The privileges.sql + -- regress_sro_user tests look for the opposite defect; they confirm that + -- DefineIndex() uses the table owner userid where necessary.) +-SET allow_in_place_tablespaces = true; +-CREATE TABLESPACE regress_create_idx_tblspace LOCATION ''; +-RESET allow_in_place_tablespaces; + BEGIN; + CREATE ROLE regress_minimal; + CREATE SCHEMA s; +@@ -49,11 +46,9 @@ ALTER TABLE s.x OWNER TO regress_minimal; + -- Empty-table DefineIndex() + CREATE UNIQUE INDEX u0rows ON s.x USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops) +- TABLESPACE regress_create_idx_tblspace + WHERE s.index_row_if(y); + ALTER TABLE s.x ADD CONSTRAINT e0rows EXCLUDE USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=) +- USING INDEX TABLESPACE regress_create_idx_tblspace + WHERE (s.index_row_if(y)); + -- Make the table nonempty. + INSERT INTO s.x VALUES ('foo'), ('bar'); +@@ -66,11 +61,9 @@ RESET search_path; + GRANT EXECUTE ON FUNCTION s.index_this_expr TO regress_minimal; + CREATE UNIQUE INDEX u2rows ON s.x USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops) +- TABLESPACE regress_create_idx_tblspace + WHERE s.index_row_if(y); + ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=) +- USING INDEX TABLESPACE regress_create_idx_tblspace + WHERE (s.index_row_if(y)); + -- Shall not find s.coll via search_path, despite the s.const->public.setter + -- call having set search_path=s during expression planning. 
Suppress the +@@ -78,9 +71,7 @@ ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree + \set VERBOSITY sqlstate + ALTER TABLE s.x ADD CONSTRAINT underqualified EXCLUDE USING btree + ((s.index_this_expr(y, s.const())) COLLATE coll WITH s.=) +- USING INDEX TABLESPACE regress_create_idx_tblspace + WHERE (s.index_row_if(y)); + ERROR: 42704 + \set VERBOSITY default + ROLLBACK; +-DROP TABLESPACE regress_create_idx_tblspace; +diff --git a/contrib/citext/sql/create_index_acl.sql b/contrib/citext/sql/create_index_acl.sql +index 10b5225..ae442e1 100644 +--- a/contrib/citext/sql/create_index_acl.sql ++++ b/contrib/citext/sql/create_index_acl.sql +@@ -6,10 +6,6 @@ + -- regress_sro_user tests look for the opposite defect; they confirm that + -- DefineIndex() uses the table owner userid where necessary.) + +-SET allow_in_place_tablespaces = true; +-CREATE TABLESPACE regress_create_idx_tblspace LOCATION ''; +-RESET allow_in_place_tablespaces; +- + BEGIN; + CREATE ROLE regress_minimal; + CREATE SCHEMA s; +@@ -51,11 +47,9 @@ ALTER TABLE s.x OWNER TO regress_minimal; + -- Empty-table DefineIndex() + CREATE UNIQUE INDEX u0rows ON s.x USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops) +- TABLESPACE regress_create_idx_tblspace + WHERE s.index_row_if(y); + ALTER TABLE s.x ADD CONSTRAINT e0rows EXCLUDE USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=) +- USING INDEX TABLESPACE regress_create_idx_tblspace + WHERE (s.index_row_if(y)); + -- Make the table nonempty. + INSERT INTO s.x VALUES ('foo'), ('bar'); +@@ -68,11 +62,9 @@ RESET search_path; + GRANT EXECUTE ON FUNCTION s.index_this_expr TO regress_minimal; + CREATE UNIQUE INDEX u2rows ON s.x USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops) +- TABLESPACE regress_create_idx_tblspace + WHERE s.index_row_if(y); + ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=) +- USING INDEX TABLESPACE regress_create_idx_tblspace + WHERE (s.index_row_if(y)); + -- Shall not find s.coll via search_path, despite the s.const->public.setter + -- call having set search_path=s during expression planning. 
Suppress the +@@ -80,9 +72,7 @@ ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree + \set VERBOSITY sqlstate + ALTER TABLE s.x ADD CONSTRAINT underqualified EXCLUDE USING btree + ((s.index_this_expr(y, s.const())) COLLATE coll WITH s.=) +- USING INDEX TABLESPACE regress_create_idx_tblspace + WHERE (s.index_row_if(y)); + \set VERBOSITY default + ROLLBACK; + +-DROP TABLESPACE regress_create_idx_tblspace; +diff --git a/contrib/file_fdw/expected/file_fdw.out b/contrib/file_fdw/expected/file_fdw.out +index 86c148a..81bdb2c 100644 +--- a/contrib/file_fdw/expected/file_fdw.out ++++ b/contrib/file_fdw/expected/file_fdw.out +@@ -4,6 +4,7 @@ + -- directory paths are passed to us in environment variables + \getenv abs_srcdir PG_ABS_SRCDIR + -- Clean up in case a prior regression run failed ++SET compute_query_id TO 'off'; + SET client_min_messages TO 'warning'; + DROP ROLE IF EXISTS regress_file_fdw_superuser, regress_file_fdw_user, regress_no_priv_user; + RESET client_min_messages; +diff --git a/contrib/file_fdw/sql/file_fdw.sql b/contrib/file_fdw/sql/file_fdw.sql +index f0548e1..848a08c 100644 +--- a/contrib/file_fdw/sql/file_fdw.sql ++++ b/contrib/file_fdw/sql/file_fdw.sql +@@ -6,6 +6,7 @@ + \getenv abs_srcdir PG_ABS_SRCDIR + + -- Clean up in case a prior regression run failed ++SET compute_query_id TO 'off'; + SET client_min_messages TO 'warning'; + DROP ROLE IF EXISTS regress_file_fdw_superuser, regress_file_fdw_user, regress_no_priv_user; + RESET client_min_messages; diff --git a/docker-compose/compute_wrapper/Dockerfile b/docker-compose/compute_wrapper/Dockerfile index 61f44681da..b5f0f47ceb 100644 --- a/docker-compose/compute_wrapper/Dockerfile +++ b/docker-compose/compute_wrapper/Dockerfile @@ -13,6 +13,6 @@ RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \ jq \ netcat-openbsd #This is required for the pg_hintplan test -RUN mkdir -p /ext-src/pg_hint_plan-src && chown postgres /ext-src/pg_hint_plan-src +RUN mkdir -p /ext-src/pg_hint_plan-src /postgres/contrib/file_fdw && chown postgres /ext-src/pg_hint_plan-src /postgres/contrib/file_fdw USER postgres diff --git a/docker-compose/docker_compose_test.sh b/docker-compose/docker_compose_test.sh index a05d6c043d..e0c537edf3 100755 --- a/docker-compose/docker_compose_test.sh +++ b/docker-compose/docker_compose_test.sh @@ -61,17 +61,32 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do docker cp $TEST_CONTAINER_NAME:/ext-src/pg_hint_plan-src/data $TMPDIR/data docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/ext-src/pg_hint_plan-src/ rm -rf $TMPDIR + # The following block does the same for the contrib/file_fdw test + TMPDIR=$(mktemp -d) + docker cp $TEST_CONTAINER_NAME:/postgres/contrib/file_fdw/data $TMPDIR/data + docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/postgres/contrib/file_fdw/data + rm -rf $TMPDIR + # Apply patches + cat ../compute/patches/contrib_pg${pg_version}.patch | docker exec -i $TEST_CONTAINER_NAME bash -c "(cd /postgres && patch -p1)" # We are running tests now - if ! 
docker exec -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,kq_imcx-src,wal2json_2_5-src \ - $TEST_CONTAINER_NAME /run-tests.sh | tee testout.txt - then - FAILED=$(tail -1 testout.txt) - for d in $FAILED - do - mkdir $d - docker cp $TEST_CONTAINER_NAME:/ext-src/$d/regression.diffs $d || true - docker cp $TEST_CONTAINER_NAME:/ext-src/$d/regression.out $d || true - cat $d/regression.out $d/regression.diffs || true + rm -f testout.txt testout_contrib.txt + docker exec -e USE_PGXS=1 -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,kq_imcx-src,wal2json_2_5-src \ + $TEST_CONTAINER_NAME /run-tests.sh /ext-src | tee testout.txt && EXT_SUCCESS=1 || EXT_SUCCESS=0 + docker exec -e SKIP=start-scripts,postgres_fdw,ltree_plpython,jsonb_plpython,jsonb_plperl,hstore_plpython,hstore_plperl,dblink,bool_plperl \ + $TEST_CONTAINER_NAME /run-tests.sh /postgres/contrib | tee testout_contrib.txt && CONTRIB_SUCCESS=1 || CONTRIB_SUCCESS=0 + if [ $EXT_SUCCESS -eq 0 ] || [ $CONTRIB_SUCCESS -eq 0 ]; then + CONTRIB_FAILED= + FAILED= + [ $EXT_SUCCESS -eq 0 ] && FAILED=$(tail -1 testout.txt | awk '{for(i=1;i<=NF;i++){print "/ext-src/"$i;}}') + [ $CONTRIB_SUCCESS -eq 0 ] && CONTRIB_FAILED=$(tail -1 testout_contrib.txt | awk '{for(i=0;i<=NF;i++){print "/postgres/contrib/"$i;}}') + for d in $FAILED $CONTRIB_FAILED; do + dn="$(basename $d)" + rm -rf $dn + mkdir $dn + docker cp $TEST_CONTAINER_NAME:$d/regression.diffs $dn || [ $? -eq 1 ] + docker cp $TEST_CONTAINER_NAME:$d/regression.out $dn || [ $? -eq 1 ] + cat $dn/regression.out $dn/regression.diffs || true + rm -rf $dn done rm -rf $FAILED exit 1 diff --git a/docker-compose/run-tests.sh b/docker-compose/run-tests.sh index 1e794a42a1..72ae61b032 100644 --- a/docker-compose/run-tests.sh +++ b/docker-compose/run-tests.sh @@ -1,9 +1,11 @@ #!/bin/bash set -x -cd /ext-src || exit 2 +extdir=${1} + +cd "${extdir}" || exit 2 FAILED= -LIST=$( (echo -e "${SKIP//","/"\n"}"; ls -d -- *-src) | sort | uniq -u) +LIST=$( (echo -e "${SKIP//","/"\n"}"; ls) | sort | uniq -u) for d in ${LIST}; do [ -d "${d}" ] || continue if ! psql -w -c "select 1" >/dev/null; then From 6da7c556c2f3725d51ea2c626491a8011be97f04 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 30 Jan 2025 20:33:22 +0000 Subject: [PATCH 319/583] pageserver: fix race cleaning up timeline files when shut down during bootstrap (#10532) ## Problem Timeline bootstrap starts a flush loop, but doesn't reliably shut down the timeline (incl. waiting for flush loop to exit) before destroying UninitializedTimeline, and that destructor tries to clean up local storage. If local storage is still being written to, then this is unsound. Currently the symptom is that we see a "Directory not empty" error log, e.g. 
https://neon-github-public-dev.s3.amazonaws.com/reports/main/12966756686/index.html#testresult/5523f7d15f46f7f7/retries ## Summary of changes - Move fallible IO part of bootstrap into a function (notably, this is fallible in the case of the tenant being shut down while creation is happening) - When that function returns an error, call shutdown() on the timeline --- pageserver/src/http/routes.rs | 8 +- pageserver/src/tenant.rs | 68 ++++++------ pageserver/src/tenant/timeline/uninit.rs | 129 +++++++++++++++++------ 3 files changed, 131 insertions(+), 74 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 0f3e9fdab6..2548a11b2e 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -3169,12 +3169,16 @@ async fn put_tenant_timeline_import_basebackup( let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); - let span = info_span!("import_basebackup", tenant_id=%tenant_id, timeline_id=%timeline_id, base_lsn=%base_lsn, end_lsn=%end_lsn, pg_version=%pg_version); + let tenant_shard_id = TenantShardId::unsharded(tenant_id); + + let span = info_span!("import_basebackup", + tenant_id=%tenant_id, timeline_id=%timeline_id, shard_id=%tenant_shard_id.shard_slug(), + base_lsn=%base_lsn, end_lsn=%end_lsn, pg_version=%pg_version); async move { let state = get_state(&request); let tenant = state .tenant_manager - .get_attached_tenant_shard(TenantShardId::unsharded(tenant_id))?; + .get_attached_tenant_shard(tenant_shard_id)?; let broker_client = state.broker_client.clone(); diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 085f76c05d..657cc78e2c 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -2426,7 +2426,7 @@ impl Tenant { // Make sure the freeze_and_flush reaches remote storage. tline.remote_client.wait_completion().await.unwrap(); - let tl = uninit_tl.finish_creation()?; + let tl = uninit_tl.finish_creation().await?; // The non-test code would call tl.activate() here. tl.set_state(TimelineState::Active); Ok(tl) @@ -4702,7 +4702,7 @@ impl Tenant { ) .await?; - let new_timeline = uninitialized_timeline.finish_creation()?; + let new_timeline = uninitialized_timeline.finish_creation().await?; // Root timeline gets its layers during creation and uploads them along with the metadata. // A branch timeline though, when created, can get no writes for some time, hence won't get any layers created. @@ -4892,10 +4892,11 @@ impl Tenant { } // this new directory is very temporary, set to remove it immediately after bootstrap, we don't need it + let pgdata_path_deferred = pgdata_path.clone(); scopeguard::defer! 
{ - if let Err(e) = fs::remove_dir_all(&pgdata_path) { + if let Err(e) = fs::remove_dir_all(&pgdata_path_deferred) { // this is unlikely, but we will remove the directory on pageserver restart or another bootstrap call - error!("Failed to remove temporary initdb directory '{pgdata_path}': {e}"); + error!("Failed to remove temporary initdb directory '{pgdata_path_deferred}': {e}"); } } if let Some(existing_initdb_timeline_id) = load_existing_initdb { @@ -4962,7 +4963,7 @@ impl Tenant { pgdata_lsn, pg_version, ); - let raw_timeline = self + let mut raw_timeline = self .prepare_new_timeline( timeline_id, &new_metadata, @@ -4973,42 +4974,33 @@ impl Tenant { .await?; let tenant_shard_id = raw_timeline.owning_tenant.tenant_shard_id; - let unfinished_timeline = raw_timeline.raw_timeline()?; - - // Flush the new layer files to disk, before we make the timeline as available to - // the outside world. - // - // Flush loop needs to be spawned in order to be able to flush. - unfinished_timeline.maybe_spawn_flush_loop(); - - import_datadir::import_timeline_from_postgres_datadir( - unfinished_timeline, - &pgdata_path, - pgdata_lsn, - ctx, - ) - .await - .with_context(|| { - format!("Failed to import pgdatadir for timeline {tenant_shard_id}/{timeline_id}") - })?; - - fail::fail_point!("before-checkpoint-new-timeline", |_| { - Err(CreateTimelineError::Other(anyhow::anyhow!( - "failpoint before-checkpoint-new-timeline" - ))) - }); - - unfinished_timeline - .freeze_and_flush() - .await - .with_context(|| { - format!( - "Failed to flush after pgdatadir import for timeline {tenant_shard_id}/{timeline_id}" + raw_timeline + .write(|unfinished_timeline| async move { + import_datadir::import_timeline_from_postgres_datadir( + &unfinished_timeline, + &pgdata_path, + pgdata_lsn, + ctx, ) - })?; + .await + .with_context(|| { + format!( + "Failed to import pgdatadir for timeline {tenant_shard_id}/{timeline_id}" + ) + })?; + + fail::fail_point!("before-checkpoint-new-timeline", |_| { + Err(CreateTimelineError::Other(anyhow::anyhow!( + "failpoint before-checkpoint-new-timeline" + ))) + }); + + Ok(()) + }) + .await?; // All done! - let timeline = raw_timeline.finish_creation()?; + let timeline = raw_timeline.finish_creation().await?; // Callers are responsible to wait for uploads to complete and for activating the timeline. 
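
Editor's note (not part of the patch): the `write()` helper added below in `uninit.rs` follows a common "setup, run fallible async work, guaranteed teardown" shape. The sketch here is a minimal, self-contained illustration of that pattern only; the type and method names are stand-ins, not the real pageserver types, and tokio is assumed as the runtime (as elsewhere in this repo).

```rust
// Illustrative sketch of the closure-wrapper pattern: once setup has started
// background work, every error path must tear it down before returning, so a
// later Drop never races with in-flight I/O. Names are hypothetical.
use std::future::Future;

struct CreationGuard {
    needs_shutdown: bool,
}

impl CreationGuard {
    async fn write<F, Fut>(&mut self, f: F) -> Result<(), String>
    where
        F: FnOnce() -> Fut,
        Fut: Future<Output = Result<(), String>>,
    {
        // Setup: background tasks are now running, so teardown becomes mandatory.
        self.needs_shutdown = true;

        // Run the caller's fallible work; tear down before surfacing any error.
        if let Err(e) = f().await {
            self.abort().await;
            return Err(e);
        }

        // The final flush is fallible too, so it gets the same treatment.
        if let Err(e) = self.flush().await {
            self.abort().await;
            return Err(e);
        }
        Ok(())
    }

    async fn abort(&self) {
        // Stand-in for shutting down the spawned background work.
    }

    async fn flush(&self) -> Result<(), String> {
        // Stand-in for flushing buffered state to storage.
        Ok(())
    }
}

#[tokio::main]
async fn main() -> Result<(), String> {
    let mut guard = CreationGuard { needs_shutdown: false };
    // On error inside the closure, abort() runs before the error propagates.
    guard.write(|| async { Ok(()) }).await
}
```
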
diff --git a/pageserver/src/tenant/timeline/uninit.rs b/pageserver/src/tenant/timeline/uninit.rs index 80a09b4840..3074463384 100644 --- a/pageserver/src/tenant/timeline/uninit.rs +++ b/pageserver/src/tenant/timeline/uninit.rs @@ -1,4 +1,4 @@ -use std::{collections::hash_map::Entry, fs, sync::Arc}; +use std::{collections::hash_map::Entry, fs, future::Future, sync::Arc}; use anyhow::Context; use camino::Utf8PathBuf; @@ -8,7 +8,8 @@ use utils::{fs_ext, id::TimelineId, lsn::Lsn, sync::gate::GateGuard}; use crate::{ context::RequestContext, import_datadir, - tenant::{CreateTimelineIdempotency, Tenant, TimelineOrOffloaded}, + span::debug_assert_current_span_has_tenant_and_timeline_id, + tenant::{CreateTimelineError, CreateTimelineIdempotency, Tenant, TimelineOrOffloaded}, }; use super::Timeline; @@ -24,6 +25,9 @@ pub struct UninitializedTimeline<'t> { pub(crate) owning_tenant: &'t Tenant, timeline_id: TimelineId, raw_timeline: Option<(Arc, TimelineCreateGuard)>, + /// Whether we spawned the inner Timeline's tasks such that we must later shut it down + /// if aborting the timeline creation + needs_shutdown: bool, } impl<'t> UninitializedTimeline<'t> { @@ -36,6 +40,50 @@ impl<'t> UninitializedTimeline<'t> { owning_tenant, timeline_id, raw_timeline, + needs_shutdown: false, + } + } + + /// When writing data to this timeline during creation, use this wrapper: it will take care of + /// setup of Timeline tasks required for I/O (flush loop) and making sure they are torn down + /// later. + pub(crate) async fn write(&mut self, f: F) -> anyhow::Result<()> + where + F: FnOnce(Arc) -> Fut, + Fut: Future>, + { + debug_assert_current_span_has_tenant_and_timeline_id(); + + // Remember that we did I/O (spawned the flush loop), so that we can check we shut it down on drop + self.needs_shutdown = true; + + let timeline = self.raw_timeline()?; + + // Spawn flush loop so that the Timeline is ready to accept writes + timeline.maybe_spawn_flush_loop(); + + // Invoke the provided function, which will write some data into the new timeline + if let Err(e) = f(timeline.clone()).await { + self.abort().await; + return Err(e.into()); + } + + // Flush the underlying timeline's ephemeral layers to disk + if let Err(e) = timeline + .freeze_and_flush() + .await + .context("Failed to flush after timeline creation writes") + { + self.abort().await; + return Err(e); + } + + Ok(()) + } + + pub(crate) async fn abort(&self) { + if let Some((raw_timeline, _)) = self.raw_timeline.as_ref() { + raw_timeline.shutdown(super::ShutdownMode::Hard).await; } } @@ -44,11 +92,13 @@ impl<'t> UninitializedTimeline<'t> { /// This function launches the flush loop if not already done. /// /// The caller is responsible for activating the timeline (function `.activate()`). 
- pub(crate) fn finish_creation(mut self) -> anyhow::Result> { + pub(crate) async fn finish_creation(mut self) -> anyhow::Result> { let timeline_id = self.timeline_id; let tenant_shard_id = self.owning_tenant.tenant_shard_id; if self.raw_timeline.is_none() { + self.abort().await; + return Err(anyhow::anyhow!( "No timeline for initialization found for {tenant_shard_id}/{timeline_id}" )); @@ -62,16 +112,25 @@ impl<'t> UninitializedTimeline<'t> { .0 .get_disk_consistent_lsn(); - anyhow::ensure!( - new_disk_consistent_lsn.is_valid(), - "new timeline {tenant_shard_id}/{timeline_id} has invalid disk_consistent_lsn" - ); + if !new_disk_consistent_lsn.is_valid() { + self.abort().await; + + return Err(anyhow::anyhow!( + "new timeline {tenant_shard_id}/{timeline_id} has invalid disk_consistent_lsn" + )); + } let mut timelines = self.owning_tenant.timelines.lock().unwrap(); match timelines.entry(timeline_id) { - Entry::Occupied(_) => anyhow::bail!( + Entry::Occupied(_) => { + // Unexpected, bug in the caller. Tenant is responsible for preventing concurrent creation of the same timeline. + // + // We do not call Self::abort here. Because we don't cleanly shut down our Timeline, [`Self::drop`] should + // skip trying to delete the timeline directory too. + anyhow::bail!( "Found freshly initialized timeline {tenant_shard_id}/{timeline_id} in the tenant map" - ), + ) + } Entry::Vacant(v) => { // after taking here should be no fallible operations, because the drop guard will not // cleanup after and would block for example the tenant deletion @@ -93,36 +152,31 @@ impl<'t> UninitializedTimeline<'t> { /// Prepares timeline data by loading it from the basebackup archive. pub(crate) async fn import_basebackup_from_tar( - self, + mut self, tenant: Arc, copyin_read: &mut (impl tokio::io::AsyncRead + Send + Sync + Unpin), base_lsn: Lsn, broker_client: storage_broker::BrokerClientChannel, ctx: &RequestContext, ) -> anyhow::Result> { - let raw_timeline = self.raw_timeline()?; + self.write(|raw_timeline| async move { + import_datadir::import_basebackup_from_tar(&raw_timeline, copyin_read, base_lsn, ctx) + .await + .context("Failed to import basebackup") + .map_err(CreateTimelineError::Other)?; - import_datadir::import_basebackup_from_tar(raw_timeline, copyin_read, base_lsn, ctx) - .await - .context("Failed to import basebackup")?; + fail::fail_point!("before-checkpoint-new-timeline", |_| { + Err(CreateTimelineError::Other(anyhow::anyhow!( + "failpoint before-checkpoint-new-timeline" + ))) + }); - // Flush the new layer files to disk, before we make the timeline as available to - // the outside world. - // - // Flush loop needs to be spawned in order to be able to flush. - raw_timeline.maybe_spawn_flush_loop(); - - fail::fail_point!("before-checkpoint-new-timeline", |_| { - anyhow::bail!("failpoint before-checkpoint-new-timeline"); - }); - - raw_timeline - .freeze_and_flush() - .await - .context("Failed to flush after basebackup import")?; + Ok(()) + }) + .await?; // All the data has been imported. 
Insert the Timeline into the tenant's timelines map - let tl = self.finish_creation()?; + let tl = self.finish_creation().await?; tl.activate(tenant, broker_client, None, ctx); Ok(tl) } @@ -143,12 +197,19 @@ impl<'t> UninitializedTimeline<'t> { impl Drop for UninitializedTimeline<'_> { fn drop(&mut self) { - if let Some((_, create_guard)) = self.raw_timeline.take() { + if let Some((timeline, create_guard)) = self.raw_timeline.take() { let _entered = info_span!("drop_uninitialized_timeline", tenant_id = %self.owning_tenant.tenant_shard_id.tenant_id, shard_id = %self.owning_tenant.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id).entered(); - // This is unusual, but can happen harmlessly if the pageserver is stopped while - // creating a timeline. - info!("Timeline got dropped without initializing, cleaning its files"); - cleanup_timeline_directory(create_guard); + if self.needs_shutdown && !timeline.gate.close_complete() { + // This should not happen: caller should call [`Self::abort`] on failures + tracing::warn!( + "Timeline not shut down after initialization failure, cannot clean up files" + ); + } else { + // This is unusual, but can happen harmlessly if the pageserver is stopped while + // creating a timeline. + info!("Timeline got dropped without initializing, cleaning its files"); + cleanup_timeline_directory(create_guard); + } } } } From d18f6198e15d5201db3c3fa5e3d475ad9de334fb Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 30 Jan 2025 22:17:07 +0000 Subject: [PATCH 320/583] storcon: fix AZ-driven tenant selection in chaos (#10443) ## Problem In https://github.com/neondatabase/neon/pull/10438 I had got the function for picking tenants backwards, and it was preferring to move things _away_ from their preferred AZ. ## Summary of changes - Fix condition in `is_attached_outside_preferred_az` --- storage_controller/src/tenant_shard.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index cbc2696b26..d344e27e31 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -1806,7 +1806,7 @@ impl TenantShard { .get(&node_id) .expect("referenced node exists") .get_availability_zone_id(), - ) == self.intent.preferred_az_id.as_ref() + ) != self.intent.preferred_az_id.as_ref() }) .unwrap_or(false) } From e1273acdb1e018352d97ea11bf65adf90db69c78 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 30 Jan 2025 22:43:36 +0000 Subject: [PATCH 321/583] pageserver: handle shutdown cleanly in layer download API (#10598) ## Problem This API is used in tests and occasionally for support. It cast all errors to 500. 
That can cause a failure on the log checks: https://neon-github-public-dev.s3.amazonaws.com/reports/main/13056992876/index.html#suites/ad9c266207b45eafe19909d1020dd987/683a7031d877f3db/ ## Summary of changes - Avoid using generic anyhow::Error for layer downloads - Map shutdown cases to 503 in http route --- pageserver/src/http/routes.rs | 8 +++++++- pageserver/src/tenant/storage_layer/layer.rs | 2 +- pageserver/src/tenant/timeline.rs | 12 ++++++++++-- test_runner/regress/test_ondemand_download.py | 19 +++++++------------ 4 files changed, 25 insertions(+), 16 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 2548a11b2e..eb9cb4da0c 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1472,7 +1472,13 @@ async fn layer_download_handler( let downloaded = timeline .download_layer(&layer_name) .await - .map_err(ApiError::InternalServerError)?; + .map_err(|e| match e { + tenant::storage_layer::layer::DownloadError::TimelineShutdown + | tenant::storage_layer::layer::DownloadError::DownloadCancelled => { + ApiError::ShuttingDown + } + other => ApiError::InternalServerError(other.into()), + })?; match downloaded { Some(true) => json_response(StatusCode::OK, ()), diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 99e0ff1aa5..92313afba7 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -340,7 +340,7 @@ impl Layer { /// Download the layer if evicted. /// /// Will not error when the layer is already downloaded. - pub(crate) async fn download(&self) -> anyhow::Result<()> { + pub(crate) async fn download(&self) -> Result<(), DownloadError> { self.0.get_or_maybe_download(true, None).await?; Ok(()) } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index f387c81c29..827601fa8b 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2028,8 +2028,16 @@ impl Timeline { pub(crate) async fn download_layer( &self, layer_file_name: &LayerName, - ) -> anyhow::Result> { - let Some(layer) = self.find_layer(layer_file_name).await? else { + ) -> Result, super::storage_layer::layer::DownloadError> { + let Some(layer) = self + .find_layer(layer_file_name) + .await + .map_err(|e| match e { + layer_manager::Shutdown => { + super::storage_layer::layer::DownloadError::TimelineShutdown + } + })? 
+ else { return Ok(None); }; diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index 028d1c2e49..c344f30f4d 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -27,6 +27,7 @@ from fixtures.pageserver.utils import ( ) from fixtures.remote_storage import RemoteStorageKind, S3Storage, s3_storage from fixtures.utils import query_scalar, wait_until +from urllib3 import Retry if TYPE_CHECKING: from typing import Any @@ -676,16 +677,14 @@ def test_layer_download_cancelled_by_config_location(neon_env_builder: NeonEnvBu "compaction_period": "0s", } ) - client = env.pageserver.http_client() + + # Disable retries, because we'll hit code paths that can give us + # 503 and want to see that directly + client = env.pageserver.http_client(retries=Retry(status=0)) + failpoint = "before-downloading-layer-stream-pausable" client.configure_failpoints((failpoint, "pause")) - env.pageserver.allowed_errors.extend( - [ - ".*downloading failed, possibly for shutdown.*", - ] - ) - info = client.layer_map_info(env.initial_tenant, env.initial_timeline) assert len(info.delta_layers()) == 1 @@ -720,13 +719,9 @@ def test_layer_download_cancelled_by_config_location(neon_env_builder: NeonEnvBu client.configure_failpoints((failpoint, "off")) - with pytest.raises( - PageserverApiException, match="downloading failed, possibly for shutdown" - ): + with pytest.raises(PageserverApiException, match="Shutting down"): download.result() - env.pageserver.assert_log_contains(".*downloading failed, possibly for shutdown.*") - detach.result() client.configure_failpoints((failpoint, "pause")) From 5e0c40709f8a18477454beb0fe7cb6d9d522dbf7 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 30 Jan 2025 22:45:43 +0000 Subject: [PATCH 322/583] storcon: refine chaos selection logic (#10600) ## Problem In https://github.com/neondatabase/neon/pull/10438 it was pointed out that it would be good to avoid picking tenants in ID order, and also to avoid situations where we might double-select the same tenant. There was an initial swing at this in https://github.com/neondatabase/neon/pull/10443, where Chi suggested a simpler approach which is done in this PR ## Summary of changes - Split total set of tenants into in and out of home AZ - Consume out of home AZ first, and if necessary shuffle + consume from out of home AZ --- .../src/service/chaos_injector.rs | 37 ++++++++++++------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/storage_controller/src/service/chaos_injector.rs b/storage_controller/src/service/chaos_injector.rs index 98034421d6..91d7183fde 100644 --- a/storage_controller/src/service/chaos_injector.rs +++ b/storage_controller/src/service/chaos_injector.rs @@ -96,29 +96,38 @@ impl ChaosInjector { let batch_size = 128; let mut inner = self.service.inner.write().unwrap(); let (nodes, tenants, scheduler) = inner.parts_mut(); - let tenant_ids = tenants.keys().cloned().collect::>(); // Prefer to migrate tenants that are currently outside their home AZ. This avoids the chaos injector // continuously pushing tenants outside their home AZ: instead, we'll tend to cycle between picking some // random tenants to move, and then on next chaos iteration moving them back, then picking some new // random tenants on the next iteration. 
+ let (out_of_home_az, in_home_az): (Vec<_>, Vec<_>) = tenants + .values() + .map(|shard| { + ( + shard.tenant_shard_id, + shard.is_attached_outside_preferred_az(nodes), + ) + }) + .partition(|(_id, is_outside)| *is_outside); + + let mut out_of_home_az: Vec<_> = out_of_home_az.into_iter().map(|(id, _)| id).collect(); + let mut in_home_az: Vec<_> = in_home_az.into_iter().map(|(id, _)| id).collect(); + let mut victims = Vec::with_capacity(batch_size); - for shard in tenants.values() { - if shard.is_attached_outside_preferred_az(nodes) { - victims.push(shard.tenant_shard_id); - } + if out_of_home_az.len() >= batch_size { + tracing::info!("Injecting chaos: found {batch_size} shards to migrate back to home AZ (total {} out of home AZ)", out_of_home_az.len()); - if victims.len() >= batch_size { - break; - } + out_of_home_az.shuffle(&mut thread_rng()); + victims.extend(out_of_home_az.into_iter().take(batch_size)); + } else { + tracing::info!("Injecting chaos: found {} shards to migrate back to home AZ, picking {} random shards to migrate", out_of_home_az.len(), std::cmp::min(batch_size - out_of_home_az.len(), in_home_az.len())); + + victims.extend(out_of_home_az); + in_home_az.shuffle(&mut thread_rng()); + victims.extend(in_home_az.into_iter().take(batch_size - victims.len())); } - let choose_random = batch_size.saturating_sub(victims.len()); - tracing::info!("Injecting chaos: found {} shards to migrate back to home AZ, picking {choose_random} random shards to migrate", victims.len()); - - let random_victims = tenant_ids.choose_multiple(&mut thread_rng(), choose_random); - victims.extend(random_victims); - for victim in victims { self.maybe_migrate_to_secondary(victim, nodes, tenants, scheduler); } From df87a55609d2c9824c7445ef59a1391157ff6b55 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 30 Jan 2025 23:55:17 +0100 Subject: [PATCH 323/583] tests: Speed up test_pgdata_import_smoke on Postgres v17 (#10567) The test runs this query: select count(*), sum(data::bigint)::bigint from t to validate the test results between each part of the test. It performs a simple sequential scan and aggregation, but was taking an order of magnitude longer on v17 than on previous Postgres versions, which sometimes caused the test to time out. There were two reasons for that: 1. On v17, the planner estimates the table to have only only one row. In reality it has 305790 rows, and older versions estimated it at 611580, which is not too bad given that the table has not been analyzed so the planner bases that estimate just on the number of pages and the widths of the datatypes. The new estimate of 1 row is much worse, and it leads the planner to disregard parallel plans, whereas on older versions you got a Parallel Seq Scan. I tracked this down to upstream commit 29cf61ade3, "Consider fillfactor when estimating relation size". With that commit, table_block_relation_estimate_size() function calculates that each page accommodates less than 1 row when the fillfactor is taken into account, which rounds down to 0. In reality, the executor will always place at least one row on a page regardless of fillfactor, but the new estimation formula doesn't take that into account. I reported this to pgsql-hackers (https://www.postgresql.org/message-id/2bf9d973-7789-4937-a7ca-0af9fb49c71e%40iki.fi), we don't need to do anything more about it in neon. It's OK to not use parallel scans here; once issue 2. below is addressed, the queries are fast enough without parallelism.. 2. 
On v17, prefetching was not happening for the sequential scan. That's because starting with v17, buffers are reserved in the shared buffer cache before prefetching is initiated, and we use a tiny shared_buffers=1MB setting in the tests. The prefetching is effectively disabled with such a small shared_buffers setting, to protect the system from completely starving out of buffers. To address that, simply bump up shared_buffers in the test. This patch addresses the second issue, which is enough to fix the problem. --- test_runner/regress/test_import_pgdata.py | 24 +++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/test_runner/regress/test_import_pgdata.py b/test_runner/regress/test_import_pgdata.py index 086d4b67c9..6b35f3c6bb 100644 --- a/test_runner/regress/test_import_pgdata.py +++ b/test_runner/regress/test_import_pgdata.py @@ -70,6 +70,12 @@ def test_pgdata_import_smoke( env.pageserver.stop() env.pageserver.start() + # By default our tests run with a tiny shared_buffers=1MB setting. That + # doesn't allow any prefetching on v17 and above, where the new streaming + # read machinery keeps buffers pinned while prefetching them. Use a higher + # setting to enable prefetching and speed up the tests + ep_config = ["shared_buffers=64MB"] + # # Put data in vanilla pg # @@ -246,7 +252,11 @@ def test_pgdata_import_smoke( # ro_endpoint = env.endpoints.create_start( - branch_name=import_branch_name, endpoint_id="ro", tenant_id=tenant_id, lsn=last_record_lsn + branch_name=import_branch_name, + endpoint_id="ro", + tenant_id=tenant_id, + lsn=last_record_lsn, + config_lines=ep_config, ) validate_vanilla_equivalence(ro_endpoint) @@ -276,7 +286,10 @@ def test_pgdata_import_smoke( # validate that we can write # rw_endpoint = env.endpoints.create_start( - branch_name=import_branch_name, endpoint_id="rw", tenant_id=tenant_id + branch_name=import_branch_name, + endpoint_id="rw", + tenant_id=tenant_id, + config_lines=ep_config, ) rw_endpoint.safe_psql("create table othertable(values text)") rw_lsn = Lsn(rw_endpoint.safe_psql_scalar("select pg_current_wal_flush_lsn()")) @@ -296,7 +309,7 @@ def test_pgdata_import_smoke( ancestor_start_lsn=rw_lsn, ) br_tip_endpoint = env.endpoints.create_start( - branch_name="br-tip", endpoint_id="br-tip-ro", tenant_id=tenant_id + branch_name="br-tip", endpoint_id="br-tip-ro", tenant_id=tenant_id, config_lines=ep_config ) validate_vanilla_equivalence(br_tip_endpoint) br_tip_endpoint.safe_psql("select * from othertable") @@ -309,7 +322,10 @@ def test_pgdata_import_smoke( ancestor_start_lsn=initdb_lsn, ) br_initdb_endpoint = env.endpoints.create_start( - branch_name="br-initdb", endpoint_id="br-initdb-ro", tenant_id=tenant_id + branch_name="br-initdb", + endpoint_id="br-initdb-ro", + tenant_id=tenant_id, + config_lines=ep_config, ) validate_vanilla_equivalence(br_initdb_endpoint) with pytest.raises(psycopg2.errors.UndefinedTable): From 423e2396174cc8e0c97661a6ceb58ad290a17982 Mon Sep 17 00:00:00 2001 From: Anna Stepanyan Date: Fri, 31 Jan 2025 07:29:06 +0100 Subject: [PATCH 324/583] [infra/notes] impr: add issue types to issue templates (#10018) refs #0000 --------- Co-authored-by: Fedor Dikarev --- .github/ISSUE_TEMPLATE/bug-template.md | 1 + .github/ISSUE_TEMPLATE/epic-template.md | 1 + 2 files changed, 2 insertions(+) diff --git a/.github/ISSUE_TEMPLATE/bug-template.md b/.github/ISSUE_TEMPLATE/bug-template.md index d33eec3cde..234d9b5a37 100644 --- a/.github/ISSUE_TEMPLATE/bug-template.md +++ b/.github/ISSUE_TEMPLATE/bug-template.md @@ -3,6 
+3,7 @@ name: Bug Template about: Used for describing bugs title: '' labels: t/bug +type: Bug assignees: '' --- diff --git a/.github/ISSUE_TEMPLATE/epic-template.md b/.github/ISSUE_TEMPLATE/epic-template.md index c442f50fde..868fd084f1 100644 --- a/.github/ISSUE_TEMPLATE/epic-template.md +++ b/.github/ISSUE_TEMPLATE/epic-template.md @@ -4,6 +4,7 @@ about: A set of related tasks contributing towards specific outcome, comprising more than 1 week of work. title: 'Epic: ' labels: t/Epic +type: Epic assignees: '' --- From 738bf835836de94e8aa41b8575c6db78cb882c38 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Fri, 31 Jan 2025 09:53:43 +0000 Subject: [PATCH 325/583] chore: replace dashmap with clashmap (#10582) ## Problem Because dashmap 6 switched to hashbrown RawTable API, it required us to use unsafe code in the upgrade: https://github.com/neondatabase/neon/pull/8107 ## Summary of changes Switch to clashmap, a fork maintained by me which removes much of the unsafe and ultimately switches to HashTable instead of RawTable to remove much of the unsafe requirement on us. --- Cargo.lock | 53 +++++++++++++++++++++++++- Cargo.toml | 2 +- proxy/Cargo.toml | 2 +- proxy/src/auth/backend/jwt.rs | 6 +-- proxy/src/cache/endpoints.rs | 14 +++---- proxy/src/cache/project_info.rs | 12 +++--- proxy/src/cancellation.rs | 8 ++-- proxy/src/control_plane/client/mod.rs | 8 ++-- proxy/src/rate_limiter/leaky_bucket.rs | 8 ++-- proxy/src/rate_limiter/limiter.rs | 6 +-- proxy/src/redis/keys.rs | 3 +- proxy/src/redis/kv_ops.rs | 1 - proxy/src/serverless/conn_pool_lib.rs | 12 +++--- proxy/src/usage_metrics.rs | 10 ++--- 14 files changed, 97 insertions(+), 48 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 359f989a76..e9cbebcd02 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1213,6 +1213,20 @@ version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "afb84c814227b90d6895e01398aee0d8033c00e7466aca416fb6a8e0eb19d8a7" +[[package]] +name = "clashmap" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93bd59c81e2bd87a775ae2de75f070f7e2bfe97363a6ad652f46824564c23e4d" +dependencies = [ + "crossbeam-utils", + "hashbrown 0.15.2", + "lock_api", + "parking_lot_core 0.9.8", + "polonius-the-crab", + "replace_with", +] + [[package]] name = "colorchoice" version = "1.0.0" @@ -2531,6 +2545,12 @@ dependencies = [ "allocator-api2", ] +[[package]] +name = "hashbrown" +version = "0.15.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289" + [[package]] name = "hashlink" version = "0.9.1" @@ -2581,6 +2601,15 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6fe2267d4ed49bc07b63801559be28c718ea06c4738b7a03c94df7386d2cde46" +[[package]] +name = "higher-kinded-types" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "561985554c8b8d4808605c90a5f1979cc6c31a5d20b78465cd59501233c6678e" +dependencies = [ + "never-say-never", +] + [[package]] name = "hmac" version = "0.12.1" @@ -3544,6 +3573,12 @@ version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" +[[package]] +name = "never-say-never" +version = "6.6.666" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf5a574dadd7941adeaa71823ecba5e28331b8313fb2e1c6a5c7e5981ea53ad6" + [[package]] 
name = "nix" version = "0.25.1" @@ -4421,6 +4456,16 @@ dependencies = [ "plotters-backend", ] +[[package]] +name = "polonius-the-crab" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e97ca2c89572ae41bbec1c99498251f87dd5a94e500c5ec19c382dd593dd5ce9" +dependencies = [ + "higher-kinded-types", + "never-say-never", +] + [[package]] name = "postgres" version = "0.19.6" @@ -4794,9 +4839,9 @@ dependencies = [ "camino-tempfile", "chrono", "clap", + "clashmap", "compute_api", "consumption_metrics", - "dashmap 5.5.0", "ecdsa 0.16.9", "ed25519-dalek", "env_logger 0.10.2", @@ -5215,6 +5260,12 @@ dependencies = [ "utils", ] +[[package]] +name = "replace_with" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3a8614ee435691de62bcffcf4a66d91b3594bf1428a5722e79103249a095690" + [[package]] name = "reqwest" version = "0.12.4" diff --git a/Cargo.toml b/Cargo.toml index 9ccdb45f6d..9d15b78a93 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -77,10 +77,10 @@ camino = "1.1.6" cfg-if = "1.0.0" chrono = { version = "0.4", default-features = false, features = ["clock"] } clap = { version = "4.0", features = ["derive", "env"] } +clashmap = { version = "1.0", features = ["raw-api"] } comfy-table = "7.1" const_format = "0.2" crc32c = "0.6" -dashmap = { version = "5.5.0", features = ["raw-api"] } diatomic-waker = { version = "0.2.3" } either = "1.8" enum-map = "2.4.2" diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index f362a45035..35574e945c 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -24,9 +24,9 @@ bytes = { workspace = true, features = ["serde"] } camino.workspace = true chrono.workspace = true clap = { workspace = true, features = ["derive", "env"] } +clashmap.workspace = true compute_api.workspace = true consumption_metrics.workspace = true -dashmap.workspace = true env_logger.workspace = true framed-websockets.workspace = true futures.workspace = true diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index df716f8455..e05a693cee 100644 --- a/proxy/src/auth/backend/jwt.rs +++ b/proxy/src/auth/backend/jwt.rs @@ -4,7 +4,7 @@ use std::sync::Arc; use std::time::{Duration, SystemTime}; use arc_swap::ArcSwapOption; -use dashmap::DashMap; +use clashmap::ClashMap; use jose_jwk::crypto::KeyInfo; use reqwest::{redirect, Client}; use reqwest_retry::policies::ExponentialBackoff; @@ -64,7 +64,7 @@ pub(crate) struct AuthRule { pub struct JwkCache { client: reqwest_middleware::ClientWithMiddleware, - map: DashMap<(EndpointId, RoleName), Arc>, + map: ClashMap<(EndpointId, RoleName), Arc>, } pub(crate) struct JwkCacheEntry { @@ -469,7 +469,7 @@ impl Default for JwkCache { JwkCache { client, - map: DashMap::default(), + map: ClashMap::default(), } } } diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs index 0136446d6d..b5c42cd23d 100644 --- a/proxy/src/cache/endpoints.rs +++ b/proxy/src/cache/endpoints.rs @@ -3,7 +3,7 @@ use std::future::pending; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::{Arc, Mutex}; -use dashmap::DashSet; +use clashmap::ClashSet; use redis::streams::{StreamReadOptions, StreamReadReply}; use redis::{AsyncCommands, FromRedisValue, Value}; use serde::Deserialize; @@ -55,9 +55,9 @@ impl TryFrom<&Value> for ControlPlaneEvent { pub struct EndpointsCache { config: EndpointCacheConfig, - endpoints: DashSet, - branches: DashSet, - projects: DashSet, + endpoints: ClashSet, + branches: ClashSet, + projects: ClashSet, ready: AtomicBool, limiter: Arc>, 
} @@ -69,9 +69,9 @@ impl EndpointsCache { config.limiter_info.clone(), ))), config, - endpoints: DashSet::new(), - branches: DashSet::new(), - projects: DashSet::new(), + endpoints: ClashSet::new(), + branches: ClashSet::new(), + projects: ClashSet::new(), ready: AtomicBool::new(false), } } diff --git a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs index cab0b8b905..a5e71f1a87 100644 --- a/proxy/src/cache/project_info.rs +++ b/proxy/src/cache/project_info.rs @@ -5,7 +5,7 @@ use std::sync::Arc; use std::time::Duration; use async_trait::async_trait; -use dashmap::DashMap; +use clashmap::ClashMap; use rand::{thread_rng, Rng}; use smol_str::SmolStr; use tokio::sync::Mutex; @@ -108,9 +108,9 @@ impl EndpointInfo { /// One may ask, why the data is stored per project, when on the user request there is only data about the endpoint available? /// On the cplane side updates are done per project (or per branch), so it's easier to invalidate the whole project cache. pub struct ProjectInfoCacheImpl { - cache: DashMap, + cache: ClashMap, - project2ep: DashMap>, + project2ep: ClashMap>, config: ProjectInfoCacheOptions, start_time: Instant, @@ -176,8 +176,8 @@ impl ProjectInfoCache for ProjectInfoCacheImpl { impl ProjectInfoCacheImpl { pub(crate) fn new(config: ProjectInfoCacheOptions) -> Self { Self { - cache: DashMap::new(), - project2ep: DashMap::new(), + cache: ClashMap::new(), + project2ep: ClashMap::new(), config, ttl_disabled_since_us: AtomicU64::new(u64::MAX), start_time: Instant::now(), @@ -302,7 +302,7 @@ impl ProjectInfoCacheImpl { let mut removed = 0; let shard = self.project2ep.shards()[shard].write(); for (_, endpoints) in shard.iter() { - for endpoint in endpoints.get() { + for endpoint in endpoints { self.cache.remove(endpoint); removed += 1; } diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index 34f708a36b..9a0b954341 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -1,3 +1,4 @@ +use std::convert::Infallible; use std::net::{IpAddr, SocketAddr}; use std::sync::Arc; @@ -8,7 +9,7 @@ use pq_proto::CancelKeyData; use serde::{Deserialize, Serialize}; use thiserror::Error; use tokio::net::TcpStream; -use tokio::sync::mpsc; +use tokio::sync::{mpsc, oneshot}; use tracing::{debug, info}; use crate::auth::backend::{BackendIpAllowlist, ComputeUserInfo}; @@ -17,14 +18,11 @@ use crate::config::ComputeConfig; use crate::context::RequestContext; use crate::error::ReportableError; use crate::ext::LockExt; -use crate::metrics::CancelChannelSizeGuard; -use crate::metrics::{CancellationRequest, Metrics, RedisMsgKind}; +use crate::metrics::{CancelChannelSizeGuard, CancellationRequest, Metrics, RedisMsgKind}; use crate::rate_limiter::LeakyBucketRateLimiter; use crate::redis::keys::KeyPrefix; use crate::redis::kv_ops::RedisKVClient; use crate::tls::postgres_rustls::MakeRustlsConnect; -use std::convert::Infallible; -use tokio::sync::oneshot; type IpSubnetKey = IpNet; diff --git a/proxy/src/control_plane/client/mod.rs b/proxy/src/control_plane/client/mod.rs index d559d96bbc..b879f3a59f 100644 --- a/proxy/src/control_plane/client/mod.rs +++ b/proxy/src/control_plane/client/mod.rs @@ -6,7 +6,7 @@ use std::hash::Hash; use std::sync::Arc; use std::time::Duration; -use dashmap::DashMap; +use clashmap::ClashMap; use tokio::time::Instant; use tracing::{debug, info}; @@ -148,7 +148,7 @@ impl ApiCaches { /// Various caches for [`control_plane`](super). 
pub struct ApiLocks { name: &'static str, - node_locks: DashMap>, + node_locks: ClashMap>, config: RateLimiterConfig, timeout: Duration, epoch: std::time::Duration, @@ -180,7 +180,7 @@ impl ApiLocks { ) -> prometheus::Result { Ok(Self { name, - node_locks: DashMap::with_shard_amount(shards), + node_locks: ClashMap::with_shard_amount(shards), config, timeout, epoch, @@ -238,7 +238,7 @@ impl ApiLocks { let mut lock = shard.write(); let timer = self.metrics.reclamation_lag_seconds.start_timer(); let count = lock - .extract_if(|_, semaphore| Arc::strong_count(semaphore.get_mut()) == 1) + .extract_if(|(_, semaphore)| Arc::strong_count(semaphore) == 1) .count(); drop(lock); self.metrics.semaphores_unregistered.inc_by(count as u64); diff --git a/proxy/src/rate_limiter/leaky_bucket.rs b/proxy/src/rate_limiter/leaky_bucket.rs index bff800f0a2..9645eaf725 100644 --- a/proxy/src/rate_limiter/leaky_bucket.rs +++ b/proxy/src/rate_limiter/leaky_bucket.rs @@ -2,7 +2,7 @@ use std::hash::Hash; use std::sync::atomic::{AtomicUsize, Ordering}; use ahash::RandomState; -use dashmap::DashMap; +use clashmap::ClashMap; use rand::{thread_rng, Rng}; use tokio::time::Instant; use tracing::info; @@ -14,7 +14,7 @@ use crate::intern::EndpointIdInt; pub type EndpointRateLimiter = LeakyBucketRateLimiter; pub struct LeakyBucketRateLimiter { - map: DashMap, + map: ClashMap, config: utils::leaky_bucket::LeakyBucketConfig, access_count: AtomicUsize, } @@ -27,7 +27,7 @@ impl LeakyBucketRateLimiter { pub fn new_with_shards(config: LeakyBucketConfig, shards: usize) -> Self { Self { - map: DashMap::with_hasher_and_shard_amount(RandomState::new(), shards), + map: ClashMap::with_hasher_and_shard_amount(RandomState::new(), shards), config: config.into(), access_count: AtomicUsize::new(0), } @@ -58,7 +58,7 @@ impl LeakyBucketRateLimiter { let shard = thread_rng().gen_range(0..n); self.map.shards()[shard] .write() - .retain(|_, value| !value.get().bucket_is_empty(now)); + .retain(|(_, value)| !value.bucket_is_empty(now)); } } diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index ec080f270b..ef6c39f230 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -5,7 +5,7 @@ use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Mutex; use anyhow::bail; -use dashmap::DashMap; +use clashmap::ClashMap; use itertools::Itertools; use rand::rngs::StdRng; use rand::{Rng, SeedableRng}; @@ -62,7 +62,7 @@ impl GlobalRateLimiter { pub type WakeComputeRateLimiter = BucketRateLimiter; pub struct BucketRateLimiter { - map: DashMap, Hasher>, + map: ClashMap, Hasher>, info: Cow<'static, [RateBucketInfo]>, access_count: AtomicUsize, rand: Mutex, @@ -202,7 +202,7 @@ impl BucketRateLimiter { info!(buckets = ?info, "endpoint rate limiter"); Self { info, - map: DashMap::with_hasher_and_shard_amount(hasher, 64), + map: ClashMap::with_hasher_and_shard_amount(hasher, 64), access_count: AtomicUsize::new(1), // start from 1 to avoid GC on the first request rand: Mutex::new(rand), } diff --git a/proxy/src/redis/keys.rs b/proxy/src/redis/keys.rs index dddc7e2054..dcb9a59f87 100644 --- a/proxy/src/redis/keys.rs +++ b/proxy/src/redis/keys.rs @@ -1,7 +1,8 @@ +use std::io::ErrorKind; + use anyhow::Ok; use pq_proto::{id_to_cancel_key, CancelKeyData}; use serde::{Deserialize, Serialize}; -use std::io::ErrorKind; pub mod keyspace { pub const CANCEL_PREFIX: &str = "cancel"; diff --git a/proxy/src/redis/kv_ops.rs b/proxy/src/redis/kv_ops.rs index dcc6aac51b..3689bf7ae2 100644 --- 
a/proxy/src/redis/kv_ops.rs +++ b/proxy/src/redis/kv_ops.rs @@ -1,7 +1,6 @@ use redis::{AsyncCommands, ToRedisArgs}; use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider; - use crate::rate_limiter::{GlobalRateLimiter, RateBucketInfo}; pub struct RedisKVClient { diff --git a/proxy/src/serverless/conn_pool_lib.rs b/proxy/src/serverless/conn_pool_lib.rs index 44eac77e8f..a300198de4 100644 --- a/proxy/src/serverless/conn_pool_lib.rs +++ b/proxy/src/serverless/conn_pool_lib.rs @@ -5,7 +5,7 @@ use std::sync::atomic::{self, AtomicUsize}; use std::sync::{Arc, Weak}; use std::time::Duration; -use dashmap::DashMap; +use clashmap::ClashMap; use parking_lot::RwLock; use postgres_client::ReadyForQueryStatus; use rand::Rng; @@ -351,11 +351,11 @@ where // // That should be a fairly conteded map, so return reference to the per-endpoint // pool as early as possible and release the lock. - pub(crate) global_pool: DashMap>>, + pub(crate) global_pool: ClashMap>>, /// Number of endpoint-connection pools /// - /// [`DashMap::len`] iterates over all inner pools and acquires a read lock on each. + /// [`ClashMap::len`] iterates over all inner pools and acquires a read lock on each. /// That seems like far too much effort, so we're using a relaxed increment counter instead. /// It's only used for diagnostics. pub(crate) global_pool_size: AtomicUsize, @@ -396,7 +396,7 @@ where pub(crate) fn new(config: &'static crate::config::HttpConfig) -> Arc { let shards = config.pool_options.pool_shards; Arc::new(Self { - global_pool: DashMap::with_shard_amount(shards), + global_pool: ClashMap::with_shard_amount(shards), global_pool_size: AtomicUsize::new(0), config, global_connections_count: Arc::new(AtomicUsize::new(0)), @@ -442,10 +442,10 @@ where .start_timer(); let current_len = shard.len(); let mut clients_removed = 0; - shard.retain(|endpoint, x| { + shard.retain(|(endpoint, x)| { // if the current endpoint pool is unique (no other strong or weak references) // then it is currently not in use by any connections. 
- if let Some(pool) = Arc::get_mut(x.get_mut()) { + if let Some(pool) = Arc::get_mut(x) { let endpoints = pool.get_mut(); clients_removed = endpoints.clear_closed(); diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs index e1cc7e87b4..d369e3742f 100644 --- a/proxy/src/usage_metrics.rs +++ b/proxy/src/usage_metrics.rs @@ -10,9 +10,9 @@ use anyhow::{bail, Context}; use async_compression::tokio::write::GzipEncoder; use bytes::Bytes; use chrono::{DateTime, Datelike, Timelike, Utc}; +use clashmap::mapref::entry::Entry; +use clashmap::ClashMap; use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE}; -use dashmap::mapref::entry::Entry; -use dashmap::DashMap; use once_cell::sync::Lazy; use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel}; use serde::{Deserialize, Serialize}; @@ -137,7 +137,7 @@ type FastHasher = std::hash::BuildHasherDefault; #[derive(Default)] pub(crate) struct Metrics { - endpoints: DashMap, FastHasher>, + endpoints: ClashMap, FastHasher>, } impl Metrics { @@ -213,7 +213,7 @@ pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result( - endpoints: &DashMap, FastHasher>, + endpoints: &ClashMap, FastHasher>, ) -> Vec<(Ids, u64)> { let mut metrics_to_clear = Vec::new(); @@ -271,7 +271,7 @@ fn create_event_chunks<'a>( #[expect(clippy::too_many_arguments)] #[instrument(skip_all)] async fn collect_metrics_iteration( - endpoints: &DashMap, FastHasher>, + endpoints: &ClashMap, FastHasher>, client: &http::ClientWithMiddleware, metric_collection_endpoint: &reqwest::Url, storage: Option<&GenericRemoteStorage>, From 6041a935918f34f06ec7f0fb93cc6fc97c1dc1dd Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Fri, 31 Jan 2025 10:54:31 +0100 Subject: [PATCH 326/583] Update tokio base crates (#10556) Update `tokio` base crates and their deps. Pin `tokio` to at least 1.41 which stabilized task ID APIs. To dedup `mio` dep the `notify` crate is updated. It's used in `compute_tools`. 
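For reference, a minimal sketch of the task ID accessors that are stable under this pin (not part of this change; assumes `tokio = { version = "1.41", features = ["full"] }`):

```rust
use tokio::task;

#[tokio::main]
async fn main() {
    let handle = task::spawn(async {
        // Inside a running task, task::id() reports the current task's ID.
        println!("inside task {}", task::id());
    });
    // The spawner can read the same ID from the JoinHandle.
    println!("spawned task {}", handle.id());
    handle.await.unwrap();
}
```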
https://github.com/neondatabase/neon/blob/9f81828429ad6475b4fbb1a814240213b74bec63/compute_tools/src/pg_helpers.rs#L258-L367 --- Cargo.lock | 84 +++++++++++++++++++++++++++++++----------------------- Cargo.toml | 4 +-- 2 files changed, 51 insertions(+), 37 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e9cbebcd02..cada9604ff 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -966,7 +966,7 @@ version = "0.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f49d8fed880d473ea71efb9bf597651e77201bdd4893efe54c9e5d65ae04ce6f" dependencies = [ - "bitflags 2.4.1", + "bitflags 2.8.0", "cexpr", "clang-sys", "itertools 0.12.1", @@ -994,9 +994,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.4.1" +version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07" +checksum = "8f68f53c83ab957f72c32642f3868eec03eb974d1fb82e453128456482613d36" [[package]] name = "block-buffer" @@ -1563,7 +1563,7 @@ version = "0.27.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f476fe445d41c9e991fd07515a6f463074b782242ccf4a5b7b1d1012e70824df" dependencies = [ - "bitflags 2.4.1", + "bitflags 2.8.0", "crossterm_winapi", "libc", "parking_lot 0.12.1", @@ -1794,7 +1794,7 @@ version = "2.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ccf1bedf64cdb9643204a36dd15b19a6ce8e7aa7f7b105868e9f1fad5ffa7d12" dependencies = [ - "bitflags 2.4.1", + "bitflags 2.8.0", "byteorder", "chrono", "diesel_derives", @@ -3088,11 +3088,11 @@ dependencies = [ [[package]] name = "inotify" -version = "0.9.6" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8069d3ec154eb856955c1c0fbffefbf5f3c40a104ec912d4797314c1801abff" +checksum = "f37dccff2791ab604f9babef0ba14fbe0be30bd368dc541e2b08d07c8aa908f3" dependencies = [ - "bitflags 1.3.2", + "bitflags 2.8.0", "inotify-sys", "libc", ] @@ -3269,9 +3269,9 @@ dependencies = [ [[package]] name = "kqueue" -version = "1.0.7" +version = "1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c8fc60ba15bf51257aa9807a48a61013db043fcf3a78cb0d916e8e396dcad98" +checksum = "7447f1ca1b7b563588a205fe93dea8df60fd981423a768bc1c0ded35ed147d0c" dependencies = [ "kqueue-sys", "libc", @@ -3279,9 +3279,9 @@ dependencies = [ [[package]] name = "kqueue-sys" -version = "1.0.3" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8367585489f01bc55dd27404dcf56b95e6da061a256a666ab23be9ba96a2e587" +checksum = "ed9625ffda8729b85e45cf04090035ac368927b8cebc34898e7c120f52e4838b" dependencies = [ "bitflags 1.3.2", "libc", @@ -3308,9 +3308,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.167" +version = "0.2.169" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09d6582e104315a817dff97f75133544b2e094ee22447d2acf4a74e189ba06fc" +checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" [[package]] name = "libloading" @@ -3557,14 +3557,14 @@ dependencies = [ [[package]] name = "mio" -version = "0.8.11" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c" +checksum = "2886843bf800fba2e3377cff24abf6379b4c4d5c6681eaf9ea5b0d15090450bd" dependencies = [ "libc", "log", "wasi 
0.11.0+wasi-snapshot-preview1", - "windows-sys 0.48.0", + "windows-sys 0.52.0", ] [[package]] @@ -3610,7 +3610,7 @@ version = "0.27.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2eb04e9c688eff1c89d72b407f168cf79bb9e867a9d3323ed6c01519eb9cc053" dependencies = [ - "bitflags 2.4.1", + "bitflags 2.8.0", "cfg-if", "libc", "memoffset 0.9.0", @@ -3628,12 +3628,11 @@ dependencies = [ [[package]] name = "notify" -version = "6.1.1" +version = "8.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6205bd8bb1e454ad2e27422015fb5e4f2bcc7e08fa8f27058670d208324a4d2d" +checksum = "2fee8403b3d66ac7b26aee6e40a897d85dc5ce26f44da36b8b73e987cc52e943" dependencies = [ - "bitflags 2.4.1", - "crossbeam-channel", + "bitflags 2.8.0", "filetime", "fsevent-sys", "inotify", @@ -3641,10 +3640,17 @@ dependencies = [ "libc", "log", "mio", + "notify-types", "walkdir", - "windows-sys 0.48.0", + "windows-sys 0.59.0", ] +[[package]] +name = "notify-types" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e0826a989adedc2a244799e823aece04662b66609d96af8dff7ac6df9a8925d" + [[package]] name = "ntapi" version = "0.4.1" @@ -4705,7 +4711,7 @@ version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "731e0d9356b0c25f16f33b5be79b1c57b562f141ebfcdb0ad8ac2c13a24293b4" dependencies = [ - "bitflags 2.4.1", + "bitflags 2.8.0", "chrono", "flate2", "hex", @@ -4720,7 +4726,7 @@ version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2d3554923a69f4ce04c4a754260c338f505ce22642d3830e049a399fc2059a29" dependencies = [ - "bitflags 2.4.1", + "bitflags 2.8.0", "chrono", "hex", ] @@ -5545,7 +5551,7 @@ version = "0.38.41" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d7f649912bc1495e167a6edee79151c84b1bad49748cb4f1f1167f459f6224f6" dependencies = [ - "bitflags 2.4.1", + "bitflags 2.8.0", "errno", "libc", "linux-raw-sys 0.4.14", @@ -6819,21 +6825,20 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.38.1" +version = "1.43.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eb2caba9f80616f438e09748d5acda951967e1ea58508ef53d9c6402485a46df" +checksum = "3d61fa4ffa3de412bfea335c6ecff681de2b609ba3c77ef3e00e521813a9ed9e" dependencies = [ "backtrace", "bytes", "libc", "mio", - "num_cpus", "parking_lot 0.12.1", "pin-project-lite", "signal-hook-registry", "socket2", "tokio-macros", - "windows-sys 0.48.0", + "windows-sys 0.52.0", ] [[package]] @@ -6864,9 +6869,9 @@ dependencies = [ [[package]] name = "tokio-macros" -version = "2.3.0" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f5ae998a069d4b5aba8ee9dad856af7d520c3699e6159b185c2acd48155d39a" +checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8" dependencies = [ "proc-macro2", "quote", @@ -7151,7 +7156,7 @@ version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "403fa3b783d4b626a8ad51d766ab03cb6d2dbfc46b1c5d4448395e6628dc9697" dependencies = [ - "bitflags 2.4.1", + "bitflags 2.8.0", "bytes", "http 1.1.0", "http-body 1.0.0", @@ -7654,9 +7659,9 @@ dependencies = [ [[package]] name = "walkdir" -version = "2.3.3" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "36df944cda56c7d8d8b7496af378e6b16de9284591917d307c9b4d313c44e698" +checksum = 
"29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" dependencies = [ "same-file", "winapi-util", @@ -7908,6 +7913,15 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets 0.52.6", +] + [[package]] name = "windows-targets" version = "0.48.0" diff --git a/Cargo.toml b/Cargo.toml index 9d15b78a93..267a91d773 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -123,7 +123,7 @@ measured = { version = "0.0.22", features=["lasso"] } measured-process = { version = "0.0.22" } memoffset = "0.9" nix = { version = "0.27", features = ["dir", "fs", "process", "socket", "signal", "poll"] } -notify = "6.0.0" +notify = "8.0.0" num_cpus = "1.15" num-traits = "0.2.15" once_cell = "1.13" @@ -177,7 +177,7 @@ test-context = "0.3" thiserror = "1.0" tikv-jemallocator = { version = "0.6", features = ["profiling", "stats", "unprefixed_malloc_on_supported_platforms"] } tikv-jemalloc-ctl = { version = "0.6", features = ["stats"] } -tokio = { version = "1.17", features = ["macros"] } +tokio = { version = "1.41", features = ["macros"] } tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" } tokio-io-timeout = "1.2.0" tokio-postgres-rustls = "0.12.0" From 765ba43438f91bf75773e9de358dbf58f34c5d87 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 31 Jan 2025 13:33:24 +0300 Subject: [PATCH 327/583] Allow pageserver unreachable errors in test_scrubber_tenant_snapshot (#10585) ## Problem test_scrubber_tenant_snapshot restarts pageservers, but log validation fails tests on any non white listed storcon warnings, making the test flaky. ## Summary of changes Allow warns like 2025-01-29T12:37:42.622179Z WARN reconciler{seq=1 tenant_id=2011077aea9b4e8a60e8e8a19407634c shard_id=0004}: Call to node 2 (localhost:15352) management API failed, will retry (attempt 1): receive body: error sending request for url (http://localhost:15352/v1/tenant/2011077aea9b4e8a60e8e8a19407634c-0004/location_config): client error (Connect) ref https://github.com/neondatabase/neon/issues/10462 --- test_runner/regress/test_storage_scrubber.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index 1304d302b7..7e92cc01cd 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -32,6 +32,12 @@ def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count: neon_env_builder.num_pageservers = shard_count if shard_count is not None else 1 env = neon_env_builder.init_start() + # We restart pageserver(s), which will cause storage storage controller + # requests to fail and warn. 
+ env.storage_controller.allowed_errors.append(".*management API still failed.*") + env.storage_controller.allowed_errors.append( + ".*Reconcile error.*error sending request for url.*" + ) tenant_id = env.initial_tenant timeline_id = env.initial_timeline branch = "main" From f09cfd11cb3029f9395c0ad0bccd61d2a848a6db Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 31 Jan 2025 10:54:14 +0000 Subject: [PATCH 328/583] pageserver: exclude archived timelines from freeze+flush on shutdown (#10594) ## Problem If offloading races with normal shutdown, we get a "failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited". This is harmless but points to it being quite strange to try and freeze and flush such a timeline. flushing on shutdown for an archived timeline isn't useful. Related: https://github.com/neondatabase/neon/issues/10389 ## Summary of changes - During Timeline::shutdown, ignore ShutdownMode::FreezeAndFlush if the timeline is archived --- pageserver/src/tenant/timeline.rs | 71 ++++++++++++++++++------------- 1 file changed, 41 insertions(+), 30 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 827601fa8b..d6a8eaa4d9 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1818,7 +1818,7 @@ impl Timeline { self.last_record_lsn.shutdown(); if let ShutdownMode::FreezeAndFlush = mode { - if let Some((open, frozen)) = self + let do_flush = if let Some((open, frozen)) = self .layers .read() .await @@ -1827,43 +1827,54 @@ impl Timeline { .ok() .filter(|(open, frozen)| *open || *frozen > 0) { - tracing::info!(?open, frozen, "flushing and freezing on shutdown"); + if self.remote_client.is_archived() == Some(true) { + // No point flushing on shutdown for an archived timeline: it is not important + // to have it nice and fresh after our restart, and trying to flush here might + // race with trying to offload it (which also stops the flush loop) + false + } else { + tracing::info!(?open, frozen, "flushing and freezing on shutdown"); + true + } } else { - // this is double-shutdown, ignore it - } + // this is double-shutdown, it'll be a no-op + true + }; // we shut down walreceiver above, so, we won't add anything more // to the InMemoryLayer; freeze it and wait for all frozen layers // to reach the disk & upload queue, then shut the upload queue and // wait for it to drain. - match self.freeze_and_flush().await { - Ok(_) => { - // drain the upload queue - // if we did not wait for completion here, it might be our shutdown process - // didn't wait for remote uploads to complete at all, as new tasks can forever - // be spawned. - // - // what is problematic is the shutting down of RemoteTimelineClient, because - // obviously it does not make sense to stop while we wait for it, but what - // about corner cases like s3 suddenly hanging up? - self.remote_client.shutdown().await; + if do_flush { + match self.freeze_and_flush().await { + Ok(_) => { + // drain the upload queue + // if we did not wait for completion here, it might be our shutdown process + // didn't wait for remote uploads to complete at all, as new tasks can forever + // be spawned. + // + // what is problematic is the shutting down of RemoteTimelineClient, because + // obviously it does not make sense to stop while we wait for it, but what + // about corner cases like s3 suddenly hanging up? 
+ self.remote_client.shutdown().await; + } + Err(FlushLayerError::Cancelled) => { + // this is likely the second shutdown, ignore silently. + // TODO: this can be removed once https://github.com/neondatabase/neon/issues/5080 + debug_assert!(self.cancel.is_cancelled()); + } + Err(e) => { + // Non-fatal. Shutdown is infallible. Failures to flush just mean that + // we have some extra WAL replay to do next time the timeline starts. + warn!("failed to freeze and flush: {e:#}"); + } } - Err(FlushLayerError::Cancelled) => { - // this is likely the second shutdown, ignore silently. - // TODO: this can be removed once https://github.com/neondatabase/neon/issues/5080 - debug_assert!(self.cancel.is_cancelled()); - } - Err(e) => { - // Non-fatal. Shutdown is infallible. Failures to flush just mean that - // we have some extra WAL replay to do next time the timeline starts. - warn!("failed to freeze and flush: {e:#}"); - } - } - // `self.remote_client.shutdown().await` above should have already flushed everything from the queue, but - // we also do a final check here to ensure that the queue is empty. - if !self.remote_client.no_pending_work() { - warn!("still have pending work in remote upload queue, but continuing shutting down anyways"); + // `self.remote_client.shutdown().await` above should have already flushed everything from the queue, but + // we also do a final check here to ensure that the queue is empty. + if !self.remote_client.no_pending_work() { + warn!("still have pending work in remote upload queue, but continuing shutting down anyways"); + } } } From 7d5c70c717c5ad543fdbd6115e422e27e0e86da9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 31 Jan 2025 12:23:12 +0100 Subject: [PATCH 329/583] Update AWS SDK crates (#10588) We want to keep the AWS SDK up to date as that way we benefit from new developments and improvements. 
Prior update was in #10056 --- Cargo.lock | 75 ++++++++++++++++++++++++------------------------------ 1 file changed, 33 insertions(+), 42 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index cada9604ff..6b63c3c388 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -290,9 +290,9 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "aws-config" -version = "1.5.10" +version = "1.5.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b49afaa341e8dd8577e1a2200468f98956d6eda50bcf4a53246cc00174ba924" +checksum = "dc47e70fc35d054c8fcd296d47a61711f043ac80534a10b4f741904f81e73a90" dependencies = [ "aws-credential-types", "aws-runtime", @@ -301,7 +301,7 @@ dependencies = [ "aws-sdk-sts", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json 0.60.7", + "aws-smithy-json", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -332,9 +332,9 @@ dependencies = [ [[package]] name = "aws-runtime" -version = "1.4.4" +version = "1.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5ac934720fbb46206292d2c75b57e67acfc56fe7dfd34fb9a02334af08409ea" +checksum = "bee7643696e7fdd74c10f9eb42848a87fe469d35eae9c3323f80aa98f350baac" dependencies = [ "aws-credential-types", "aws-sigv4", @@ -366,7 +366,7 @@ dependencies = [ "aws-runtime", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json 0.61.1", + "aws-smithy-json", "aws-smithy-query", "aws-smithy-runtime", "aws-smithy-runtime-api", @@ -389,7 +389,7 @@ dependencies = [ "aws-runtime", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json 0.61.1", + "aws-smithy-json", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -414,7 +414,7 @@ dependencies = [ "aws-smithy-checksums", "aws-smithy-eventstream", "aws-smithy-http", - "aws-smithy-json 0.61.1", + "aws-smithy-json", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -437,15 +437,15 @@ dependencies = [ [[package]] name = "aws-sdk-sso" -version = "1.50.0" +version = "1.57.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05ca43a4ef210894f93096039ef1d6fa4ad3edfabb3be92b80908b9f2e4b4eab" +checksum = "c54bab121fe1881a74c338c5f723d1592bf3b53167f80268a1274f404e1acc38" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json 0.61.1", + "aws-smithy-json", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -459,15 +459,15 @@ dependencies = [ [[package]] name = "aws-sdk-ssooidc" -version = "1.51.0" +version = "1.58.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "abaf490c2e48eed0bb8e2da2fb08405647bd7f253996e0f93b981958ea0f73b0" +checksum = "8c8234fd024f7ac61c4e44ea008029bde934250f371efe7d4a39708397b1080c" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json 0.61.1", + "aws-smithy-json", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -481,15 +481,15 @@ dependencies = [ [[package]] name = "aws-sdk-sts" -version = "1.51.0" +version = "1.58.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b68fde0d69c8bfdc1060ea7da21df3e39f6014da316783336deff0a9ec28f4bf" +checksum = "ba60e1d519d6f23a9df712c04fdeadd7872ac911c84b2f62a8bda92e129b7962" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json 0.61.1", + "aws-smithy-json", "aws-smithy-query", 
"aws-smithy-runtime", "aws-smithy-runtime-api", @@ -504,9 +504,9 @@ dependencies = [ [[package]] name = "aws-sigv4" -version = "1.2.6" +version = "1.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d3820e0c08d0737872ff3c7c1f21ebbb6693d832312d6152bf18ef50a5471c2" +checksum = "690118821e46967b3c4501d67d7d52dd75106a9c54cf36cefa1985cedbe94e05" dependencies = [ "aws-credential-types", "aws-smithy-eventstream", @@ -533,9 +533,9 @@ dependencies = [ [[package]] name = "aws-smithy-async" -version = "1.2.1" +version = "1.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62220bc6e97f946ddd51b5f1361f78996e704677afc518a4ff66b7a72ea1378c" +checksum = "fa59d1327d8b5053c54bf2eaae63bf629ba9e904434d0835a28ed3c0ed0a614e" dependencies = [ "futures-util", "pin-project-lite", @@ -565,9 +565,9 @@ dependencies = [ [[package]] name = "aws-smithy-eventstream" -version = "0.60.5" +version = "0.60.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cef7d0a272725f87e51ba2bf89f8c21e4df61b9e49ae1ac367a6d69916ef7c90" +checksum = "8b18559a41e0c909b77625adf2b8c50de480a8041e5e4a3f5f7d177db70abc5a" dependencies = [ "aws-smithy-types", "bytes", @@ -576,9 +576,9 @@ dependencies = [ [[package]] name = "aws-smithy-http" -version = "0.60.11" +version = "0.60.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c8bc3e8fdc6b8d07d976e301c02fe553f72a39b7a9fea820e023268467d7ab6" +checksum = "7809c27ad8da6a6a68c454e651d4962479e81472aa19ae99e59f9aba1f9713cc" dependencies = [ "aws-smithy-eventstream", "aws-smithy-runtime-api", @@ -597,18 +597,9 @@ dependencies = [ [[package]] name = "aws-smithy-json" -version = "0.60.7" +version = "0.61.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4683df9469ef09468dad3473d129960119a0d3593617542b7d52086c8486f2d6" -dependencies = [ - "aws-smithy-types", -] - -[[package]] -name = "aws-smithy-json" -version = "0.61.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee4e69cc50921eb913c6b662f8d909131bb3e6ad6cb6090d3a39b66fc5c52095" +checksum = "623a51127f24c30776c8b374295f2df78d92517386f77ba30773f15a30ce1422" dependencies = [ "aws-smithy-types", ] @@ -625,9 +616,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime" -version = "1.7.4" +version = "1.7.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f20685047ca9d6f17b994a07f629c813f08b5bce65523e47124879e60103d45" +checksum = "865f7050bbc7107a6c98a397a9fcd9413690c27fa718446967cf03b2d3ac517e" dependencies = [ "aws-smithy-async", "aws-smithy-http", @@ -669,9 +660,9 @@ dependencies = [ [[package]] name = "aws-smithy-types" -version = "1.2.9" +version = "1.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fbd94a32b3a7d55d3806fe27d98d3ad393050439dd05eb53ece36ec5e3d3510" +checksum = "a28f6feb647fb5e0d5b50f0472c19a7db9462b74e2fec01bb0b44eedcc834e97" dependencies = [ "base64-simd", "bytes", @@ -704,9 +695,9 @@ dependencies = [ [[package]] name = "aws-types" -version = "1.3.3" +version = "1.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5221b91b3e441e6675310829fd8984801b772cb1546ef6c0e54dec9f1ac13fef" +checksum = "b0df5a18c4f951c645300d365fec53a61418bcf4650f604f85fe2a665bfaa0c2" dependencies = [ "aws-credential-types", "aws-smithy-async", From afbcebe7f761dd555a3433aa34802b601367a82f Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Fri, 31 Jan 2025 12:31:58 +0100 
Subject: [PATCH 330/583] test_runner: force-compact in `test_sharding_autosplit` (#10605) ## Problem This test may not fully detect data corruption during splits, since we don't force-compact the entire keyspace. ## Summary of changes Force-compact all data in `test_sharding_autosplit`. --- test_runner/performance/test_sharding_autosplit.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/test_runner/performance/test_sharding_autosplit.py b/test_runner/performance/test_sharding_autosplit.py index 76c3ad01a4..e5a9f17da8 100644 --- a/test_runner/performance/test_sharding_autosplit.py +++ b/test_runner/performance/test_sharding_autosplit.py @@ -247,7 +247,7 @@ def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): log.info(f"{shard_zero_id} timeline: {timeline_info}") # Run compaction for all tenants, restart endpoint so that on subsequent reads we will - # definitely hit pageserver for reads. This compaction passis expected to drop unwanted + # definitely hit pageserver for reads. This compaction pass is expected to drop unwanted # layers but not do any rewrites (we're still in the same generation) for tenant_id, tenant_state in tenants.items(): tenant_state.endpoint.stop() @@ -296,6 +296,16 @@ def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): for fut in pgbench_futs: fut.result() + # Run a full forced compaction, to detect any data corruption. + for tenant_id, tenant_state in tenants.items(): + for shard_id, shard_ps in tenant_get_shards(env, tenant_id): + shard_ps.http_client().timeline_compact( + shard_id, + tenant_state.timeline_id, + force_image_layer_creation=True, + force_l0_compaction=True, + ) + # Assert that some rewrites happened # TODO: uncomment this after https://github.com/neondatabase/neon/pull/7531 is merged # assert any(ps.log_contains(".*Rewriting layer after shard split.*") for ps in env.pageservers) From 89cff08354f7c2f2bbb0a92df2eca6de828fa4fe Mon Sep 17 00:00:00 2001 From: Fedor Dikarev Date: Fri, 31 Jan 2025 12:46:33 +0100 Subject: [PATCH 331/583] unify pg-build-nonroot-with-cargo base layer and config retries in curl (#10575) Ref: https://github.com/neondatabase/cloud/issues/23461 ## Problem Just made changes around and see these 2 base layers could be optimised. 
and after review comment from @myrrc setting up timeouts and retries in `alpine/curl` image ## Summary of changes --- compute/compute-node.Dockerfile | 43 +++++++++++++++------------------ 1 file changed, 19 insertions(+), 24 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 1ef449f0b0..32226c56a5 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -825,11 +825,11 @@ RUN case "${PG_VERSION}" in "v17") \ ######################################################################################### # -# Layer "rust extensions" -# This layer is used to build `pgrx` deps +# Layer "pg build with nonroot user and cargo installed" +# This layer is base and common for layers with `pgrx` # ######################################################################################### -FROM pg-build AS rust-extensions-build +FROM pg-build AS pg-build-nonroot-with-cargo ARG PG_VERSION RUN apt update && \ @@ -847,8 +847,18 @@ RUN echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 30 RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \ chmod +x rustup-init && \ ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \ - rm rustup-init && \ - case "${PG_VERSION}" in \ + rm rustup-init + +######################################################################################### +# +# Layer "rust extensions" +# This layer is used to build `pgrx` deps +# +######################################################################################### +FROM pg-build-nonroot-with-cargo AS rust-extensions-build +ARG PG_VERSION + +RUN case "${PG_VERSION}" in \ 'v17') \ echo 'v17 is not supported yet by pgrx. Quit' && exit 0;; \ esac && \ @@ -867,26 +877,10 @@ USER root # and eventually get merged with `rust-extensions-build` # ######################################################################################### -FROM pg-build AS rust-extensions-build-pgrx12 +FROM pg-build-nonroot-with-cargo AS rust-extensions-build-pgrx12 ARG PG_VERSION -RUN apt update && \ - apt install --no-install-recommends --no-install-suggests -y curl libclang-dev && \ - apt clean && rm -rf /var/lib/apt/lists/* && \ - useradd -ms /bin/bash nonroot -b /home - -ENV HOME=/home/nonroot -ENV PATH="/home/nonroot/.cargo/bin:$PATH" -USER nonroot -WORKDIR /home/nonroot - -RUN echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /home/nonroot/.curlrc - -RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \ - chmod +x rustup-init && \ - ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \ - rm rustup-init && \ - cargo install --locked --version 0.12.9 cargo-pgrx && \ +RUN cargo install --locked --version 0.12.9 cargo-pgrx && \ /bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config' USER root @@ -1283,7 +1277,8 @@ FROM alpine/curl:${ALPINE_CURL_VERSION} AS exporters ARG TARGETARCH # Keep sql_exporter version same as in build-tools.Dockerfile and # test_runner/regress/test_compute_metrics.py -RUN if [ "$TARGETARCH" = "amd64" ]; then\ +RUN echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc; \ + if [ "$TARGETARCH" = "amd64" ]; then\ postgres_exporter_sha256='027e75dda7af621237ff8f5ac66b78a40b0093595f06768612b92b1374bd3105';\ 
pgbouncer_exporter_sha256='c9f7cf8dcff44f0472057e9bf52613d93f3ffbc381ad7547a959daa63c5e84ac';\ sql_exporter_sha256='38e439732bbf6e28ca4a94d7bc3686d3fa1abdb0050773d5617a9efdb9e64d08';\ From 503bc72d31ce15620479346dfa1081771a7f0a95 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 31 Jan 2025 11:48:46 +0000 Subject: [PATCH 332/583] CI: add `diesel print-schema` check (#10527) ## Problem We want to check that `diesel print-schema` doesn't generate any changes (`storage_controller/src/schema.rs`) in comparison with the list of migration. ## Summary of changes - Add `diesel_cli` to `build-tools` image - Add `Check diesel schema` step to `build-neon` job, at this stage we have all required binaries, so don't need to compile anything additionally - Check runs only on x86 release builds to be sure we do it at least once per CI run. --- .github/workflows/_build-and-test-locally.yml | 20 +++++++++++++++++++ .github/workflows/_check-codestyle-rust.yml | 3 +++ build-tools.Dockerfile | 3 +++ 3 files changed, 26 insertions(+) diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 2daed90386..e9483492c9 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -271,6 +271,26 @@ jobs: path: /tmp/neon aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + - name: Check diesel schema + if: inputs.build-type == 'release' && inputs.arch == 'x64' + env: + DATABASE_URL: postgresql://localhost:1235/storage_controller + POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install + run: | + /tmp/neon/bin/neon_local init + /tmp/neon/bin/neon_local storage_controller start + + diesel print-schema > storage_controller/src/schema.rs + + if [ -n "$(git diff storage_controller/src/schema.rs)" ]; then + echo >&2 "Uncommitted changes in diesel schema" + + git diff . + exit 1 + fi + + /tmp/neon/bin/neon_local storage_controller stop + # XXX: keep this after the binaries.list is formed, so the coverage can properly work later - name: Merge and upload coverage data if: inputs.build-type == 'debug' diff --git a/.github/workflows/_check-codestyle-rust.yml b/.github/workflows/_check-codestyle-rust.yml index cbc47c6406..f7518d6500 100644 --- a/.github/workflows/_check-codestyle-rust.yml +++ b/.github/workflows/_check-codestyle-rust.yml @@ -16,6 +16,9 @@ defaults: run: shell: bash -euxo pipefail {0} +# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. 
+permissions: {} + jobs: check-codestyle-rust: strategy: diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile index 9c13e480c1..dfcc7d06b4 100644 --- a/build-tools.Dockerfile +++ b/build-tools.Dockerfile @@ -261,6 +261,7 @@ ARG CARGO_HAKARI_VERSION=0.9.33 ARG CARGO_DENY_VERSION=0.16.2 ARG CARGO_HACK_VERSION=0.6.33 ARG CARGO_NEXTEST_VERSION=0.9.85 +ARG CARGO_DIESEL_CLI_VERSION=2.2.6 RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \ chmod +x rustup-init && \ ./rustup-init -y --default-toolchain ${RUSTC_VERSION} && \ @@ -274,6 +275,8 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux cargo install cargo-deny --locked --version ${CARGO_DENY_VERSION} && \ cargo install cargo-hack --version ${CARGO_HACK_VERSION} && \ cargo install cargo-nextest --version ${CARGO_NEXTEST_VERSION} && \ + cargo install diesel_cli --version ${CARGO_DIESEL_CLI_VERSION} \ + --features postgres-bundled --no-default-features && \ rm -rf /home/nonroot/.cargo/registry && \ rm -rf /home/nonroot/.cargo/git From dce617fe070e8528aba5a5628b138e2fe3eb4c85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 31 Jan 2025 13:40:20 +0100 Subject: [PATCH 333/583] Update to rebased rust-postgres (#10584) Update to a rebased version of our rust-postgres patches, rebased on [this](https://github.com/sfackler/rust-postgres/commit/98f5a11bc0a8e451552d8941ffa078c7eb6cd60c) commit this time. With #10280 reapplied, this means that the rust-postgres crates will be deduplicated, as the new crate versions are finally compatible with the requirements of diesel-async. Earlier update: #10561 rust-postgres PR: https://github.com/neondatabase/rust-postgres/pull/39 --- Cargo.lock | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6b63c3c388..cdc620e485 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4465,8 +4465,8 @@ dependencies = [ [[package]] name = "postgres" -version = "0.19.6" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#8b44892f7851e705810b2cb54504325699966070" +version = "0.19.7" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#1f21e7959a96a34dcfbfce1b14b73286cdadffe9" dependencies = [ "bytes", "fallible-iterator", @@ -4479,9 +4479,9 @@ dependencies = [ [[package]] name = "postgres-protocol" version = "0.6.6" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#8b44892f7851e705810b2cb54504325699966070" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#1f21e7959a96a34dcfbfce1b14b73286cdadffe9" dependencies = [ - "base64 0.21.1", + "base64 0.22.1", "byteorder", "bytes", "fallible-iterator", @@ -4513,7 +4513,7 @@ dependencies = [ [[package]] name = "postgres-types" version = "0.2.6" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#8b44892f7851e705810b2cb54504325699966070" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#1f21e7959a96a34dcfbfce1b14b73286cdadffe9" dependencies = [ "bytes", "chrono", @@ -6871,8 +6871,8 @@ dependencies = [ [[package]] name = "tokio-postgres" -version = "0.7.9" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#8b44892f7851e705810b2cb54504325699966070" +version = "0.7.10" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#1f21e7959a96a34dcfbfce1b14b73286cdadffe9" dependencies = [ "async-trait", "byteorder", From 
10cf5e7a38d45037f3f51b666c519b7e5c6c72a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?JC=20Gr=C3=BCnhage?= Date: Fri, 31 Jan 2025 14:42:59 +0100 Subject: [PATCH 334/583] Move cargo-deny into a separate workflow on a schedule (#10289) ## Problem There are two (related) problems with the previous handling of `cargo-deny`: - When a new advisory is added to rustsec that affects a dependency, unrelated pull requests will fail. - New advisories rely on pushes or PRs to be surfaced. Problems that already exist on main will only be found if we try to merge new things into main. ## Summary of changes We split out `cargo-deny` into a separate workflow that runs on all PRs that touch `Cargo.lock`, and on a schedule on `main`, `release`, `release-compute` and `release-proxy` to find new advisories. --- .github/actionlint.yml | 1 + .github/file-filters.yaml | 1 + .github/workflows/_check-codestyle-rust.yml | 5 -- .github/workflows/build_and_test.yml | 39 +++++++++++++- .github/workflows/cargo-deny.yml | 57 +++++++++++++++++++++ .github/workflows/pre-merge-checks.yml | 3 +- 6 files changed, 99 insertions(+), 7 deletions(-) create mode 100644 .github/workflows/cargo-deny.yml diff --git a/.github/actionlint.yml b/.github/actionlint.yml index ecff0cc70b..2b96ce95da 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -27,3 +27,4 @@ config-variables: - SLACK_ON_CALL_QA_STAGING_STREAM - DEV_AWS_OIDC_ROLE_MANAGE_BENCHMARK_EC2_VMS_ARN - SLACK_ON_CALL_STORAGE_STAGING_STREAM + - SLACK_CICD_CHANNEL_ID diff --git a/.github/file-filters.yaml b/.github/file-filters.yaml index 886cd3919a..02ee383d5e 100644 --- a/.github/file-filters.yaml +++ b/.github/file-filters.yaml @@ -1,4 +1,5 @@ rust_code: ['**/*.rs', '**/Cargo.toml', '**/Cargo.lock'] +rust_dependencies: ['**/Cargo.lock'] v14: ['vendor/postgres-v14/**', 'Makefile', 'pgxn/**'] v15: ['vendor/postgres-v15/**', 'Makefile', 'pgxn/**'] diff --git a/.github/workflows/_check-codestyle-rust.yml b/.github/workflows/_check-codestyle-rust.yml index f7518d6500..c4c76914aa 100644 --- a/.github/workflows/_check-codestyle-rust.yml +++ b/.github/workflows/_check-codestyle-rust.yml @@ -87,8 +87,3 @@ jobs: run: | cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack - - # https://github.com/EmbarkStudios/cargo-deny - - name: Check rust licenses/bans/advisories/sources - if: ${{ !cancelled() }} - run: cargo deny check --hide-inclusion-graph diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index e588fc5a0e..1274543429 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -45,6 +45,26 @@ jobs: run cancel-previous-in-concurrency-group.yml \ --field concurrency_group="${{ env.E2E_CONCURRENCY_GROUP }}" + files-changed: + needs: [ check-permissions ] + runs-on: [ self-hosted, small ] + timeout-minutes: 3 + outputs: + check-rust-dependencies: ${{ steps.files-changed.outputs.rust_dependencies }} + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + submodules: true + + - name: Check for file changes + uses: dorny/paths-filter@de90cc6fb38fc0963ad72b210f1f284cd68cea36 # v3.0.2 + id: files-changed + with: + token: ${{ secrets.GITHUB_TOKEN }} + filters: .github/file-filters.yaml + tag: needs: [ check-permissions ] runs-on: [ self-hosted, small ] @@ -170,6 +190,14 @@ jobs: archs: '["x64", "arm64"]' secrets: inherit + check-dependencies-rust: + needs: [ files-changed, build-build-tools-image 
] + if: ${{ needs.files-changed.outputs.check-rust-dependencies == 'true' }} + uses: ./.github/workflows/cargo-deny.yml + with: + build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm + secrets: inherit + build-and-test-locally: needs: [ tag, build-build-tools-image ] strategy: @@ -1332,6 +1360,8 @@ jobs: - build-and-test-locally - check-codestyle-python - check-codestyle-rust + - check-dependencies-rust + - files-changed - promote-images-dev - test-images - trigger-custom-extensions-build-and-wait @@ -1344,4 +1374,11 @@ jobs: if: | contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled') - || contains(needs.*.result, 'skipped') + || (needs.check-dependencies-rust.result == 'skipped' && needs.files-changed.outputs.check-rust-dependencies == 'true') + || needs.build-and-test-locally.result == 'skipped' + || needs.check-codestyle-python.result == 'skipped' + || needs.check-codestyle-rust.result == 'skipped' + || needs.files-changed.result == 'skipped' + || needs.promote-images-dev.result == 'skipped' + || needs.test-images.result == 'skipped' + || needs.trigger-custom-extensions-build-and-wait.result == 'skipped' diff --git a/.github/workflows/cargo-deny.yml b/.github/workflows/cargo-deny.yml new file mode 100644 index 0000000000..433b377c32 --- /dev/null +++ b/.github/workflows/cargo-deny.yml @@ -0,0 +1,57 @@ +name: cargo deny checks + +on: + workflow_call: + inputs: + build-tools-image: + required: false + type: string + schedule: + - cron: '0 0 * * *' + +jobs: + cargo-deny: + strategy: + matrix: + ref: >- + ${{ + fromJSON( + github.event_name == 'schedule' + && '["main","release","release-proxy","release-compute"]' + || format('["{0}"]', github.sha) + ) + }} + + runs-on: [self-hosted, small] + + container: + image: ${{ inputs.build-tools-image || 'neondatabase/build-tools:pinned' }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + options: --init + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + ref: ${{ matrix.ref }} + + - name: Check rust licenses/bans/advisories/sources + env: + CARGO_DENY_TARGET: >- + ${{ github.event_name == 'schedule' && 'advisories' || 'all' }} + run: cargo deny check --hide-inclusion-graph $CARGO_DENY_TARGET + + - name: Post to a Slack channel + if: ${{ github.event_name == 'schedule' && failure() }} + uses: slackapi/slack-github-action@v2 + with: + method: chat.postMessage + token: ${{ secrets.SLACK_BOT_TOKEN }} + payload: | + channel: ${{ vars.SLACK_CICD_CHANNEL_ID }} + text: | + Periodic cargo-deny on ${{ matrix.ref }}: ${{ job.status }} + <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> + Pinging @oncall-devprod. 
diff --git a/.github/workflows/pre-merge-checks.yml b/.github/workflows/pre-merge-checks.yml index e6dfbaeed8..e92a153db9 100644 --- a/.github/workflows/pre-merge-checks.yml +++ b/.github/workflows/pre-merge-checks.yml @@ -124,6 +124,7 @@ jobs: - name: Fail the job if any of the dependencies do not succeed or skipped run: exit 1 if: | - (contains(needs.check-codestyle-python.result, 'skipped') && needs.get-changed-files.outputs.python-changed == 'true') + (needs.check-codestyle-python.result == 'skipped' && needs.get-changed-files.outputs.python-changed == 'true') + || (needs.check-codestyle-rust.result == 'skipped' && needs.get-changed-files.outputs.rust-changed == 'true') || contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled') From a93e9f22fc0e35ad2863d1e57db9f3a01326b710 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 31 Jan 2025 17:43:31 +0000 Subject: [PATCH 335/583] pageserver: remove faulty debug assertion in compaction (#10610) ## Problem This assertion is incorrect: it is legal to see another shard's data at this point, after a shard split. Closes: https://github.com/neondatabase/neon/issues/10609 ## Summary of changes - Remove faulty assertion --- pageserver/src/tenant/timeline/compaction.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 7242f73a82..7244e946cb 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -1503,11 +1503,9 @@ impl Timeline { .await .map_err(CompactionError::Other)?; } else { - let shard = self.shard_identity.shard_index(); let owner = self.shard_identity.get_shard_number(&key); - if cfg!(debug_assertions) { - panic!("key {key} does not belong on shard {shard}, owned by {owner}"); - } + + // This happens after a shard split, when we're compacting an L0 created by our parent shard debug!("dropping key {key} during compaction (it belongs on shard {owner})"); } From aedeb1c7c277703af861894455764a8c248df9b8 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 31 Jan 2025 17:43:54 +0000 Subject: [PATCH 336/583] pageserver: revise logging of cancelled request results (#10604) ## Problem When a client dropped before a request completed, and a handler returned an ApiError, we would log that at error severity. That was excessive in the case of a request erroring on a shutdown, and could cause test flakes. 
example: https://neon-github-public-dev.s3.amazonaws.com/reports/main/13067651123/index.html#suites/ad9c266207b45eafe19909d1020dd987/6021ce86a0d72ae7/ ``` Cancelled request finished with an error: ShuttingDown ``` ## Summary of changes - Log a different info-level on ShuttingDown and ResourceUnavailable API errors from cancelled requests --- pageserver/src/http/routes.rs | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index eb9cb4da0c..94f7510a4a 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -3393,7 +3393,17 @@ where let status = response.status(); info!(%status, "Cancelled request finished successfully") } - Err(e) => error!("Cancelled request finished with an error: {e:?}"), + Err(e) => match e { + ApiError::ShuttingDown | ApiError::ResourceUnavailable(_) => { + // Don't log this at error severity: they are normal during lifecycle of tenants/process + info!("Cancelled request aborted for shutdown") + } + _ => { + // Log these in a highly visible way, because we have no client to send the response to, but + // would like to know that something went wrong. + error!("Cancelled request finished with an error: {e:?}") + } + }, } } // only logging for cancelled panicked request handlers is the tracing_panic_hook, From 48c87dc458a84fa9132ff22b1ae1fcc3d3094cda Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 31 Jan 2025 18:07:26 +0000 Subject: [PATCH 337/583] CI(pre-merge-checks): fix condition (#10617) ## Problem Merge Queue fails if changes include Rust code. ## Summary of changes - Fix condition for `build-build-tools-image` - Add a couple of no-op `false ||` to make predicates look symmetric --- .github/workflows/pre-merge-checks.yml | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/.github/workflows/pre-merge-checks.yml b/.github/workflows/pre-merge-checks.yml index e92a153db9..d39ccecac9 100644 --- a/.github/workflows/pre-merge-checks.yml +++ b/.github/workflows/pre-merge-checks.yml @@ -59,7 +59,10 @@ jobs: echo "${RUST_CHANGED_FILES}" build-build-tools-image: - if: needs.get-changed-files.outputs.python-changed == 'true' + if: | + false + || needs.get-changed-files.outputs.python-changed == 'true' + || needs.get-changed-files.outputs.rust-changed == 'true' needs: [ get-changed-files ] uses: ./.github/workflows/build-build-tools-image.yml with: @@ -124,7 +127,8 @@ jobs: - name: Fail the job if any of the dependencies do not succeed or skipped run: exit 1 if: | - (needs.check-codestyle-python.result == 'skipped' && needs.get-changed-files.outputs.python-changed == 'true') - || (needs.check-codestyle-rust.result == 'skipped' && needs.get-changed-files.outputs.rust-changed == 'true') + false + || (needs.check-codestyle-python.result == 'skipped' && needs.get-changed-files.outputs.python-changed == 'true') + || (needs.check-codestyle-rust.result == 'skipped' && needs.get-changed-files.outputs.rust-changed == 'true') || contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled') From bc7822d90c82046de709b211faa03d3f720a6931 Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Fri, 31 Jan 2025 19:41:17 +0100 Subject: [PATCH 338/583] temporarily disable some steps and run more often to expose more pgbench --initialize in benchmarking workflow (#10616) ## Problem we want to disable some steps in benchmarking workflow that do not initialize new projects and instead run the test more frequently Test run 
https://github.com/neondatabase/neon/actions/runs/13077737888 --- .github/workflows/benchmarking.yml | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 49f23e895b..20a8a6e2c9 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -11,7 +11,8 @@ on: # │ │ ┌───────────── day of the month (1 - 31) # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) - - cron: '0 3 * * *' # run once a day, timezone is utc + # - cron: '0 3 * * *' # run once a day, timezone is utc + - cron: '0 */10 * * *' # Runs every 10 hours at minute 0 workflow_dispatch: # adds ability to run this manually inputs: region_id: @@ -550,6 +551,7 @@ jobs: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} pgbench-pgvector: + if: false permissions: contents: write statuses: write @@ -683,7 +685,8 @@ jobs: # # *_CLICKBENCH_CONNSTR: Genuine ClickBench DB with ~100M rows # *_CLICKBENCH_10M_CONNSTR: DB with the first 10M rows of ClickBench DB - if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} + # if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} + if: false permissions: contents: write statuses: write @@ -810,7 +813,8 @@ jobs: # We might change it after https://github.com/neondatabase/neon/issues/2900. # # *_TPCH_S10_CONNSTR: DB generated with scale factor 10 (~10 GB) - if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} + # if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} + if: false permissions: contents: write statuses: write @@ -929,7 +933,8 @@ jobs: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} user-examples-compare: - if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} + # if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} + if: false permissions: contents: write statuses: write From fcd195c2b63fdfbb5323258a5422469e1e850175 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Fri, 31 Jan 2025 13:04:26 -0600 Subject: [PATCH 339/583] Migrate compute_ctl arg parsing to clap derive (#10497) The primary benefit is that all the ad hoc get_matches() calls are no longer necessary. Now all it takes to get at the CLI arguments is referencing a struct member. It's also great the we can replace the ad hoc CLI struct we had with this more formal solution. Signed-off-by: Tristan Partin --- compute_tools/src/bin/compute_ctl.rs | 348 +++++++++------------------ 1 file changed, 112 insertions(+), 236 deletions(-) diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index b98cf706d3..47fc9cb7fe 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -34,6 +34,7 @@ //! -r http://pg-ext-s3-gateway \ //! 
``` use std::collections::HashMap; +use std::ffi::OsString; use std::fs::File; use std::path::Path; use std::process::exit; @@ -44,7 +45,7 @@ use std::{thread, time::Duration}; use anyhow::{Context, Result}; use chrono::Utc; -use clap::Arg; +use clap::Parser; use compute_tools::disk_quota::set_disk_quota; use compute_tools::lsn_lease::launch_lsn_lease_bg_task_for_static; use signal_hook::consts::{SIGQUIT, SIGTERM}; @@ -73,10 +74,75 @@ use utils::failpoint_support; // in-case of not-set environment var const BUILD_TAG_DEFAULT: &str = "latest"; -fn main() -> Result<()> { - let scenario = failpoint_support::init(); +// Compatibility hack: if the control plane specified any remote-ext-config +// use the default value for extension storage proxy gateway. +// Remove this once the control plane is updated to pass the gateway URL +fn parse_remote_ext_config(arg: &str) -> Result { + if arg.starts_with("http") { + Ok(arg.trim_end_matches('/').to_string()) + } else { + Ok("http://pg-ext-s3-gateway".to_string()) + } +} - let (build_tag, clap_args) = init()?; +#[derive(Parser)] +#[command(rename_all = "kebab-case")] +struct Cli { + #[arg(short = 'b', long, default_value = "postgres", env = "POSTGRES_PATH")] + pub pgbin: String, + + #[arg(short = 'r', long, value_parser = parse_remote_ext_config)] + pub remote_ext_config: Option, + + #[arg(long, default_value_t = 3080)] + pub http_port: u16, + + #[arg(short = 'D', long, value_name = "DATADIR")] + pub pgdata: String, + + #[arg(short = 'C', long, value_name = "DATABASE_URL")] + pub connstr: String, + + #[cfg(target_os = "linux")] + #[arg(long, default_value = "neon-postgres")] + pub cgroup: String, + + #[cfg(target_os = "linux")] + #[arg( + long, + default_value = "host=localhost port=5432 dbname=postgres user=cloud_admin sslmode=disable application_name=vm-monitor" + )] + pub filecache_connstr: String, + + #[cfg(target_os = "linux")] + #[arg(long, default_value = "0.0.0.0:10301")] + pub vm_monitor_addr: String, + + #[arg(long, action = clap::ArgAction::SetTrue)] + pub resize_swap_on_bind: bool, + + #[arg(long)] + pub set_disk_quota_for_fs: Option, + + #[arg(short = 's', long = "spec", group = "spec")] + pub spec_json: Option, + + #[arg(short = 'S', long, group = "spec-path")] + pub spec_path: Option, + + #[arg(short = 'i', long, group = "compute-id", conflicts_with_all = ["spec", "spec-path"])] + pub compute_id: Option, + + #[arg(short = 'p', long, conflicts_with_all = ["spec", "spec-path"], requires = "compute-id", value_name = "CONTROL_PLANE_API_BASE_URL")] + pub control_plane_uri: Option, +} + +fn main() -> Result<()> { + let cli = Cli::parse(); + + let build_tag = init()?; + + let scenario = failpoint_support::init(); // enable core dumping for all child processes setrlimit(Resource::CORE, rlimit::INFINITY, rlimit::INFINITY)?; @@ -85,13 +151,11 @@ fn main() -> Result<()> { // Enter startup tracing context let _startup_context_guard = startup_context_from_env(); - let cli_args = process_cli(&clap_args)?; + let cli_spec = try_spec_from_cli(&cli)?; - let cli_spec = try_spec_from_cli(&clap_args, &cli_args)?; + let compute = wait_spec(build_tag, &cli, cli_spec)?; - let wait_spec_result = wait_spec(build_tag, cli_args, cli_spec)?; - - start_postgres(&clap_args, wait_spec_result)? + start_postgres(&cli, compute)? 
// Startup is finished, exit the startup tracing span }; @@ -108,7 +172,7 @@ fn main() -> Result<()> { deinit_and_exit(wait_pg_result); } -fn init() -> Result<(String, clap::ArgMatches)> { +fn init() -> Result { init_tracing_and_logging(DEFAULT_LOG_LEVEL)?; let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?; @@ -123,66 +187,7 @@ fn init() -> Result<(String, clap::ArgMatches)> { .to_string(); info!("build_tag: {build_tag}"); - Ok((build_tag, cli().get_matches())) -} - -fn process_cli(matches: &clap::ArgMatches) -> Result { - let pgbin_default = "postgres"; - let pgbin = matches - .get_one::("pgbin") - .map(|s| s.as_str()) - .unwrap_or(pgbin_default); - - let ext_remote_storage = matches - .get_one::("remote-ext-config") - // Compatibility hack: if the control plane specified any remote-ext-config - // use the default value for extension storage proxy gateway. - // Remove this once the control plane is updated to pass the gateway URL - .map(|conf| { - if conf.starts_with("http") { - conf.trim_end_matches('/') - } else { - "http://pg-ext-s3-gateway" - } - }); - - let http_port = *matches - .get_one::("http-port") - .expect("http-port is required"); - let pgdata = matches - .get_one::("pgdata") - .expect("PGDATA path is required"); - let connstr = matches - .get_one::("connstr") - .expect("Postgres connection string is required"); - let spec_json = matches.get_one::("spec"); - let spec_path = matches.get_one::("spec-path"); - let resize_swap_on_bind = matches.get_flag("resize-swap-on-bind"); - let set_disk_quota_for_fs = matches.get_one::("set-disk-quota-for-fs"); - - Ok(ProcessCliResult { - connstr, - pgdata, - pgbin, - ext_remote_storage, - http_port, - spec_json, - spec_path, - resize_swap_on_bind, - set_disk_quota_for_fs, - }) -} - -struct ProcessCliResult<'clap> { - connstr: &'clap str, - pgdata: &'clap str, - pgbin: &'clap str, - ext_remote_storage: Option<&'clap str>, - http_port: u16, - spec_json: Option<&'clap String>, - spec_path: Option<&'clap String>, - resize_swap_on_bind: bool, - set_disk_quota_for_fs: Option<&'clap String>, + Ok(build_tag) } fn startup_context_from_env() -> Option { @@ -235,19 +240,9 @@ fn startup_context_from_env() -> Option { } } -fn try_spec_from_cli( - matches: &clap::ArgMatches, - ProcessCliResult { - spec_json, - spec_path, - .. 
- }: &ProcessCliResult, -) -> Result { - let compute_id = matches.get_one::("compute-id"); - let control_plane_uri = matches.get_one::("control-plane-uri"); - +fn try_spec_from_cli(cli: &Cli) -> Result { // First, try to get cluster spec from the cli argument - if let Some(spec_json) = spec_json { + if let Some(ref spec_json) = cli.spec_json { info!("got spec from cli argument {}", spec_json); return Ok(CliSpecParams { spec: Some(serde_json::from_str(spec_json)?), @@ -256,7 +251,7 @@ fn try_spec_from_cli( } // Second, try to read it from the file if path is provided - if let Some(spec_path) = spec_path { + if let Some(ref spec_path) = cli.spec_path { let file = File::open(Path::new(spec_path))?; return Ok(CliSpecParams { spec: Some(serde_json::from_reader(file)?), @@ -264,17 +259,20 @@ fn try_spec_from_cli( }); } - let Some(compute_id) = compute_id else { + if cli.compute_id.is_none() { panic!( "compute spec should be provided by one of the following ways: \ --spec OR --spec-path OR --control-plane-uri and --compute-id" ); }; - let Some(control_plane_uri) = control_plane_uri else { + if cli.control_plane_uri.is_none() { panic!("must specify both --control-plane-uri and --compute-id or none"); }; - match get_spec_from_control_plane(control_plane_uri, compute_id) { + match get_spec_from_control_plane( + cli.control_plane_uri.as_ref().unwrap(), + cli.compute_id.as_ref().unwrap(), + ) { Ok(spec) => Ok(CliSpecParams { spec, live_config_allowed: true, @@ -298,21 +296,12 @@ struct CliSpecParams { fn wait_spec( build_tag: String, - ProcessCliResult { - connstr, - pgdata, - pgbin, - ext_remote_storage, - resize_swap_on_bind, - set_disk_quota_for_fs, - http_port, - .. - }: ProcessCliResult, + cli: &Cli, CliSpecParams { spec, live_config_allowed, }: CliSpecParams, -) -> Result { +) -> Result> { let mut new_state = ComputeState::new(); let spec_set; @@ -324,7 +313,7 @@ fn wait_spec( } else { spec_set = false; } - let connstr = Url::parse(connstr).context("cannot parse connstr as a URL")?; + let connstr = Url::parse(&cli.connstr).context("cannot parse connstr as a URL")?; let conn_conf = postgres::config::Config::from_str(connstr.as_str()) .context("cannot build postgres config from connstr")?; let tokio_conn_conf = tokio_postgres::config::Config::from_str(connstr.as_str()) @@ -333,14 +322,14 @@ fn wait_spec( connstr, conn_conf, tokio_conn_conf, - pgdata: pgdata.to_string(), - pgbin: pgbin.to_string(), - pgversion: get_pg_version_string(pgbin), - http_port, + pgdata: cli.pgdata.clone(), + pgbin: cli.pgbin.clone(), + pgversion: get_pg_version_string(&cli.pgbin), + http_port: cli.http_port, live_config_allowed, state: Mutex::new(new_state), state_changed: Condvar::new(), - ext_remote_storage: ext_remote_storage.map(|s| s.to_string()), + ext_remote_storage: cli.remote_ext_config.clone(), ext_download_progress: RwLock::new(HashMap::new()), build_tag, }; @@ -357,7 +346,7 @@ fn wait_spec( // Launch http service first, so that we can serve control-plane requests // while configuration is still in progress. let _http_handle = - launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread"); + launch_http_server(cli.http_port, &compute).expect("cannot launch http endpoint thread"); if !spec_set { // No spec provided, hang waiting for it. 
@@ -389,27 +378,12 @@ fn wait_spec( launch_lsn_lease_bg_task_for_static(&compute); - Ok(WaitSpecResult { - compute, - resize_swap_on_bind, - set_disk_quota_for_fs: set_disk_quota_for_fs.cloned(), - }) -} - -struct WaitSpecResult { - compute: Arc, - resize_swap_on_bind: bool, - set_disk_quota_for_fs: Option, + Ok(compute) } fn start_postgres( - // need to allow unused because `matches` is only used if target_os = "linux" - #[allow(unused_variables)] matches: &clap::ArgMatches, - WaitSpecResult { - compute, - resize_swap_on_bind, - set_disk_quota_for_fs, - }: WaitSpecResult, + cli: &Cli, + compute: Arc, ) -> Result<(Option, StartPostgresResult)> { // We got all we need, update the state. let mut state = compute.state.lock().unwrap(); @@ -437,7 +411,7 @@ fn start_postgres( let mut delay_exit = false; // Resize swap to the desired size if the compute spec says so - if let (Some(size_bytes), true) = (swap_size_bytes, resize_swap_on_bind) { + if let (Some(size_bytes), true) = (swap_size_bytes, cli.resize_swap_on_bind) { // To avoid 'swapoff' hitting postgres startup, we need to run resize-swap to completion // *before* starting postgres. // @@ -464,9 +438,9 @@ fn start_postgres( // Set disk quota if the compute spec says so if let (Some(disk_quota_bytes), Some(disk_quota_fs_mountpoint)) = - (disk_quota_bytes, set_disk_quota_for_fs) + (disk_quota_bytes, cli.set_disk_quota_for_fs.as_ref()) { - match set_disk_quota(disk_quota_bytes, &disk_quota_fs_mountpoint) { + match set_disk_quota(disk_quota_bytes, disk_quota_fs_mountpoint) { Ok(()) => { let size_mib = disk_quota_bytes as f32 / (1 << 20) as f32; // just for more coherent display. info!(%disk_quota_bytes, %size_mib, "set disk quota"); @@ -509,13 +483,7 @@ fn start_postgres( if #[cfg(target_os = "linux")] { use std::env; use tokio_util::sync::CancellationToken; - let vm_monitor_addr = matches - .get_one::("vm-monitor-addr") - .expect("--vm-monitor-addr should always be set because it has a default arg"); - let file_cache_connstr = matches.get_one::("filecache-connstr"); - let cgroup = matches.get_one::("cgroup"); - // Only make a runtime if we need to. // Note: it seems like you can make a runtime in an inner scope and // if you start a task in it it won't be dropped. However, make it // in the outermost scope just to be safe. @@ -538,15 +506,15 @@ fn start_postgres( let pgconnstr = if disable_lfc_resizing.unwrap_or(false) { None } else { - file_cache_connstr.cloned() + Some(cli.filecache_connstr.clone()) }; let vm_monitor = rt.as_ref().map(|rt| { rt.spawn(vm_monitor::start( Box::leak(Box::new(vm_monitor::Args { - cgroup: cgroup.cloned(), + cgroup: Some(cli.cgroup.clone()), pgconnstr, - addr: vm_monitor_addr.clone(), + addr: cli.vm_monitor_addr.clone(), })), token.clone(), )) @@ -702,105 +670,6 @@ fn deinit_and_exit(WaitPostgresResult { exit_code }: WaitPostgresResult) -> ! 
{ exit(exit_code.unwrap_or(1)) } -fn cli() -> clap::Command { - // Env variable is set by `cargo` - let version = option_env!("CARGO_PKG_VERSION").unwrap_or("unknown"); - clap::Command::new("compute_ctl") - .version(version) - .arg( - Arg::new("http-port") - .long("http-port") - .value_name("HTTP_PORT") - .default_value("3080") - .value_parser(clap::value_parser!(u16)) - .required(false), - ) - .arg( - Arg::new("connstr") - .short('C') - .long("connstr") - .value_name("DATABASE_URL") - .required(true), - ) - .arg( - Arg::new("pgdata") - .short('D') - .long("pgdata") - .value_name("DATADIR") - .required(true), - ) - .arg( - Arg::new("pgbin") - .short('b') - .long("pgbin") - .default_value("postgres") - .value_name("POSTGRES_PATH"), - ) - .arg( - Arg::new("spec") - .short('s') - .long("spec") - .value_name("SPEC_JSON"), - ) - .arg( - Arg::new("spec-path") - .short('S') - .long("spec-path") - .value_name("SPEC_PATH"), - ) - .arg( - Arg::new("compute-id") - .short('i') - .long("compute-id") - .value_name("COMPUTE_ID"), - ) - .arg( - Arg::new("control-plane-uri") - .short('p') - .long("control-plane-uri") - .value_name("CONTROL_PLANE_API_BASE_URI"), - ) - .arg( - Arg::new("remote-ext-config") - .short('r') - .long("remote-ext-config") - .value_name("REMOTE_EXT_CONFIG"), - ) - // TODO(fprasx): we currently have default arguments because the cloud PR - // to pass them in hasn't been merged yet. We should get rid of them once - // the PR is merged. - .arg( - Arg::new("vm-monitor-addr") - .long("vm-monitor-addr") - .default_value("0.0.0.0:10301") - .value_name("VM_MONITOR_ADDR"), - ) - .arg( - Arg::new("cgroup") - .long("cgroup") - .default_value("neon-postgres") - .value_name("CGROUP"), - ) - .arg( - Arg::new("filecache-connstr") - .long("filecache-connstr") - .default_value( - "host=localhost port=5432 dbname=postgres user=cloud_admin sslmode=disable application_name=vm-monitor", - ) - .value_name("FILECACHE_CONNSTR"), - ) - .arg( - Arg::new("resize-swap-on-bind") - .long("resize-swap-on-bind") - .action(clap::ArgAction::SetTrue), - ) - .arg( - Arg::new("set-disk-quota-for-fs") - .long("set-disk-quota-for-fs") - .value_name("SET_DISK_QUOTA_FOR_FS") - ) -} - /// When compute_ctl is killed, send also termination signal to sync-safekeepers /// to prevent leakage. TODO: it is better to convert compute_ctl to async and /// wait for termination which would be easy then. @@ -810,7 +679,14 @@ fn handle_exit_signal(sig: i32) { exit(1); } -#[test] -fn verify_cli() { - cli().debug_assert() +#[cfg(test)] +mod test { + use clap::CommandFactory; + + use super::Cli; + + #[test] + fn verify_cli() { + Cli::command().debug_assert() + } } From ad1a41157affa94c9e818239b7c2d9fd26bb3de6 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Fri, 31 Jan 2025 19:14:27 +0000 Subject: [PATCH 340/583] feat(proxy): optimizing the chances of large write in copy_bidirectional (#10608) We forked copy_bidirectional to solve some issues like fast-shutdown (disallowing half-open connections) and to introduce better error tracking (which side of the conn closed down). A change recently made its way upstream offering performance improvements: https://github.com/tokio-rs/tokio/pull/6532. These seem applicable to our fork, thus it makes sense to apply them here as well. 
--- proxy/src/proxy/copy_bidirectional.rs | 35 +++++++++++++++------------ 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/proxy/src/proxy/copy_bidirectional.rs b/proxy/src/proxy/copy_bidirectional.rs index 3336a9556a..861f1766e8 100644 --- a/proxy/src/proxy/copy_bidirectional.rs +++ b/proxy/src/proxy/copy_bidirectional.rs @@ -201,25 +201,26 @@ impl CopyBuffer { W: AsyncWrite + ?Sized, { loop { - // If our buffer is empty, then we need to read some data to - // continue. - if self.pos == self.cap && !self.read_done { - self.pos = 0; - self.cap = 0; - + // If there is some space left in our buffer, then we try to read some + // data to continue, thus maximizing the chances of a large write. + if self.cap < self.buf.len() && !self.read_done { match self.poll_fill_buf(cx, reader.as_mut()) { Poll::Ready(Ok(())) => (), Poll::Ready(Err(err)) => return Poll::Ready(Err(ErrorDirection::Read(err))), Poll::Pending => { - // Try flushing when the reader has no progress to avoid deadlock - // when the reader depends on buffered writer. - if self.need_flush { - ready!(writer.as_mut().poll_flush(cx)) - .map_err(ErrorDirection::Write)?; - self.need_flush = false; - } + // Ignore pending reads when our buffer is not empty, because + // we can try to write data immediately. + if self.pos == self.cap { + // Try flushing when the reader has no progress to avoid deadlock + // when the reader depends on buffered writer. + if self.need_flush { + ready!(writer.as_mut().poll_flush(cx)) + .map_err(ErrorDirection::Write)?; + self.need_flush = false; + } - return Poll::Pending; + return Poll::Pending; + } } } } @@ -246,9 +247,13 @@ impl CopyBuffer { "writer returned length larger than input slice" ); + // All data has been written, the buffer can be considered empty again + self.pos = 0; + self.cap = 0; + // If we've written all the data and we've seen EOF, flush out the // data and finish the transfer. 
- if self.pos == self.cap && self.read_done { + if self.read_done { ready!(writer.as_mut().poll_flush(cx)).map_err(ErrorDirection::Write)?; return Poll::Ready(Ok(self.amt)); } From 6dd48ba148af2eaf90c9d8b5505a760a9995f173 Mon Sep 17 00:00:00 2001 From: Stefan Radig Date: Fri, 31 Jan 2025 21:32:57 +0100 Subject: [PATCH 341/583] feat(proxy): Implement access control with VPC endpoint checks and block for public internet / VPC (#10143) - Wired up filtering on VPC endpoints - Wired up block access from public internet / VPC depending on per project flag - Added cache invalidation for VPC endpoints (partially based on PR from Raphael) - Removed BackendIpAllowlist trait --------- Co-authored-by: Ivan Efremov --- proxy/src/auth/backend/console_redirect.rs | 32 ++- proxy/src/auth/backend/mod.rs | 154 ++++++++---- proxy/src/auth/mod.rs | 25 ++ proxy/src/bin/local_proxy.rs | 1 + proxy/src/bin/proxy.rs | 1 + proxy/src/cache/project_info.rs | 224 +++++++++++++++++- proxy/src/cancellation.rs | 55 ++++- proxy/src/config.rs | 1 + proxy/src/console_redirect_proxy.rs | 3 +- proxy/src/context/mod.rs | 11 +- .../control_plane/client/cplane_proxy_v1.rs | 174 +++++++++++++- proxy/src/control_plane/client/mock.rs | 42 +++- proxy/src/control_plane/client/mod.rs | 51 +++- proxy/src/control_plane/messages.rs | 26 +- proxy/src/control_plane/mod.rs | 33 ++- proxy/src/intern.rs | 22 +- proxy/src/metrics.rs | 13 + proxy/src/proxy/mod.rs | 3 +- proxy/src/proxy/tests/mod.rs | 16 +- proxy/src/redis/notifications.rs | 48 +++- proxy/src/serverless/backend.rs | 42 +++- proxy/src/types.rs | 2 + 22 files changed, 845 insertions(+), 134 deletions(-) diff --git a/proxy/src/auth/backend/console_redirect.rs b/proxy/src/auth/backend/console_redirect.rs index 1cbf91d3ae..9be29c38c9 100644 --- a/proxy/src/auth/backend/console_redirect.rs +++ b/proxy/src/auth/backend/console_redirect.rs @@ -7,8 +7,8 @@ use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, info_span}; -use super::{ComputeCredentialKeys, ControlPlaneApi}; -use crate::auth::backend::{BackendIpAllowlist, ComputeUserInfo}; +use super::ComputeCredentialKeys; +use crate::auth::backend::ComputeUserInfo; use crate::auth::IpPattern; use crate::cache::Cached; use crate::config::AuthenticationConfig; @@ -84,26 +84,15 @@ pub(crate) fn new_psql_session_id() -> String { hex::encode(rand::random::<[u8; 8]>()) } -#[async_trait] -impl BackendIpAllowlist for ConsoleRedirectBackend { - async fn get_allowed_ips( - &self, - ctx: &RequestContext, - user_info: &ComputeUserInfo, - ) -> auth::Result> { - self.api - .get_allowed_ips_and_secret(ctx, user_info) - .await - .map(|(ips, _)| ips.as_ref().clone()) - .map_err(|e| e.into()) - } -} - impl ConsoleRedirectBackend { pub fn new(console_uri: reqwest::Url, api: cplane_proxy_v1::NeonControlPlaneClient) -> Self { Self { console_uri, api } } + pub(crate) fn get_api(&self) -> &cplane_proxy_v1::NeonControlPlaneClient { + &self.api + } + pub(crate) async fn authenticate( &self, ctx: &RequestContext, @@ -191,6 +180,15 @@ async fn authenticate( } } + // Check if the access over the public internet is allowed, otherwise block. Note that + // the console redirect is not behind the VPC service endpoint, so we don't need to check + // the VPC endpoint ID. 
+ if let Some(public_access_allowed) = db_info.public_access_allowed { + if !public_access_allowed { + return Err(auth::AuthError::NetworkNotAllowed); + } + } + client.write_message_noflush(&Be::NoticeResponse("Connecting to database."))?; // This config should be self-contained, because we won't diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs index d17d91a56d..7ef096207a 100644 --- a/proxy/src/auth/backend/mod.rs +++ b/proxy/src/auth/backend/mod.rs @@ -26,10 +26,12 @@ use crate::context::RequestContext; use crate::control_plane::client::ControlPlaneClient; use crate::control_plane::errors::GetAuthInfoError; use crate::control_plane::{ - self, AuthSecret, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, ControlPlaneApi, + self, AccessBlockerFlags, AuthSecret, CachedAccessBlockerFlags, CachedAllowedIps, + CachedAllowedVpcEndpointIds, CachedNodeInfo, CachedRoleSecret, ControlPlaneApi, }; use crate::intern::EndpointIdInt; use crate::metrics::Metrics; +use crate::protocol2::ConnectionInfoExtra; use crate::proxy::connect_compute::ComputeConnectBackend; use crate::proxy::NeonOptions; use crate::rate_limiter::{BucketRateLimiter, EndpointRateLimiter}; @@ -99,6 +101,13 @@ impl Backend<'_, T> { Self::Local(l) => Backend::Local(MaybeOwned::Borrowed(l)), } } + + pub(crate) fn get_api(&self) -> &ControlPlaneClient { + match self { + Self::ControlPlane(api, _) => api, + Self::Local(_) => panic!("Local backend has no API"), + } + } } impl<'a, T> Backend<'a, T> { @@ -247,15 +256,6 @@ impl AuthenticationConfig { } } -#[async_trait::async_trait] -pub(crate) trait BackendIpAllowlist { - async fn get_allowed_ips( - &self, - ctx: &RequestContext, - user_info: &ComputeUserInfo, - ) -> auth::Result>; -} - /// True to its name, this function encapsulates our current auth trade-offs. /// Here, we choose the appropriate auth flow based on circumstances. /// @@ -282,23 +282,51 @@ async fn auth_quirks( Ok(info) => (info, None), }; - debug!("fetching user's authentication info"); - let (allowed_ips, maybe_secret) = api.get_allowed_ips_and_secret(ctx, &info).await?; + debug!("fetching authentication info and allowlists"); // check allowed list - if config.ip_allowlist_check_enabled - && !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) - { - return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr())); + let allowed_ips = if config.ip_allowlist_check_enabled { + let allowed_ips = api.get_allowed_ips(ctx, &info).await?; + if !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) { + return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr())); + } + allowed_ips + } else { + Cached::new_uncached(Arc::new(vec![])) + }; + + // check if a VPC endpoint ID is coming in and if yes, if it's allowed + let access_blocks = api.get_block_public_or_vpc_access(ctx, &info).await?; + if config.is_vpc_acccess_proxy { + if access_blocks.vpc_access_blocked { + return Err(AuthError::NetworkNotAllowed); + } + + let incoming_vpc_endpoint_id = match ctx.extra() { + None => return Err(AuthError::MissingEndpointName), + Some(ConnectionInfoExtra::Aws { vpce_id }) => { + // Convert the vcpe_id to a string + String::from_utf8(vpce_id.to_vec()).unwrap_or_default() + } + Some(ConnectionInfoExtra::Azure { link_id }) => link_id.to_string(), + }; + let allowed_vpc_endpoint_ids = api.get_allowed_vpc_endpoint_ids(ctx, &info).await?; + // TODO: For now an empty VPC endpoint ID list means all are allowed. We should replace that. 
+ if !allowed_vpc_endpoint_ids.is_empty() + && !allowed_vpc_endpoint_ids.contains(&incoming_vpc_endpoint_id) + { + return Err(AuthError::vpc_endpoint_id_not_allowed( + incoming_vpc_endpoint_id, + )); + } + } else if access_blocks.public_access_blocked { + return Err(AuthError::NetworkNotAllowed); } if !endpoint_rate_limiter.check(info.endpoint.clone().into(), 1) { return Err(AuthError::too_many_connections()); } - let cached_secret = match maybe_secret { - Some(secret) => secret, - None => api.get_role_secret(ctx, &info).await?, - }; + let cached_secret = api.get_role_secret(ctx, &info).await?; let (cached_entry, secret) = cached_secret.take_value(); let secret = if let Some(secret) = secret { @@ -440,34 +468,38 @@ impl Backend<'_, ComputeUserInfo> { } } - pub(crate) async fn get_allowed_ips_and_secret( + pub(crate) async fn get_allowed_ips( &self, ctx: &RequestContext, - ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { + ) -> Result { match self { - Self::ControlPlane(api, user_info) => { - api.get_allowed_ips_and_secret(ctx, user_info).await - } - Self::Local(_) => Ok((Cached::new_uncached(Arc::new(vec![])), None)), + Self::ControlPlane(api, user_info) => api.get_allowed_ips(ctx, user_info).await, + Self::Local(_) => Ok(Cached::new_uncached(Arc::new(vec![]))), } } -} -#[async_trait::async_trait] -impl BackendIpAllowlist for Backend<'_, ()> { - async fn get_allowed_ips( + pub(crate) async fn get_allowed_vpc_endpoint_ids( &self, ctx: &RequestContext, - user_info: &ComputeUserInfo, - ) -> auth::Result> { - let auth_data = match self { - Self::ControlPlane(api, ()) => api.get_allowed_ips_and_secret(ctx, user_info).await, - Self::Local(_) => Ok((Cached::new_uncached(Arc::new(vec![])), None)), - }; + ) -> Result { + match self { + Self::ControlPlane(api, user_info) => { + api.get_allowed_vpc_endpoint_ids(ctx, user_info).await + } + Self::Local(_) => Ok(Cached::new_uncached(Arc::new(vec![]))), + } + } - auth_data - .map(|(ips, _)| ips.as_ref().clone()) - .map_err(|e| e.into()) + pub(crate) async fn get_block_public_or_vpc_access( + &self, + ctx: &RequestContext, + ) -> Result { + match self { + Self::ControlPlane(api, user_info) => { + api.get_block_public_or_vpc_access(ctx, user_info).await + } + Self::Local(_) => Ok(Cached::new_uncached(AccessBlockerFlags::default())), + } } } @@ -514,7 +546,10 @@ mod tests { use crate::auth::{ComputeUserInfoMaybeEndpoint, IpPattern}; use crate::config::AuthenticationConfig; use crate::context::RequestContext; - use crate::control_plane::{self, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret}; + use crate::control_plane::{ + self, AccessBlockerFlags, CachedAccessBlockerFlags, CachedAllowedIps, + CachedAllowedVpcEndpointIds, CachedNodeInfo, CachedRoleSecret, + }; use crate::proxy::NeonOptions; use crate::rate_limiter::{EndpointRateLimiter, RateBucketInfo}; use crate::scram::threadpool::ThreadPool; @@ -523,6 +558,8 @@ mod tests { struct Auth { ips: Vec, + vpc_endpoint_ids: Vec, + access_blocker_flags: AccessBlockerFlags, secret: AuthSecret, } @@ -535,17 +572,31 @@ mod tests { Ok(CachedRoleSecret::new_uncached(Some(self.secret.clone()))) } - async fn get_allowed_ips_and_secret( + async fn get_allowed_ips( &self, _ctx: &RequestContext, _user_info: &super::ComputeUserInfo, - ) -> Result< - (CachedAllowedIps, Option), - control_plane::errors::GetAuthInfoError, - > { - Ok(( - CachedAllowedIps::new_uncached(Arc::new(self.ips.clone())), - Some(CachedRoleSecret::new_uncached(Some(self.secret.clone()))), + ) -> Result { + 
Ok(CachedAllowedIps::new_uncached(Arc::new(self.ips.clone()))) + } + + async fn get_allowed_vpc_endpoint_ids( + &self, + _ctx: &RequestContext, + _user_info: &super::ComputeUserInfo, + ) -> Result { + Ok(CachedAllowedVpcEndpointIds::new_uncached(Arc::new( + self.vpc_endpoint_ids.clone(), + ))) + } + + async fn get_block_public_or_vpc_access( + &self, + _ctx: &RequestContext, + _user_info: &super::ComputeUserInfo, + ) -> Result { + Ok(CachedAccessBlockerFlags::new_uncached( + self.access_blocker_flags.clone(), )) } @@ -575,6 +626,7 @@ mod tests { rate_limiter: AuthRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET), rate_limit_ip_subnet: 64, ip_allowlist_check_enabled: true, + is_vpc_acccess_proxy: false, is_auth_broker: false, accept_jwts: false, console_redirect_confirmation_timeout: std::time::Duration::from_secs(5), @@ -642,6 +694,8 @@ mod tests { let ctx = RequestContext::test(); let api = Auth { ips: vec![], + vpc_endpoint_ids: vec![], + access_blocker_flags: AccessBlockerFlags::default(), secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()), }; @@ -722,6 +776,8 @@ mod tests { let ctx = RequestContext::test(); let api = Auth { ips: vec![], + vpc_endpoint_ids: vec![], + access_blocker_flags: AccessBlockerFlags::default(), secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()), }; @@ -774,6 +830,8 @@ mod tests { let ctx = RequestContext::test(); let api = Auth { ips: vec![], + vpc_endpoint_ids: vec![], + access_blocker_flags: AccessBlockerFlags::default(), secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()), }; diff --git a/proxy/src/auth/mod.rs b/proxy/src/auth/mod.rs index 0198cc306e..6082695a6b 100644 --- a/proxy/src/auth/mod.rs +++ b/proxy/src/auth/mod.rs @@ -55,6 +55,12 @@ pub(crate) enum AuthError { )] MissingEndpointName, + #[error( + "VPC endpoint ID is not specified. \ + This endpoint requires a VPC endpoint ID to connect." + )] + MissingVPCEndpointId, + #[error("password authentication failed for user '{0}'")] PasswordFailed(Box), @@ -69,6 +75,15 @@ pub(crate) enum AuthError { )] IpAddressNotAllowed(IpAddr), + #[error("This connection is trying to access this endpoint from a blocked network.")] + NetworkNotAllowed, + + #[error( + "This VPC endpoint id {0} is not allowed to connect to this endpoint. \ + Please add it to the allowed list in the Neon console." + )] + VpcEndpointIdNotAllowed(String), + #[error("Too many connections to this endpoint. 
Please try again later.")] TooManyConnections, @@ -95,6 +110,10 @@ impl AuthError { AuthError::IpAddressNotAllowed(ip) } + pub(crate) fn vpc_endpoint_id_not_allowed(id: String) -> Self { + AuthError::VpcEndpointIdNotAllowed(id) + } + pub(crate) fn too_many_connections() -> Self { AuthError::TooManyConnections } @@ -122,8 +141,11 @@ impl UserFacingError for AuthError { Self::BadAuthMethod(_) => self.to_string(), Self::MalformedPassword(_) => self.to_string(), Self::MissingEndpointName => self.to_string(), + Self::MissingVPCEndpointId => self.to_string(), Self::Io(_) => "Internal error".to_string(), Self::IpAddressNotAllowed(_) => self.to_string(), + Self::NetworkNotAllowed => self.to_string(), + Self::VpcEndpointIdNotAllowed(_) => self.to_string(), Self::TooManyConnections => self.to_string(), Self::UserTimeout(_) => self.to_string(), Self::ConfirmationTimeout(_) => self.to_string(), @@ -142,8 +164,11 @@ impl ReportableError for AuthError { Self::BadAuthMethod(_) => crate::error::ErrorKind::User, Self::MalformedPassword(_) => crate::error::ErrorKind::User, Self::MissingEndpointName => crate::error::ErrorKind::User, + Self::MissingVPCEndpointId => crate::error::ErrorKind::User, Self::Io(_) => crate::error::ErrorKind::ClientDisconnect, Self::IpAddressNotAllowed(_) => crate::error::ErrorKind::User, + Self::NetworkNotAllowed => crate::error::ErrorKind::User, + Self::VpcEndpointIdNotAllowed(_) => crate::error::ErrorKind::User, Self::TooManyConnections => crate::error::ErrorKind::RateLimit, Self::UserTimeout(_) => crate::error::ErrorKind::User, Self::ConfirmationTimeout(_) => crate::error::ErrorKind::User, diff --git a/proxy/src/bin/local_proxy.rs b/proxy/src/bin/local_proxy.rs index ee8b3d4ef5..7a855bf54b 100644 --- a/proxy/src/bin/local_proxy.rs +++ b/proxy/src/bin/local_proxy.rs @@ -284,6 +284,7 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig rate_limiter: BucketRateLimiter::new(vec![]), rate_limit_ip_subnet: 64, ip_allowlist_check_enabled: true, + is_vpc_acccess_proxy: false, is_auth_broker: false, accept_jwts: true, console_redirect_confirmation_timeout: Duration::ZERO, diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index e1affe8391..de685a82c6 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -630,6 +630,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { rate_limiter: AuthRateLimiter::new(args.auth_rate_limit.clone()), rate_limit_ip_subnet: args.auth_rate_limit_ip_subnet, ip_allowlist_check_enabled: !args.is_private_access_proxy, + is_vpc_acccess_proxy: args.is_private_access_proxy, is_auth_broker: args.is_auth_broker, accept_jwts: args.is_auth_broker, console_redirect_confirmation_timeout: args.webauth_confirmation_timeout, diff --git a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs index a5e71f1a87..7651eb71a2 100644 --- a/proxy/src/cache/project_info.rs +++ b/proxy/src/cache/project_info.rs @@ -15,13 +15,16 @@ use tracing::{debug, info}; use super::{Cache, Cached}; use crate::auth::IpPattern; use crate::config::ProjectInfoCacheOptions; -use crate::control_plane::AuthSecret; -use crate::intern::{EndpointIdInt, ProjectIdInt, RoleNameInt}; +use crate::control_plane::{AccessBlockerFlags, AuthSecret}; +use crate::intern::{AccountIdInt, EndpointIdInt, ProjectIdInt, RoleNameInt}; use crate::types::{EndpointId, RoleName}; #[async_trait] pub(crate) trait ProjectInfoCache { fn invalidate_allowed_ips_for_project(&self, project_id: ProjectIdInt); + fn 
invalidate_allowed_vpc_endpoint_ids_for_projects(&self, project_ids: Vec); + fn invalidate_allowed_vpc_endpoint_ids_for_org(&self, account_id: AccountIdInt); + fn invalidate_block_public_or_vpc_access_for_project(&self, project_id: ProjectIdInt); fn invalidate_role_secret_for_project(&self, project_id: ProjectIdInt, role_name: RoleNameInt); async fn decrement_active_listeners(&self); async fn increment_active_listeners(&self); @@ -51,6 +54,8 @@ impl From for Entry { struct EndpointInfo { secret: std::collections::HashMap>>, allowed_ips: Option>>>, + block_public_or_vpc_access: Option>, + allowed_vpc_endpoint_ids: Option>>>, } impl EndpointInfo { @@ -92,9 +97,52 @@ impl EndpointInfo { } None } + pub(crate) fn get_allowed_vpc_endpoint_ids( + &self, + valid_since: Instant, + ignore_cache_since: Option, + ) -> Option<(Arc>, bool)> { + if let Some(allowed_vpc_endpoint_ids) = &self.allowed_vpc_endpoint_ids { + if valid_since < allowed_vpc_endpoint_ids.created_at { + return Some(( + allowed_vpc_endpoint_ids.value.clone(), + Self::check_ignore_cache( + ignore_cache_since, + allowed_vpc_endpoint_ids.created_at, + ), + )); + } + } + None + } + pub(crate) fn get_block_public_or_vpc_access( + &self, + valid_since: Instant, + ignore_cache_since: Option, + ) -> Option<(AccessBlockerFlags, bool)> { + if let Some(block_public_or_vpc_access) = &self.block_public_or_vpc_access { + if valid_since < block_public_or_vpc_access.created_at { + return Some(( + block_public_or_vpc_access.value.clone(), + Self::check_ignore_cache( + ignore_cache_since, + block_public_or_vpc_access.created_at, + ), + )); + } + } + None + } + pub(crate) fn invalidate_allowed_ips(&mut self) { self.allowed_ips = None; } + pub(crate) fn invalidate_allowed_vpc_endpoint_ids(&mut self) { + self.allowed_vpc_endpoint_ids = None; + } + pub(crate) fn invalidate_block_public_or_vpc_access(&mut self) { + self.block_public_or_vpc_access = None; + } pub(crate) fn invalidate_role_secret(&mut self, role_name: RoleNameInt) { self.secret.remove(&role_name); } @@ -111,6 +159,8 @@ pub struct ProjectInfoCacheImpl { cache: ClashMap, project2ep: ClashMap>, + // FIXME(stefan): we need a way to GC the account2ep map. 
+ account2ep: ClashMap>, config: ProjectInfoCacheOptions, start_time: Instant, @@ -120,6 +170,63 @@ pub struct ProjectInfoCacheImpl { #[async_trait] impl ProjectInfoCache for ProjectInfoCacheImpl { + fn invalidate_allowed_vpc_endpoint_ids_for_projects(&self, project_ids: Vec) { + info!( + "invalidating allowed vpc endpoint ids for projects `{}`", + project_ids + .iter() + .map(|id| id.to_string()) + .collect::>() + .join(", ") + ); + for project_id in project_ids { + let endpoints = self + .project2ep + .get(&project_id) + .map(|kv| kv.value().clone()) + .unwrap_or_default(); + for endpoint_id in endpoints { + if let Some(mut endpoint_info) = self.cache.get_mut(&endpoint_id) { + endpoint_info.invalidate_allowed_vpc_endpoint_ids(); + } + } + } + } + + fn invalidate_allowed_vpc_endpoint_ids_for_org(&self, account_id: AccountIdInt) { + info!( + "invalidating allowed vpc endpoint ids for org `{}`", + account_id + ); + let endpoints = self + .account2ep + .get(&account_id) + .map(|kv| kv.value().clone()) + .unwrap_or_default(); + for endpoint_id in endpoints { + if let Some(mut endpoint_info) = self.cache.get_mut(&endpoint_id) { + endpoint_info.invalidate_allowed_vpc_endpoint_ids(); + } + } + } + + fn invalidate_block_public_or_vpc_access_for_project(&self, project_id: ProjectIdInt) { + info!( + "invalidating block public or vpc access for project `{}`", + project_id + ); + let endpoints = self + .project2ep + .get(&project_id) + .map(|kv| kv.value().clone()) + .unwrap_or_default(); + for endpoint_id in endpoints { + if let Some(mut endpoint_info) = self.cache.get_mut(&endpoint_id) { + endpoint_info.invalidate_block_public_or_vpc_access(); + } + } + } + fn invalidate_allowed_ips_for_project(&self, project_id: ProjectIdInt) { info!("invalidating allowed ips for project `{}`", project_id); let endpoints = self @@ -178,6 +285,7 @@ impl ProjectInfoCacheImpl { Self { cache: ClashMap::new(), project2ep: ClashMap::new(), + account2ep: ClashMap::new(), config, ttl_disabled_since_us: AtomicU64::new(u64::MAX), start_time: Instant::now(), @@ -226,6 +334,49 @@ impl ProjectInfoCacheImpl { } Some(Cached::new_uncached(value)) } + pub(crate) fn get_allowed_vpc_endpoint_ids( + &self, + endpoint_id: &EndpointId, + ) -> Option>>> { + let endpoint_id = EndpointIdInt::get(endpoint_id)?; + let (valid_since, ignore_cache_since) = self.get_cache_times(); + let endpoint_info = self.cache.get(&endpoint_id)?; + let value = endpoint_info.get_allowed_vpc_endpoint_ids(valid_since, ignore_cache_since); + let (value, ignore_cache) = value?; + if !ignore_cache { + let cached = Cached { + token: Some(( + self, + CachedLookupInfo::new_allowed_vpc_endpoint_ids(endpoint_id), + )), + value, + }; + return Some(cached); + } + Some(Cached::new_uncached(value)) + } + pub(crate) fn get_block_public_or_vpc_access( + &self, + endpoint_id: &EndpointId, + ) -> Option> { + let endpoint_id = EndpointIdInt::get(endpoint_id)?; + let (valid_since, ignore_cache_since) = self.get_cache_times(); + let endpoint_info = self.cache.get(&endpoint_id)?; + let value = endpoint_info.get_block_public_or_vpc_access(valid_since, ignore_cache_since); + let (value, ignore_cache) = value?; + if !ignore_cache { + let cached = Cached { + token: Some(( + self, + CachedLookupInfo::new_block_public_or_vpc_access(endpoint_id), + )), + value, + }; + return Some(cached); + } + Some(Cached::new_uncached(value)) + } + pub(crate) fn insert_role_secret( &self, project_id: ProjectIdInt, @@ -256,6 +407,43 @@ impl ProjectInfoCacheImpl { self.insert_project2endpoint(project_id, 
endpoint_id); self.cache.entry(endpoint_id).or_default().allowed_ips = Some(allowed_ips.into()); } + pub(crate) fn insert_allowed_vpc_endpoint_ids( + &self, + account_id: Option, + project_id: ProjectIdInt, + endpoint_id: EndpointIdInt, + allowed_vpc_endpoint_ids: Arc>, + ) { + if self.cache.len() >= self.config.size { + // If there are too many entries, wait until the next gc cycle. + return; + } + if let Some(account_id) = account_id { + self.insert_account2endpoint(account_id, endpoint_id); + } + self.insert_project2endpoint(project_id, endpoint_id); + self.cache + .entry(endpoint_id) + .or_default() + .allowed_vpc_endpoint_ids = Some(allowed_vpc_endpoint_ids.into()); + } + pub(crate) fn insert_block_public_or_vpc_access( + &self, + project_id: ProjectIdInt, + endpoint_id: EndpointIdInt, + access_blockers: AccessBlockerFlags, + ) { + if self.cache.len() >= self.config.size { + // If there are too many entries, wait until the next gc cycle. + return; + } + self.insert_project2endpoint(project_id, endpoint_id); + self.cache + .entry(endpoint_id) + .or_default() + .block_public_or_vpc_access = Some(access_blockers.into()); + } + fn insert_project2endpoint(&self, project_id: ProjectIdInt, endpoint_id: EndpointIdInt) { if let Some(mut endpoints) = self.project2ep.get_mut(&project_id) { endpoints.insert(endpoint_id); @@ -264,6 +452,14 @@ impl ProjectInfoCacheImpl { .insert(project_id, HashSet::from([endpoint_id])); } } + fn insert_account2endpoint(&self, account_id: AccountIdInt, endpoint_id: EndpointIdInt) { + if let Some(mut endpoints) = self.account2ep.get_mut(&account_id) { + endpoints.insert(endpoint_id); + } else { + self.account2ep + .insert(account_id, HashSet::from([endpoint_id])); + } + } fn get_cache_times(&self) -> (Instant, Option) { let mut valid_since = Instant::now() - self.config.ttl; // Only ignore cache if ttl is disabled. 
@@ -334,11 +530,25 @@ impl CachedLookupInfo { lookup_type: LookupType::AllowedIps, } } + pub(self) fn new_allowed_vpc_endpoint_ids(endpoint_id: EndpointIdInt) -> Self { + Self { + endpoint_id, + lookup_type: LookupType::AllowedVpcEndpointIds, + } + } + pub(self) fn new_block_public_or_vpc_access(endpoint_id: EndpointIdInt) -> Self { + Self { + endpoint_id, + lookup_type: LookupType::BlockPublicOrVpcAccess, + } + } } enum LookupType { RoleSecret(RoleNameInt), AllowedIps, + AllowedVpcEndpointIds, + BlockPublicOrVpcAccess, } impl Cache for ProjectInfoCacheImpl { @@ -360,6 +570,16 @@ impl Cache for ProjectInfoCacheImpl { endpoint_info.invalidate_allowed_ips(); } } + LookupType::AllowedVpcEndpointIds => { + if let Some(mut endpoint_info) = self.cache.get_mut(&key.endpoint_id) { + endpoint_info.invalidate_allowed_vpc_endpoint_ids(); + } + } + LookupType::BlockPublicOrVpcAccess => { + if let Some(mut endpoint_info) = self.cache.get_mut(&key.endpoint_id) { + endpoint_info.invalidate_block_public_or_vpc_access(); + } + } } } } diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index 9a0b954341..4d919f374a 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -12,13 +12,15 @@ use tokio::net::TcpStream; use tokio::sync::{mpsc, oneshot}; use tracing::{debug, info}; -use crate::auth::backend::{BackendIpAllowlist, ComputeUserInfo}; +use crate::auth::backend::ComputeUserInfo; use crate::auth::{check_peer_addr_is_in_list, AuthError}; use crate::config::ComputeConfig; use crate::context::RequestContext; +use crate::control_plane::ControlPlaneApi; use crate::error::ReportableError; use crate::ext::LockExt; use crate::metrics::{CancelChannelSizeGuard, CancellationRequest, Metrics, RedisMsgKind}; +use crate::protocol2::ConnectionInfoExtra; use crate::rate_limiter::LeakyBucketRateLimiter; use crate::redis::keys::KeyPrefix; use crate::redis::kv_ops::RedisKVClient; @@ -133,6 +135,9 @@ pub(crate) enum CancelError { #[error("IP is not allowed")] IpNotAllowed, + #[error("VPC endpoint id is not allowed to connect")] + VpcEndpointIdNotAllowed, + #[error("Authentication backend error")] AuthError(#[from] AuthError), @@ -152,8 +157,9 @@ impl ReportableError for CancelError { } CancelError::Postgres(_) => crate::error::ErrorKind::Compute, CancelError::RateLimit => crate::error::ErrorKind::RateLimit, - CancelError::IpNotAllowed => crate::error::ErrorKind::User, - CancelError::NotFound => crate::error::ErrorKind::User, + CancelError::IpNotAllowed + | CancelError::VpcEndpointIdNotAllowed + | CancelError::NotFound => crate::error::ErrorKind::User, CancelError::AuthError(_) => crate::error::ErrorKind::ControlPlane, CancelError::InternalError => crate::error::ErrorKind::Service, } @@ -265,11 +271,12 @@ impl CancellationHandler { /// Will fetch IP allowlist internally. 
/// /// return Result primarily for tests - pub(crate) async fn cancel_session( + pub(crate) async fn cancel_session( &self, key: CancelKeyData, ctx: RequestContext, - check_allowed: bool, + check_ip_allowed: bool, + check_vpc_allowed: bool, auth_backend: &T, ) -> Result<(), CancelError> { let subnet_key = match ctx.peer_addr() { @@ -304,11 +311,11 @@ impl CancellationHandler { return Err(CancelError::NotFound); }; - if check_allowed { + if check_ip_allowed { let ip_allowlist = auth_backend .get_allowed_ips(&ctx, &cancel_closure.user_info) .await - .map_err(CancelError::AuthError)?; + .map_err(|e| CancelError::AuthError(e.into()))?; if !check_peer_addr_is_in_list(&ctx.peer_addr(), &ip_allowlist) { // log it here since cancel_session could be spawned in a task @@ -320,6 +327,40 @@ impl CancellationHandler { } } + // check if a VPC endpoint ID is coming in and if yes, if it's allowed + let access_blocks = auth_backend + .get_block_public_or_vpc_access(&ctx, &cancel_closure.user_info) + .await + .map_err(|e| CancelError::AuthError(e.into()))?; + + if check_vpc_allowed { + if access_blocks.vpc_access_blocked { + return Err(CancelError::AuthError(AuthError::NetworkNotAllowed)); + } + + let incoming_vpc_endpoint_id = match ctx.extra() { + None => return Err(CancelError::AuthError(AuthError::MissingVPCEndpointId)), + Some(ConnectionInfoExtra::Aws { vpce_id }) => { + // Convert the vcpe_id to a string + String::from_utf8(vpce_id.to_vec()).unwrap_or_default() + } + Some(ConnectionInfoExtra::Azure { link_id }) => link_id.to_string(), + }; + + let allowed_vpc_endpoint_ids = auth_backend + .get_allowed_vpc_endpoint_ids(&ctx, &cancel_closure.user_info) + .await + .map_err(|e| CancelError::AuthError(e.into()))?; + // TODO: For now an empty VPC endpoint ID list means all are allowed. We should replace that. 
+ if !allowed_vpc_endpoint_ids.is_empty() + && !allowed_vpc_endpoint_ids.contains(&incoming_vpc_endpoint_id) + { + return Err(CancelError::VpcEndpointIdNotAllowed); + } + } else if access_blocks.public_access_blocked { + return Err(CancelError::VpcEndpointIdNotAllowed); + } + Metrics::get() .proxy .cancellation_requests_total diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 8502edcfab..1dcd37712e 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -68,6 +68,7 @@ pub struct AuthenticationConfig { pub rate_limiter: AuthRateLimiter, pub rate_limit_ip_subnet: u8, pub ip_allowlist_check_enabled: bool, + pub is_vpc_acccess_proxy: bool, pub jwks_cache: JwkCache, pub is_auth_broker: bool, pub accept_jwts: bool, diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index 78bfb6deac..c4548a7ddd 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -182,7 +182,8 @@ pub(crate) async fn handle_client( cancel_key_data, ctx, config.authentication_config.ip_allowlist_check_enabled, - backend, + config.authentication_config.is_vpc_acccess_proxy, + backend.get_api(), ) .await .inspect_err(|e | debug!(error = ?e, "cancel_session failed")).ok(); diff --git a/proxy/src/context/mod.rs b/proxy/src/context/mod.rs index a9fb513d3c..3236b2e1bf 100644 --- a/proxy/src/context/mod.rs +++ b/proxy/src/context/mod.rs @@ -19,7 +19,7 @@ use crate::intern::{BranchIdInt, ProjectIdInt}; use crate::metrics::{ ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol, Waiting, }; -use crate::protocol2::ConnectionInfo; +use crate::protocol2::{ConnectionInfo, ConnectionInfoExtra}; use crate::types::{DbName, EndpointId, RoleName}; pub mod parquet; @@ -312,6 +312,15 @@ impl RequestContext { .ip() } + pub(crate) fn extra(&self) -> Option { + self.0 + .try_lock() + .expect("should not deadlock") + .conn_info + .extra + .clone() + } + pub(crate) fn cold_start_info(&self) -> ColdStartInfo { self.0 .try_lock() diff --git a/proxy/src/control_plane/client/cplane_proxy_v1.rs b/proxy/src/control_plane/client/cplane_proxy_v1.rs index ece03156d1..ef6621fc59 100644 --- a/proxy/src/control_plane/client/cplane_proxy_v1.rs +++ b/proxy/src/control_plane/client/cplane_proxy_v1.rs @@ -22,7 +22,8 @@ use crate::control_plane::errors::{ use crate::control_plane::locks::ApiLocks; use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, Reason}; use crate::control_plane::{ - AuthInfo, AuthSecret, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, NodeInfo, + AccessBlockerFlags, AuthInfo, AuthSecret, CachedAccessBlockerFlags, CachedAllowedIps, + CachedAllowedVpcEndpointIds, CachedNodeInfo, CachedRoleSecret, NodeInfo, }; use crate::metrics::{CacheOutcome, Metrics}; use crate::rate_limiter::WakeComputeRateLimiter; @@ -137,9 +138,6 @@ impl NeonControlPlaneClient { } }; - // Ivan: don't know where it will be used, so I leave it here - let _endpoint_vpc_ids = body.allowed_vpc_endpoint_ids.unwrap_or_default(); - let secret = if body.role_secret.is_empty() { None } else { @@ -153,10 +151,23 @@ impl NeonControlPlaneClient { .proxy .allowed_ips_number .observe(allowed_ips.len() as f64); + let allowed_vpc_endpoint_ids = body.allowed_vpc_endpoint_ids.unwrap_or_default(); + Metrics::get() + .proxy + .allowed_vpc_endpoint_ids + .observe(allowed_vpc_endpoint_ids.len() as f64); + let block_public_connections = body.block_public_connections.unwrap_or_default(); + let block_vpc_connections = body.block_vpc_connections.unwrap_or_default(); 
Ok(AuthInfo { secret, allowed_ips, + allowed_vpc_endpoint_ids, project_id: body.project_id, + account_id: body.account_id, + access_blocker_flags: AccessBlockerFlags { + public_access_blocked: block_public_connections, + vpc_access_blocked: block_vpc_connections, + }, }) } .inspect_err(|e| tracing::debug!(error = ?e)) @@ -299,6 +310,7 @@ impl super::ControlPlaneApi for NeonControlPlaneClient { return Ok(role_secret); } let auth_info = self.do_get_auth_info(ctx, user_info).await?; + let account_id = auth_info.account_id; if let Some(project_id) = auth_info.project_id { let normalized_ep_int = normalized_ep.into(); self.caches.project_info.insert_role_secret( @@ -312,24 +324,35 @@ impl super::ControlPlaneApi for NeonControlPlaneClient { normalized_ep_int, Arc::new(auth_info.allowed_ips), ); + self.caches.project_info.insert_allowed_vpc_endpoint_ids( + account_id, + project_id, + normalized_ep_int, + Arc::new(auth_info.allowed_vpc_endpoint_ids), + ); + self.caches.project_info.insert_block_public_or_vpc_access( + project_id, + normalized_ep_int, + auth_info.access_blocker_flags, + ); ctx.set_project_id(project_id); } // When we just got a secret, we don't need to invalidate it. Ok(Cached::new_uncached(auth_info.secret)) } - async fn get_allowed_ips_and_secret( + async fn get_allowed_ips( &self, ctx: &RequestContext, user_info: &ComputeUserInfo, - ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { + ) -> Result { let normalized_ep = &user_info.endpoint.normalize(); if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(normalized_ep) { Metrics::get() .proxy - .allowed_ips_cache_misses + .allowed_ips_cache_misses // TODO SR: Should we rename this variable to something like allowed_ip_cache_stats? .inc(CacheOutcome::Hit); - return Ok((allowed_ips, None)); + return Ok(allowed_ips); } Metrics::get() .proxy @@ -337,7 +360,10 @@ impl super::ControlPlaneApi for NeonControlPlaneClient { .inc(CacheOutcome::Miss); let auth_info = self.do_get_auth_info(ctx, user_info).await?; let allowed_ips = Arc::new(auth_info.allowed_ips); + let allowed_vpc_endpoint_ids = Arc::new(auth_info.allowed_vpc_endpoint_ids); + let access_blocker_flags = auth_info.access_blocker_flags; let user = &user_info.user; + let account_id = auth_info.account_id; if let Some(project_id) = auth_info.project_id { let normalized_ep_int = normalized_ep.into(); self.caches.project_info.insert_role_secret( @@ -351,12 +377,136 @@ impl super::ControlPlaneApi for NeonControlPlaneClient { normalized_ep_int, allowed_ips.clone(), ); + self.caches.project_info.insert_allowed_vpc_endpoint_ids( + account_id, + project_id, + normalized_ep_int, + allowed_vpc_endpoint_ids.clone(), + ); + self.caches.project_info.insert_block_public_or_vpc_access( + project_id, + normalized_ep_int, + access_blocker_flags, + ); ctx.set_project_id(project_id); } - Ok(( - Cached::new_uncached(allowed_ips), - Some(Cached::new_uncached(auth_info.secret)), - )) + Ok(Cached::new_uncached(allowed_ips)) + } + + async fn get_allowed_vpc_endpoint_ids( + &self, + ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result { + let normalized_ep = &user_info.endpoint.normalize(); + if let Some(allowed_vpc_endpoint_ids) = self + .caches + .project_info + .get_allowed_vpc_endpoint_ids(normalized_ep) + { + Metrics::get() + .proxy + .vpc_endpoint_id_cache_stats + .inc(CacheOutcome::Hit); + return Ok(allowed_vpc_endpoint_ids); + } + + Metrics::get() + .proxy + .vpc_endpoint_id_cache_stats + .inc(CacheOutcome::Miss); + + let auth_info = self.do_get_auth_info(ctx, 
user_info).await?; + let allowed_ips = Arc::new(auth_info.allowed_ips); + let allowed_vpc_endpoint_ids = Arc::new(auth_info.allowed_vpc_endpoint_ids); + let access_blocker_flags = auth_info.access_blocker_flags; + let user = &user_info.user; + let account_id = auth_info.account_id; + if let Some(project_id) = auth_info.project_id { + let normalized_ep_int = normalized_ep.into(); + self.caches.project_info.insert_role_secret( + project_id, + normalized_ep_int, + user.into(), + auth_info.secret.clone(), + ); + self.caches.project_info.insert_allowed_ips( + project_id, + normalized_ep_int, + allowed_ips.clone(), + ); + self.caches.project_info.insert_allowed_vpc_endpoint_ids( + account_id, + project_id, + normalized_ep_int, + allowed_vpc_endpoint_ids.clone(), + ); + self.caches.project_info.insert_block_public_or_vpc_access( + project_id, + normalized_ep_int, + access_blocker_flags, + ); + ctx.set_project_id(project_id); + } + Ok(Cached::new_uncached(allowed_vpc_endpoint_ids)) + } + + async fn get_block_public_or_vpc_access( + &self, + ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result { + let normalized_ep = &user_info.endpoint.normalize(); + if let Some(access_blocker_flags) = self + .caches + .project_info + .get_block_public_or_vpc_access(normalized_ep) + { + Metrics::get() + .proxy + .access_blocker_flags_cache_stats + .inc(CacheOutcome::Hit); + return Ok(access_blocker_flags); + } + + Metrics::get() + .proxy + .access_blocker_flags_cache_stats + .inc(CacheOutcome::Miss); + + let auth_info = self.do_get_auth_info(ctx, user_info).await?; + let allowed_ips = Arc::new(auth_info.allowed_ips); + let allowed_vpc_endpoint_ids = Arc::new(auth_info.allowed_vpc_endpoint_ids); + let access_blocker_flags = auth_info.access_blocker_flags; + let user = &user_info.user; + let account_id = auth_info.account_id; + if let Some(project_id) = auth_info.project_id { + let normalized_ep_int = normalized_ep.into(); + self.caches.project_info.insert_role_secret( + project_id, + normalized_ep_int, + user.into(), + auth_info.secret.clone(), + ); + self.caches.project_info.insert_allowed_ips( + project_id, + normalized_ep_int, + allowed_ips.clone(), + ); + self.caches.project_info.insert_allowed_vpc_endpoint_ids( + account_id, + project_id, + normalized_ep_int, + allowed_vpc_endpoint_ids.clone(), + ); + self.caches.project_info.insert_block_public_or_vpc_access( + project_id, + normalized_ep_int, + access_blocker_flags.clone(), + ); + ctx.set_project_id(project_id); + } + Ok(Cached::new_uncached(access_blocker_flags)) } #[tracing::instrument(skip_all)] diff --git a/proxy/src/control_plane/client/mock.rs b/proxy/src/control_plane/client/mock.rs index 5f8bda0f35..1e6cde8fb0 100644 --- a/proxy/src/control_plane/client/mock.rs +++ b/proxy/src/control_plane/client/mock.rs @@ -13,12 +13,14 @@ use crate::auth::backend::ComputeUserInfo; use crate::auth::IpPattern; use crate::cache::Cached; use crate::context::RequestContext; -use crate::control_plane::client::{CachedAllowedIps, CachedRoleSecret}; +use crate::control_plane::client::{ + CachedAllowedIps, CachedAllowedVpcEndpointIds, CachedRoleSecret, +}; use crate::control_plane::errors::{ ControlPlaneError, GetAuthInfoError, GetEndpointJwksError, WakeComputeError, }; use crate::control_plane::messages::MetricsAuxInfo; -use crate::control_plane::{AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo}; +use crate::control_plane::{AccessBlockerFlags, AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo}; use crate::error::io_error; use crate::intern::RoleNameInt; use 
crate::types::{BranchId, EndpointId, ProjectId, RoleName}; @@ -121,7 +123,10 @@ impl MockControlPlane { Ok(AuthInfo { secret, allowed_ips, + allowed_vpc_endpoint_ids: vec![], project_id: None, + account_id: None, + access_blocker_flags: AccessBlockerFlags::default(), }) } @@ -214,16 +219,35 @@ impl super::ControlPlaneApi for MockControlPlane { )) } - async fn get_allowed_ips_and_secret( + async fn get_allowed_ips( &self, _ctx: &RequestContext, user_info: &ComputeUserInfo, - ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { - Ok(( - Cached::new_uncached(Arc::new( - self.do_get_auth_info(user_info).await?.allowed_ips, - )), - None, + ) -> Result { + Ok(Cached::new_uncached(Arc::new( + self.do_get_auth_info(user_info).await?.allowed_ips, + ))) + } + + async fn get_allowed_vpc_endpoint_ids( + &self, + _ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result { + Ok(Cached::new_uncached(Arc::new( + self.do_get_auth_info(user_info) + .await? + .allowed_vpc_endpoint_ids, + ))) + } + + async fn get_block_public_or_vpc_access( + &self, + _ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result { + Ok(Cached::new_uncached( + self.do_get_auth_info(user_info).await?.access_blocker_flags, )) } diff --git a/proxy/src/control_plane/client/mod.rs b/proxy/src/control_plane/client/mod.rs index b879f3a59f..a06943726e 100644 --- a/proxy/src/control_plane/client/mod.rs +++ b/proxy/src/control_plane/client/mod.rs @@ -17,7 +17,8 @@ use crate::cache::project_info::ProjectInfoCacheImpl; use crate::config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions}; use crate::context::RequestContext; use crate::control_plane::{ - errors, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, ControlPlaneApi, NodeInfoCache, + errors, CachedAccessBlockerFlags, CachedAllowedIps, CachedAllowedVpcEndpointIds, + CachedNodeInfo, CachedRoleSecret, ControlPlaneApi, NodeInfoCache, }; use crate::error::ReportableError; use crate::metrics::ApiLockMetrics; @@ -55,17 +56,45 @@ impl ControlPlaneApi for ControlPlaneClient { } } - async fn get_allowed_ips_and_secret( + async fn get_allowed_ips( &self, ctx: &RequestContext, user_info: &ComputeUserInfo, - ) -> Result<(CachedAllowedIps, Option), errors::GetAuthInfoError> { + ) -> Result { match self { - Self::ProxyV1(api) => api.get_allowed_ips_and_secret(ctx, user_info).await, + Self::ProxyV1(api) => api.get_allowed_ips(ctx, user_info).await, #[cfg(any(test, feature = "testing"))] - Self::PostgresMock(api) => api.get_allowed_ips_and_secret(ctx, user_info).await, + Self::PostgresMock(api) => api.get_allowed_ips(ctx, user_info).await, #[cfg(test)] - Self::Test(api) => api.get_allowed_ips_and_secret(), + Self::Test(api) => api.get_allowed_ips(), + } + } + + async fn get_allowed_vpc_endpoint_ids( + &self, + ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result { + match self { + Self::ProxyV1(api) => api.get_allowed_vpc_endpoint_ids(ctx, user_info).await, + #[cfg(any(test, feature = "testing"))] + Self::PostgresMock(api) => api.get_allowed_vpc_endpoint_ids(ctx, user_info).await, + #[cfg(test)] + Self::Test(api) => api.get_allowed_vpc_endpoint_ids(), + } + } + + async fn get_block_public_or_vpc_access( + &self, + ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result { + match self { + Self::ProxyV1(api) => api.get_block_public_or_vpc_access(ctx, user_info).await, + #[cfg(any(test, feature = "testing"))] + Self::PostgresMock(api) => api.get_block_public_or_vpc_access(ctx, user_info).await, + #[cfg(test)] + Self::Test(api) => 
api.get_block_public_or_vpc_access(), } } @@ -102,9 +131,15 @@ impl ControlPlaneApi for ControlPlaneClient { pub(crate) trait TestControlPlaneClient: Send + Sync + 'static { fn wake_compute(&self) -> Result; - fn get_allowed_ips_and_secret( + fn get_allowed_ips(&self) -> Result; + + fn get_allowed_vpc_endpoint_ids( &self, - ) -> Result<(CachedAllowedIps, Option), errors::GetAuthInfoError>; + ) -> Result; + + fn get_block_public_or_vpc_access( + &self, + ) -> Result; fn dyn_clone(&self) -> Box; } diff --git a/proxy/src/control_plane/messages.rs b/proxy/src/control_plane/messages.rs index d068614b24..5883d02b92 100644 --- a/proxy/src/control_plane/messages.rs +++ b/proxy/src/control_plane/messages.rs @@ -4,7 +4,7 @@ use measured::FixedCardinalityLabel; use serde::{Deserialize, Serialize}; use crate::auth::IpPattern; -use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt, RoleNameInt}; +use crate::intern::{AccountIdInt, BranchIdInt, EndpointIdInt, ProjectIdInt, RoleNameInt}; use crate::proxy::retry::CouldRetry; /// Generic error response with human-readable description. @@ -227,8 +227,11 @@ pub(crate) struct UserFacingMessage { pub(crate) struct GetEndpointAccessControl { pub(crate) role_secret: Box, pub(crate) allowed_ips: Option>, + pub(crate) allowed_vpc_endpoint_ids: Option>, pub(crate) project_id: Option, - pub(crate) allowed_vpc_endpoint_ids: Option>, + pub(crate) account_id: Option, + pub(crate) block_public_connections: Option, + pub(crate) block_vpc_connections: Option, } /// Response which holds compute node's `host:port` pair. @@ -282,6 +285,10 @@ pub(crate) struct DatabaseInfo { pub(crate) aux: MetricsAuxInfo, #[serde(default)] pub(crate) allowed_ips: Option>, + #[serde(default)] + pub(crate) allowed_vpc_endpoint_ids: Option>, + #[serde(default)] + pub(crate) public_access_allowed: Option, } // Manually implement debug to omit sensitive info. @@ -293,6 +300,7 @@ impl fmt::Debug for DatabaseInfo { .field("dbname", &self.dbname) .field("user", &self.user) .field("allowed_ips", &self.allowed_ips) + .field("allowed_vpc_endpoint_ids", &self.allowed_vpc_endpoint_ids) .finish_non_exhaustive() } } @@ -457,7 +465,7 @@ mod tests { #[test] fn parse_get_role_secret() -> anyhow::Result<()> { - // Empty `allowed_ips` field. + // Empty `allowed_ips` and `allowed_vpc_endpoint_ids` field. 
let json = json!({ "role_secret": "secret", }); @@ -467,9 +475,21 @@ mod tests { "allowed_ips": ["8.8.8.8"], }); serde_json::from_str::(&json.to_string())?; + let json = json!({ + "role_secret": "secret", + "allowed_vpc_endpoint_ids": ["vpce-0abcd1234567890ef"], + }); + serde_json::from_str::(&json.to_string())?; let json = json!({ "role_secret": "secret", "allowed_ips": ["8.8.8.8"], + "allowed_vpc_endpoint_ids": ["vpce-0abcd1234567890ef"], + }); + serde_json::from_str::(&json.to_string())?; + let json = json!({ + "role_secret": "secret", + "allowed_ips": ["8.8.8.8"], + "allowed_vpc_endpoint_ids": ["vpce-0abcd1234567890ef"], "project_id": "project", }); serde_json::from_str::(&json.to_string())?; diff --git a/proxy/src/control_plane/mod.rs b/proxy/src/control_plane/mod.rs index 1dca26d686..f92e4f3f60 100644 --- a/proxy/src/control_plane/mod.rs +++ b/proxy/src/control_plane/mod.rs @@ -19,6 +19,7 @@ use crate::cache::{Cached, TimedLru}; use crate::config::ComputeConfig; use crate::context::RequestContext; use crate::control_plane::messages::{ControlPlaneErrorMessage, MetricsAuxInfo}; +use crate::intern::AccountIdInt; use crate::intern::ProjectIdInt; use crate::types::{EndpointCacheKey, EndpointId}; use crate::{compute, scram}; @@ -52,8 +53,14 @@ pub(crate) struct AuthInfo { pub(crate) secret: Option, /// List of IP addresses allowed for the autorization. pub(crate) allowed_ips: Vec, + /// List of VPC endpoints allowed for the autorization. + pub(crate) allowed_vpc_endpoint_ids: Vec, /// Project ID. This is used for cache invalidation. pub(crate) project_id: Option, + /// Account ID. This is used for cache invalidation. + pub(crate) account_id: Option, + /// Are public connections or VPC connections blocked? + pub(crate) access_blocker_flags: AccessBlockerFlags, } /// Info for establishing a connection to a compute node. @@ -95,11 +102,21 @@ impl NodeInfo { } } +#[derive(Clone, Default, Eq, PartialEq, Debug)] +pub(crate) struct AccessBlockerFlags { + pub public_access_blocked: bool, + pub vpc_access_blocked: bool, +} + pub(crate) type NodeInfoCache = TimedLru>>; pub(crate) type CachedNodeInfo = Cached<&'static NodeInfoCache, NodeInfo>; pub(crate) type CachedRoleSecret = Cached<&'static ProjectInfoCacheImpl, Option>; pub(crate) type CachedAllowedIps = Cached<&'static ProjectInfoCacheImpl, Arc>>; +pub(crate) type CachedAllowedVpcEndpointIds = + Cached<&'static ProjectInfoCacheImpl, Arc>>; +pub(crate) type CachedAccessBlockerFlags = + Cached<&'static ProjectInfoCacheImpl, AccessBlockerFlags>; /// This will allocate per each call, but the http requests alone /// already require a few allocations, so it should be fine. 
@@ -113,11 +130,23 @@ pub(crate) trait ControlPlaneApi { user_info: &ComputeUserInfo, ) -> Result; - async fn get_allowed_ips_and_secret( + async fn get_allowed_ips( &self, ctx: &RequestContext, user_info: &ComputeUserInfo, - ) -> Result<(CachedAllowedIps, Option), errors::GetAuthInfoError>; + ) -> Result; + + async fn get_allowed_vpc_endpoint_ids( + &self, + ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result; + + async fn get_block_public_or_vpc_access( + &self, + ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result; async fn get_endpoint_jwks( &self, diff --git a/proxy/src/intern.rs b/proxy/src/intern.rs index 79c6020302..0d1382679c 100644 --- a/proxy/src/intern.rs +++ b/proxy/src/intern.rs @@ -7,7 +7,7 @@ use std::sync::OnceLock; use lasso::{Capacity, MemoryLimits, Spur, ThreadedRodeo}; use rustc_hash::FxHasher; -use crate::types::{BranchId, EndpointId, ProjectId, RoleName}; +use crate::types::{AccountId, BranchId, EndpointId, ProjectId, RoleName}; pub trait InternId: Sized + 'static { fn get_interner() -> &'static StringInterner; @@ -206,6 +206,26 @@ impl From for ProjectIdInt { } } +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub struct AccountIdTag; +impl InternId for AccountIdTag { + fn get_interner() -> &'static StringInterner { + static ROLE_NAMES: OnceLock> = OnceLock::new(); + ROLE_NAMES.get_or_init(Default::default) + } +} +pub type AccountIdInt = InternedString; +impl From<&AccountId> for AccountIdInt { + fn from(value: &AccountId) -> Self { + AccountIdTag::get_interner().get_or_intern(value) + } +} +impl From for AccountIdInt { + fn from(value: AccountId) -> Self { + AccountIdTag::get_interner().get_or_intern(&value) + } +} + #[cfg(test)] #[expect(clippy::unwrap_used)] mod tests { diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index f3d281a26b..25bcc81108 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -96,6 +96,16 @@ pub struct ProxyMetrics { #[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0]))] pub allowed_ips_number: Histogram<10>, + /// Number of cache hits/misses for VPC endpoint IDs. + pub vpc_endpoint_id_cache_stats: CounterVec>, + + /// Number of cache hits/misses for access blocker flags. + pub access_blocker_flags_cache_stats: CounterVec>, + + /// Number of allowed VPC endpoints IDs + #[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0]))] + pub allowed_vpc_endpoint_ids: Histogram<10>, + /// Number of connections (per sni). 
pub accepted_connections_by_sni: CounterVec>, @@ -570,6 +580,9 @@ pub enum RedisEventsCount { CancelSession, PasswordUpdate, AllowedIpsUpdate, + AllowedVpcEndpointIdsUpdateForProjects, + AllowedVpcEndpointIdsUpdateForAllProjectsInOrg, + BlockPublicOrVpcAccessUpdate, } pub struct ThreadPoolWorkers(usize); diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index ab173bd0d0..8a407c8119 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -283,7 +283,8 @@ pub(crate) async fn handle_client( cancel_key_data, ctx, config.authentication_config.ip_allowlist_check_enabled, - auth_backend, + config.authentication_config.is_vpc_acccess_proxy, + auth_backend.get_api(), ) .await .inspect_err(|e | debug!(error = ?e, "cancel_session failed")).ok(); diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index 10db2bcb30..d8c00a9b41 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -26,7 +26,7 @@ use crate::config::{ComputeConfig, RetryConfig}; use crate::control_plane::client::{ControlPlaneClient, TestControlPlaneClient}; use crate::control_plane::messages::{ControlPlaneErrorMessage, Details, MetricsAuxInfo, Status}; use crate::control_plane::{ - self, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, NodeInfo, NodeInfoCache, + self, CachedAllowedIps, CachedAllowedVpcEndpointIds, CachedNodeInfo, NodeInfo, NodeInfoCache, }; use crate::error::ErrorKind; use crate::tls::client_config::compute_client_config_with_certs; @@ -526,9 +526,19 @@ impl TestControlPlaneClient for TestConnectMechanism { } } - fn get_allowed_ips_and_secret( + fn get_allowed_ips(&self) -> Result { + unimplemented!("not used in tests") + } + + fn get_allowed_vpc_endpoint_ids( &self, - ) -> Result<(CachedAllowedIps, Option), control_plane::errors::GetAuthInfoError> + ) -> Result { + unimplemented!("not used in tests") + } + + fn get_block_public_or_vpc_access( + &self, + ) -> Result { unimplemented!("not used in tests") } diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index 19fdd3280d..1a7024588a 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -10,7 +10,7 @@ use uuid::Uuid; use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider; use crate::cache::project_info::ProjectInfoCache; -use crate::intern::{ProjectIdInt, RoleNameInt}; +use crate::intern::{AccountIdInt, ProjectIdInt, RoleNameInt}; use crate::metrics::{Metrics, RedisErrors, RedisEventsCount}; const CPLANE_CHANNEL_NAME: &str = "neondb-proxy-ws-updates"; @@ -86,9 +86,7 @@ pub(crate) struct BlockPublicOrVpcAccessUpdated { #[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] pub(crate) struct AllowedVpcEndpointsUpdatedForOrg { - // TODO: change type once the implementation is more fully fledged. - // See e.g. https://github.com/neondatabase/neon/pull/10073. - account_id: ProjectIdInt, + account_id: AccountIdInt, } #[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] @@ -205,6 +203,24 @@ impl MessageHandler { .proxy .redis_events_count .inc(RedisEventsCount::PasswordUpdate); + } else if matches!( + msg, + Notification::AllowedVpcEndpointsUpdatedForProjects { .. } + ) { + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::AllowedVpcEndpointIdsUpdateForProjects); + } else if matches!(msg, Notification::AllowedVpcEndpointsUpdatedForOrg { .. 
}) { + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::AllowedVpcEndpointIdsUpdateForAllProjectsInOrg); + } else if matches!(msg, Notification::BlockPublicOrVpcAccessUpdated { .. }) { + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::BlockPublicOrVpcAccessUpdate); } // TODO: add additional metrics for the other event types. @@ -230,20 +246,26 @@ fn invalidate_cache(cache: Arc, msg: Notification) { Notification::AllowedIpsUpdate { allowed_ips_update } => { cache.invalidate_allowed_ips_for_project(allowed_ips_update.project_id); } + Notification::BlockPublicOrVpcAccessUpdated { + block_public_or_vpc_access_updated, + } => cache.invalidate_block_public_or_vpc_access_for_project( + block_public_or_vpc_access_updated.project_id, + ), + Notification::AllowedVpcEndpointsUpdatedForOrg { + allowed_vpc_endpoints_updated_for_org, + } => cache.invalidate_allowed_vpc_endpoint_ids_for_org( + allowed_vpc_endpoints_updated_for_org.account_id, + ), + Notification::AllowedVpcEndpointsUpdatedForProjects { + allowed_vpc_endpoints_updated_for_projects, + } => cache.invalidate_allowed_vpc_endpoint_ids_for_projects( + allowed_vpc_endpoints_updated_for_projects.project_ids, + ), Notification::PasswordUpdate { password_update } => cache .invalidate_role_secret_for_project( password_update.project_id, password_update.role_name, ), - Notification::BlockPublicOrVpcAccessUpdated { .. } => { - // https://github.com/neondatabase/neon/pull/10073 - } - Notification::AllowedVpcEndpointsUpdatedForOrg { .. } => { - // https://github.com/neondatabase/neon/pull/10073 - } - Notification::AllowedVpcEndpointsUpdatedForProjects { .. } => { - // https://github.com/neondatabase/neon/pull/10073 - } Notification::UnknownTopic => unreachable!(), } } diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 6d5fb13681..0fb4a8a6cc 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -30,6 +30,7 @@ use crate::control_plane::locks::ApiLocks; use crate::control_plane::CachedNodeInfo; use crate::error::{ErrorKind, ReportableError, UserFacingError}; use crate::intern::EndpointIdInt; +use crate::protocol2::ConnectionInfoExtra; use crate::proxy::connect_compute::ConnectMechanism; use crate::proxy::retry::{CouldRetry, ShouldRetryWakeCompute}; use crate::rate_limiter::EndpointRateLimiter; @@ -57,23 +58,52 @@ impl PoolingBackend { let user_info = user_info.clone(); let backend = self.auth_backend.as_ref().map(|()| user_info.clone()); - let (allowed_ips, maybe_secret) = backend.get_allowed_ips_and_secret(ctx).await?; + let allowed_ips = backend.get_allowed_ips(ctx).await?; + if self.config.authentication_config.ip_allowlist_check_enabled && !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) { return Err(AuthError::ip_address_not_allowed(ctx.peer_addr())); } + + let access_blocker_flags = backend.get_block_public_or_vpc_access(ctx).await?; + if self.config.authentication_config.is_vpc_acccess_proxy { + if access_blocker_flags.vpc_access_blocked { + return Err(AuthError::NetworkNotAllowed); + } + + let extra = ctx.extra(); + let incoming_endpoint_id = match extra { + None => String::new(), + Some(ConnectionInfoExtra::Aws { vpce_id }) => { + // Convert the vcpe_id to a string + String::from_utf8(vpce_id.to_vec()).unwrap_or_default() + } + Some(ConnectionInfoExtra::Azure { link_id }) => link_id.to_string(), + }; + + if incoming_endpoint_id.is_empty() { + return Err(AuthError::MissingVPCEndpointId); + } + + let allowed_vpc_endpoint_ids = 
backend.get_allowed_vpc_endpoint_ids(ctx).await?; + // TODO: For now an empty VPC endpoint ID list means all are allowed. We should replace that. + if !allowed_vpc_endpoint_ids.is_empty() + && !allowed_vpc_endpoint_ids.contains(&incoming_endpoint_id) + { + return Err(AuthError::vpc_endpoint_id_not_allowed(incoming_endpoint_id)); + } + } else if access_blocker_flags.public_access_blocked { + return Err(AuthError::NetworkNotAllowed); + } + if !self .endpoint_rate_limiter .check(user_info.endpoint.clone().into(), 1) { return Err(AuthError::too_many_connections()); } - let cached_secret = match maybe_secret { - Some(secret) => secret, - None => backend.get_role_secret(ctx).await?, - }; - + let cached_secret = backend.get_role_secret(ctx).await?; let secret = match cached_secret.value.clone() { Some(secret) => self.config.authentication_config.check_rate_limit( ctx, diff --git a/proxy/src/types.rs b/proxy/src/types.rs index 6e0bd61c94..d5952d1d8b 100644 --- a/proxy/src/types.rs +++ b/proxy/src/types.rs @@ -97,6 +97,8 @@ smol_str_wrapper!(EndpointId); smol_str_wrapper!(BranchId); // 90% of project strings are 23 characters or less. smol_str_wrapper!(ProjectId); +// 90% of account strings are 23 characters or less. +smol_str_wrapper!(AccountId); // will usually equal endpoint ID smol_str_wrapper!(EndpointCacheKey); From 6318828c639ba85e66da321f7b32828ee945de12 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Fri, 31 Jan 2025 21:52:17 +0100 Subject: [PATCH 342/583] Update rust to 1.84.1 (#10618) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We keep the practice of keeping the compiler up to date, pointing to the latest release. This is done by many other projects in the Rust ecosystem as well. [Release notes](https://releases.rs/docs/1.84.1/). Prior update was in https://github.com/neondatabase/neon/pull/10328. Co-authored-by: Arpad Müller --- build-tools.Dockerfile | 2 +- rust-toolchain.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile index dfcc7d06b4..f744b44808 100644 --- a/build-tools.Dockerfile +++ b/build-tools.Dockerfile @@ -253,7 +253,7 @@ WORKDIR /home/nonroot # Rust # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`) -ENV RUSTC_VERSION=1.84.0 +ENV RUSTC_VERSION=1.84.1 ENV RUSTUP_HOME="/home/nonroot/.rustup" ENV PATH="/home/nonroot/.cargo/bin:${PATH}" ARG RUSTFILT_VERSION=0.2.1 diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 06746d3e1d..38a7f202ba 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] -channel = "1.84.0" +channel = "1.84.1" profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. # https://rust-lang.github.io/rustup/concepts/profiles.html From b9e1a6724628aad5cf62737f6acab60ec23ff09b Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Sat, 1 Feb 2025 12:09:45 +0100 Subject: [PATCH 343/583] fix generate matrix for olap for saturdays (#10622) ## Problem when introducing pg17 for job step `Generate matrix for OLAP benchmarks` I introduced a syntax error that only hits on Saturdays. 
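For illustration, this is the usual strict-JSON pitfall: parsers such as `jq` (and GitHub's `fromJSON()`) reject a trailing comma after the last member, so the generated matrix string fails as soon as it is actually parsed, which this workflow apparently only does on the Saturday/RDS path. A quick local reproduction of the parse failure (assuming `jq` is installed; the values are illustrative, only the trailing comma matters):

```bash
# Trailing comma before the closing brace: strict JSON parsers report a parse error.
echo '{ "platform": ["example"], "pg_version": [16, 17], }' | jq .
# Same object without the trailing comma parses fine.
echo '{ "platform": ["example"], "pg_version": [16, 17] }' | jq .
```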
## Summary of changes Remove trailing comma ## successful test run https://github.com/neondatabase/neon/actions/runs/13086363907 --- .github/workflows/benchmarking.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 20a8a6e2c9..413af90dec 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -341,7 +341,7 @@ jobs: ], "pg_version" : [ 16,17 - ], + ] }' if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then From 8ae6f656a694a2d6892ce6ebd1475d1b831ba917 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 3 Feb 2025 05:11:06 +0100 Subject: [PATCH 344/583] Don't require partial backup semaphore capacity for deletions (#10628) In the safekeeper, we block deletions on the timeline's gate closing, and any `WalResidentTimeline` keeps the gate open (because it owns a gate lock object). Thus, unless the `main_task` function of a partial backup doesn't return, we can't delete the associated timeline. In order to make these tasks exit early, we call the cancellation token of the timeline upon its shutdown. However, the partial backup task wasn't looking for the cancellation while waiting to acquire a partial backup permit. On a staging safekeeper we have been in a situation in the past where the semaphore was already empty for a duration of many hours, rendering all attempted deletions unable to proceed until a restart where the semaphore was reset: https://neondb.slack.com/archives/C03H1K0PGKH/p1738416586442029 --- safekeeper/src/wal_backup_partial.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs index 4e5b34a9bf..5ecb23e8e0 100644 --- a/safekeeper/src/wal_backup_partial.rs +++ b/safekeeper/src/wal_backup_partial.rs @@ -535,6 +535,10 @@ pub async fn main_task( // limit concurrent uploads let _upload_permit = tokio::select! 
{ acq = limiter.acquire_partial_backup() => acq, + _ = backup.tli.cancel.cancelled() => { + info!("timeline canceled"); + return None; + } _ = cancel.cancelled() => { info!("task canceled"); return None; From 4dfe60e2ad08ad0e99bfd938bcbe18271deb4f84 Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Mon, 3 Feb 2025 10:00:23 +0100 Subject: [PATCH 345/583] revert https://github.com/neondatabase/neon/pull/10616 (#10631) ## Problem https://github.com/neondatabase/neon/pull/10616 was only intended temparily during the weekend, want to reset to prior state ## Summary of changes revert https://github.com/neondatabase/neon/pull/10616 but keep fixes in https://github.com/neondatabase/neon/pull/10622 --- .github/workflows/benchmarking.yml | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 413af90dec..b36ac46f35 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -11,8 +11,7 @@ on: # │ │ ┌───────────── day of the month (1 - 31) # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) - # - cron: '0 3 * * *' # run once a day, timezone is utc - - cron: '0 */10 * * *' # Runs every 10 hours at minute 0 + - cron: '0 3 * * *' # run once a day, timezone is utc workflow_dispatch: # adds ability to run this manually inputs: region_id: @@ -551,7 +550,6 @@ jobs: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} pgbench-pgvector: - if: false permissions: contents: write statuses: write @@ -685,8 +683,7 @@ jobs: # # *_CLICKBENCH_CONNSTR: Genuine ClickBench DB with ~100M rows # *_CLICKBENCH_10M_CONNSTR: DB with the first 10M rows of ClickBench DB - # if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} - if: false + if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} permissions: contents: write statuses: write @@ -814,7 +811,6 @@ jobs: # # *_TPCH_S10_CONNSTR: DB generated with scale factor 10 (~10 GB) # if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} - if: false permissions: contents: write statuses: write @@ -934,7 +930,6 @@ jobs: user-examples-compare: # if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} - if: false permissions: contents: write statuses: write From f071800979fba434ea0708f22e454c513efe47b2 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 3 Feb 2025 09:02:21 +0000 Subject: [PATCH 346/583] tests: stabilize shard locations earlier in test_scrubber_tenant_snapshot (#10606) ## Problem This test would sometimes emit unexpected logs from the storage controller's requests to do migrations, which overlap with the test's restarts of pageservers, where those migrations are happening some time after a shard split as the controller moves load around. 
Example: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-10602/13067323736/index.html#testresult/f66f1329557a1fc5/retries ## Summary of changes - Do a reconcile_until_idle after shard split, so that the rest of the test doesn't run concurrently with migrations --- test_runner/regress/test_storage_scrubber.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index 7e92cc01cd..0f4e5688a9 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -71,6 +71,10 @@ def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count: else: tenant_shard_ids = [TenantShardId(tenant_id, 0, 0)] + # Let shards finish rescheduling to other pageservers: this makes the rest of the test more stable + # is it won't overlap with migrations + env.storage_controller.reconcile_until_idle(max_interval=0.1, timeout_secs=120) + output_path = neon_env_builder.test_output_dir / "snapshot" os.makedirs(output_path) From 89b9f7407706353bf256ac6eeea0edd31c3829d1 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 3 Feb 2025 09:40:12 +0000 Subject: [PATCH 347/583] CI(pre-merge-checks): do not run `conclusion` job for PRs (#10619) ## Problem While working on https://github.com/neondatabase/neon/pull/10617 I (unintentionally) merged the PR before the main CI pipeline has finished. I suspect this happens because we have received all the required job results from the pre-merge-checks workflow, which runs on PRs that include changes to relevant files. ## Summary of changes - Skip the `conclusion` job in `pre-merge-checks` workflows for PRs --- .github/workflows/pre-merge-checks.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/pre-merge-checks.yml b/.github/workflows/pre-merge-checks.yml index d39ccecac9..c47b3fe0de 100644 --- a/.github/workflows/pre-merge-checks.yml +++ b/.github/workflows/pre-merge-checks.yml @@ -95,7 +95,8 @@ jobs: # - conclusion # - neon-cloud-e2e conclusion: - if: always() + # Do not run job on Pull Requests as it interferes with the `conclusion` job from the `build_and_test` workflow + if: always() && github.event_name == 'merge_group' permissions: statuses: write # for `github.repos.createCommitStatus(...)` contents: write From 87ad50c92547891f6c1161c3e3c255067e6c0276 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 3 Feb 2025 12:53:51 +0100 Subject: [PATCH 348/583] storcon: use diesel-async again, now with tls support (#10614) Successor of #10280 after it was reverted in #10592. Re-introduce the usage of diesel-async again, but now also add TLS support so that we connect to the storcon database using TLS. By default, diesel-async doesn't support TLS, so add some code to make us explicitly request TLS. 
cc https://github.com/neondatabase/cloud/issues/23583 --- .github/workflows/_build-and-test-locally.yml | 4 - .github/workflows/build-macos.yml | 2 +- .github/workflows/neon_extra_builds.yml | 2 +- Cargo.lock | 72 +- Dockerfile | 2 +- Makefile | 2 - storage_controller/Cargo.toml | 9 +- storage_controller/src/main.rs | 2 +- storage_controller/src/persistence.rs | 833 +++++++++++------- 9 files changed, 549 insertions(+), 379 deletions(-) diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index e9483492c9..1dec8106b4 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -158,8 +158,6 @@ jobs: - name: Run cargo build run: | - PQ_LIB_DIR=$(pwd)/pg_install/v16/lib - export PQ_LIB_DIR ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests # Do install *before* running rust tests because they might recompile the @@ -217,8 +215,6 @@ jobs: env: NEXTEST_RETRIES: 3 run: | - PQ_LIB_DIR=$(pwd)/pg_install/v16/lib - export PQ_LIB_DIR LD_LIBRARY_PATH=$(pwd)/pg_install/v17/lib export LD_LIBRARY_PATH diff --git a/.github/workflows/build-macos.yml b/.github/workflows/build-macos.yml index 01d82a1ed2..347a511e98 100644 --- a/.github/workflows/build-macos.yml +++ b/.github/workflows/build-macos.yml @@ -235,7 +235,7 @@ jobs: echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV - name: Run cargo build (only for v17) - run: PQ_LIB_DIR=$(pwd)/pg_install/v17/lib cargo build --all --release -j$(sysctl -n hw.ncpu) + run: cargo build --all --release -j$(sysctl -n hw.ncpu) - name: Check that no warnings are produced (only for v17) run: ./run_clippy.sh diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 5b5910badf..f077e04d1c 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -114,7 +114,7 @@ jobs: run: make walproposer-lib -j$(nproc) - name: Produce the build stats - run: PQ_LIB_DIR=$(pwd)/pg_install/v17/lib cargo build --all --release --timings -j$(nproc) + run: cargo build --all --release --timings -j$(nproc) - name: Configure AWS credentials uses: aws-actions/configure-aws-credentials@v4 diff --git a/Cargo.lock b/Cargo.lock index cdc620e485..0133c83564 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -932,6 +932,18 @@ version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b" +[[package]] +name = "bb8" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d89aabfae550a5c44b43ab941844ffcd2e993cb6900b342debf59e9ea74acdb8" +dependencies = [ + "async-trait", + "futures-util", + "parking_lot 0.12.1", + "tokio", +] + [[package]] name = "bcder" version = "0.7.4" @@ -1790,11 +1802,24 @@ dependencies = [ "chrono", "diesel_derives", "itoa", - "pq-sys", - "r2d2", "serde_json", ] +[[package]] +name = "diesel-async" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51a307ac00f7c23f526a04a77761a0519b9f0eb2838ebf5b905a58580095bdcb" +dependencies = [ + "async-trait", + "bb8", + "diesel", + "futures-util", + "scoped-futures", + "tokio", + "tokio-postgres", +] + [[package]] name = "diesel_derives" version = "2.2.1" @@ -4645,15 +4670,6 @@ version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" 
-[[package]] -name = "pq-sys" -version = "0.6.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6cc05d7ea95200187117196eee9edd0644424911821aeb28a18ce60ea0b8793" -dependencies = [ - "vcpkg", -] - [[package]] name = "pq_proto" version = "0.1.0" @@ -4966,17 +4982,6 @@ dependencies = [ "proc-macro2", ] -[[package]] -name = "r2d2" -version = "0.8.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51de85fb3fb6524929c8a2eb85e6b6d363de4e8c48f9e2c2eac4944abc181c93" -dependencies = [ - "log", - "parking_lot 0.12.1", - "scheduled-thread-pool", -] - [[package]] name = "rand" version = "0.7.3" @@ -5797,12 +5802,12 @@ dependencies = [ ] [[package]] -name = "scheduled-thread-pool" -version = "0.2.7" +name = "scoped-futures" +version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3cbc66816425a074528352f5789333ecff06ca41b36b0b0efdfbb29edc391a19" +checksum = "1b24aae2d0636530f359e9d5ef0c04669d11c5e756699b27a6a6d845d8329091" dependencies = [ - "parking_lot 0.12.1", + "pin-project-lite", ] [[package]] @@ -6337,6 +6342,7 @@ dependencies = [ "clap", "control_plane", "diesel", + "diesel-async", "diesel_migrations", "fail", "futures", @@ -6351,10 +6357,12 @@ dependencies = [ "pageserver_api", "pageserver_client", "postgres_connection", - "r2d2", "rand 0.8.5", "reqwest", "routerify", + "rustls 0.23.18", + "rustls-native-certs 0.8.0", + "scoped-futures", "scopeguard", "serde", "serde_json", @@ -6362,6 +6370,8 @@ dependencies = [ "strum_macros", "thiserror 1.0.69", "tokio", + "tokio-postgres", + "tokio-postgres-rustls", "tokio-util", "tracing", "utils", @@ -6604,7 +6614,7 @@ dependencies = [ "fastrand 2.2.0", "once_cell", "rustix", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -7562,12 +7572,6 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" -[[package]] -name = "vcpkg" -version = "0.2.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" - [[package]] name = "version_check" version = "0.9.4" diff --git a/Dockerfile b/Dockerfile index f80666529b..7ba54c8ca5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -45,7 +45,7 @@ COPY --chown=nonroot . . 
ARG ADDITIONAL_RUSTFLAGS RUN set -e \ - && PQ_LIB_DIR=$(pwd)/pg_install/v${STABLE_PG_VERSION}/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo build \ + && RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo build \ --bin pg_sni_router \ --bin pageserver \ --bin pagectl \ diff --git a/Makefile b/Makefile index 22ebfea7d5..d1238caebf 100644 --- a/Makefile +++ b/Makefile @@ -64,8 +64,6 @@ CARGO_BUILD_FLAGS += $(filter -j1,$(MAKEFLAGS)) CARGO_CMD_PREFIX += $(if $(filter n,$(MAKEFLAGS)),,+) # Force cargo not to print progress bar CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1 -# Set PQ_LIB_DIR to make sure `storage_controller` get linked with bundled libpq (through diesel) -CARGO_CMD_PREFIX += PQ_LIB_DIR=$(POSTGRES_INSTALL_DIR)/v16/lib CACHEDIR_TAG_CONTENTS := "Signature: 8a477f597d28d172789f06886806bc55" diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index caaa22d0a5..63f43cdf62 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -32,6 +32,7 @@ postgres_connection.workspace = true rand.workspace = true reqwest = { workspace = true, features = ["stream"] } routerify.workspace = true +rustls-native-certs.workspace = true serde.workspace = true serde_json.workspace = true thiserror.workspace = true @@ -39,18 +40,20 @@ tokio.workspace = true tokio-util.workspace = true tracing.workspace = true measured.workspace = true +rustls.workspace = true scopeguard.workspace = true strum.workspace = true strum_macros.workspace = true +tokio-postgres.workspace = true +tokio-postgres-rustls.workspace = true diesel = { version = "2.2.6", features = [ "serde_json", - "postgres", - "r2d2", "chrono", ] } +diesel-async = { version = "0.5.2", features = ["postgres", "bb8", "async-connection-wrapper"] } diesel_migrations = { version = "2.2.0" } -r2d2 = { version = "0.8.10" } +scoped-futures = "0.1.4" utils = { path = "../libs/utils/" } metrics = { path = "../libs/metrics/" } diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 801409d612..659c088d51 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -308,7 +308,7 @@ async fn async_main() -> anyhow::Result<()> { // Validate that we can connect to the database Persistence::await_connection(&secrets.database_url, args.db_connect_timeout.into()).await?; - let persistence = Arc::new(Persistence::new(secrets.database_url)); + let persistence = Arc::new(Persistence::new(secrets.database_url).await); let service = Service::spawn(config, persistence.clone()).await?; diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 37bfaf1139..880f203064 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -1,13 +1,20 @@ pub(crate) mod split_state; use std::collections::HashMap; use std::str::FromStr; +use std::sync::Arc; use std::time::Duration; use std::time::Instant; use self::split_state::SplitState; -use diesel::pg::PgConnection; use diesel::prelude::*; -use diesel::Connection; +use diesel_async::async_connection_wrapper::AsyncConnectionWrapper; +use diesel_async::pooled_connection::bb8::Pool; +use diesel_async::pooled_connection::AsyncDieselConnectionManager; +use diesel_async::pooled_connection::ManagerConfig; +use diesel_async::AsyncPgConnection; +use diesel_async::RunQueryDsl; +use 
futures::future::BoxFuture; +use futures::FutureExt; use itertools::Itertools; use pageserver_api::controller_api::AvailabilityZone; use pageserver_api::controller_api::MetadataHealthRecord; @@ -20,6 +27,8 @@ use pageserver_api::shard::ShardConfigError; use pageserver_api::shard::ShardIdentity; use pageserver_api::shard::ShardStripeSize; use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId}; +use rustls::crypto::ring; +use scoped_futures::ScopedBoxFuture; use serde::{Deserialize, Serialize}; use utils::generation::Generation; use utils::id::{NodeId, TenantId}; @@ -60,7 +69,7 @@ const MIGRATIONS: EmbeddedMigrations = embed_migrations!("./migrations"); /// updated, and reads of nodes are always from memory, not the database. We only require that /// we can UPDATE a node's scheduling mode reasonably quickly to mark a bad node offline. pub struct Persistence { - connection_pool: diesel::r2d2::Pool>, + connection_pool: Pool, } /// Legacy format, for use in JSON compat objects in test environment @@ -76,7 +85,7 @@ pub(crate) enum DatabaseError { #[error(transparent)] Connection(#[from] diesel::result::ConnectionError), #[error(transparent)] - ConnectionPool(#[from] r2d2::Error), + ConnectionPool(#[from] diesel_async::pooled_connection::bb8::RunError), #[error("Logical error: {0}")] Logical(String), #[error("Migration error: {0}")] @@ -124,6 +133,7 @@ pub(crate) enum AbortShardSplitStatus { pub(crate) type DatabaseResult = Result; /// Some methods can operate on either a whole tenant or a single shard +#[derive(Clone)] pub(crate) enum TenantFilter { Tenant(TenantId), Shard(TenantShardId), @@ -136,6 +146,11 @@ pub(crate) struct ShardGenerationState { pub(crate) generation_pageserver: Option, } +// A generous allowance for how many times we may retry serializable transactions +// before giving up. This is not expected to be hit: it is a defensive measure in case we +// somehow engineer a situation where duelling transactions might otherwise live-lock. +const MAX_RETRIES: usize = 128; + impl Persistence { // The default postgres connection limit is 100. We use up to 99, to leave one free for a human admin under // normal circumstances. This assumes we have exclusive use of the database cluster to which we connect. @@ -145,12 +160,18 @@ impl Persistence { const IDLE_CONNECTION_TIMEOUT: Duration = Duration::from_secs(10); const MAX_CONNECTION_LIFETIME: Duration = Duration::from_secs(60); - pub fn new(database_url: String) -> Self { - let manager = diesel::r2d2::ConnectionManager::::new(database_url); + pub async fn new(database_url: String) -> Self { + let mut mgr_config = ManagerConfig::default(); + mgr_config.custom_setup = Box::new(establish_connection_rustls); + + let manager = AsyncDieselConnectionManager::::new_with_config( + database_url, + mgr_config, + ); // We will use a connection pool: this is primarily to _limit_ our connection count, rather than to optimize time // to execute queries (database queries are not generally on latency-sensitive paths). 
- let connection_pool = diesel::r2d2::Pool::builder() + let connection_pool = Pool::builder() .max_size(Self::MAX_CONNECTIONS) .max_lifetime(Some(Self::MAX_CONNECTION_LIFETIME)) .idle_timeout(Some(Self::IDLE_CONNECTION_TIMEOUT)) @@ -158,6 +179,7 @@ impl Persistence { .min_idle(Some(1)) .test_on_check_out(true) .build(manager) + .await .expect("Could not build connection pool"); Self { connection_pool } @@ -171,7 +193,7 @@ impl Persistence { ) -> Result<(), diesel::ConnectionError> { let started_at = Instant::now(); loop { - match PgConnection::establish(database_url) { + match establish_connection_rustls(database_url).await { Ok(_) => { tracing::info!("Connected to database."); return Ok(()); @@ -192,57 +214,22 @@ impl Persistence { pub(crate) async fn migration_run(&self) -> DatabaseResult<()> { use diesel_migrations::{HarnessWithOutput, MigrationHarness}; - self.with_conn(move |conn| -> DatabaseResult<()> { - HarnessWithOutput::write_to_stdout(conn) - .run_pending_migrations(MIGRATIONS) - .map(|_| ()) - .map_err(|e| DatabaseError::Migration(e.to_string())) - }) - .await - } - - /// Wraps `with_conn` in order to collect latency and error metrics - async fn with_measured_conn(&self, op: DatabaseOperation, func: F) -> DatabaseResult - where - F: Fn(&mut PgConnection) -> DatabaseResult + Send + 'static, - R: Send + 'static, - { - let latency = &METRICS_REGISTRY - .metrics_group - .storage_controller_database_query_latency; - let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup { operation: op }); - - let res = self.with_conn(func).await; - - if let Err(err) = &res { - let error_counter = &METRICS_REGISTRY - .metrics_group - .storage_controller_database_query_error; - error_counter.inc(DatabaseQueryErrorLabelGroup { - error_type: err.error_label(), - operation: op, - }) - } - - res - } - - /// Call the provided function in a tokio blocking thread, with a Diesel database connection. - async fn with_conn(&self, func: F) -> DatabaseResult - where - F: Fn(&mut PgConnection) -> DatabaseResult + Send + 'static, - R: Send + 'static, - { - // A generous allowance for how many times we may retry serializable transactions - // before giving up. This is not expected to be hit: it is a defensive measure in case we - // somehow engineer a situation where duelling transactions might otherwise live-lock. - const MAX_RETRIES: usize = 128; - - let mut conn = self.connection_pool.get()?; - tokio::task::spawn_blocking(move || -> DatabaseResult { + // Can't use self.with_conn here as we do spawn_blocking which requires static. 
+ let conn = self + .connection_pool + .dedicated_connection() + .await + .map_err(|e| DatabaseError::Migration(e.to_string()))?; + let mut async_wrapper: AsyncConnectionWrapper = + AsyncConnectionWrapper::from(conn); + tokio::task::spawn_blocking(move || { let mut retry_count = 0; loop { - match conn.build_transaction().serializable().run(|c| func(c)) { + let result = HarnessWithOutput::write_to_stdout(&mut async_wrapper) + .run_pending_migrations(MIGRATIONS) + .map(|_| ()) + .map_err(|e| DatabaseError::Migration(e.to_string())); + match result { Ok(r) => break Ok(r), Err( err @ DatabaseError::Query(diesel::result::Error::DatabaseError( @@ -271,33 +258,112 @@ impl Persistence { } }) .await - .expect("Task panic") + .map_err(|e| DatabaseError::Migration(e.to_string()))??; + Ok(()) + } + + /// Wraps `with_conn` in order to collect latency and error metrics + async fn with_measured_conn<'a, 'b, F, R>( + &self, + op: DatabaseOperation, + func: F, + ) -> DatabaseResult + where + F: for<'r> Fn(&'r mut AsyncPgConnection) -> ScopedBoxFuture<'b, 'r, DatabaseResult> + + Send + + std::marker::Sync + + 'a, + R: Send + 'b, + { + let latency = &METRICS_REGISTRY + .metrics_group + .storage_controller_database_query_latency; + let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup { operation: op }); + + let res = self.with_conn(func).await; + + if let Err(err) = &res { + let error_counter = &METRICS_REGISTRY + .metrics_group + .storage_controller_database_query_error; + error_counter.inc(DatabaseQueryErrorLabelGroup { + error_type: err.error_label(), + operation: op, + }) + } + + res + } + + /// Call the provided function with a Diesel database connection in a retry loop + async fn with_conn<'a, 'b, F, R>(&self, func: F) -> DatabaseResult + where + F: for<'r> Fn(&'r mut AsyncPgConnection) -> ScopedBoxFuture<'b, 'r, DatabaseResult> + + Send + + std::marker::Sync + + 'a, + R: Send + 'b, + { + let mut retry_count = 0; + loop { + let mut conn = self.connection_pool.get().await?; + match conn + .build_transaction() + .serializable() + .run(|c| func(c)) + .await + { + Ok(r) => break Ok(r), + Err( + err @ DatabaseError::Query(diesel::result::Error::DatabaseError( + diesel::result::DatabaseErrorKind::SerializationFailure, + _, + )), + ) => { + retry_count += 1; + if retry_count > MAX_RETRIES { + tracing::error!( + "Exceeded max retries on SerializationFailure errors: {err:?}" + ); + break Err(err); + } else { + // Retry on serialization errors: these are expected, because even though our + // transactions don't fight for the same rows, they will occasionally collide + // on index pages (e.g. 
increment_generation for unrelated shards can collide) + tracing::debug!("Retrying transaction on serialization failure {err:?}"); + continue; + } + } + Err(e) => break Err(e), + } + } } /// When a node is first registered, persist it before using it for anything pub(crate) async fn insert_node(&self, node: &Node) -> DatabaseResult<()> { - let np = node.to_persistent(); - self.with_measured_conn( - DatabaseOperation::InsertNode, - move |conn| -> DatabaseResult<()> { + let np = &node.to_persistent(); + self.with_measured_conn(DatabaseOperation::InsertNode, move |conn| { + Box::pin(async move { diesel::insert_into(crate::schema::nodes::table) - .values(&np) - .execute(conn)?; + .values(np) + .execute(conn) + .await?; Ok(()) - }, - ) + }) + }) .await } /// At startup, populate the list of nodes which our shards may be placed on pub(crate) async fn list_nodes(&self) -> DatabaseResult> { let nodes: Vec = self - .with_measured_conn( - DatabaseOperation::ListNodes, - move |conn| -> DatabaseResult<_> { - Ok(crate::schema::nodes::table.load::(conn)?) - }, - ) + .with_measured_conn(DatabaseOperation::ListNodes, move |conn| { + Box::pin(async move { + Ok(crate::schema::nodes::table + .load::(conn) + .await?) + }) + }) .await?; tracing::info!("list_nodes: loaded {} nodes", nodes.len()); @@ -313,11 +379,14 @@ impl Persistence { use crate::schema::nodes::dsl::*; let updated = self .with_measured_conn(DatabaseOperation::UpdateNode, move |conn| { - let updated = diesel::update(nodes) - .filter(node_id.eq(input_node_id.0 as i64)) - .set((scheduling_policy.eq(String::from(input_scheduling)),)) - .execute(conn)?; - Ok(updated) + Box::pin(async move { + let updated = diesel::update(nodes) + .filter(node_id.eq(input_node_id.0 as i64)) + .set((scheduling_policy.eq(String::from(input_scheduling)),)) + .execute(conn) + .await?; + Ok(updated) + }) }) .await?; @@ -339,17 +408,16 @@ impl Persistence { &self, ) -> DatabaseResult> { use crate::schema::tenant_shards::dsl::*; - self.with_measured_conn( - DatabaseOperation::ListTenantShards, - move |conn| -> DatabaseResult<_> { + self.with_measured_conn(DatabaseOperation::ListTenantShards, move |conn| { + Box::pin(async move { let query = tenant_shards.filter( placement_policy.ne(serde_json::to_string(&PlacementPolicy::Detached).unwrap()), ); - let result = query.load::(conn)?; + let result = query.load::(conn).await?; Ok(result) - }, - ) + }) + }) .await } @@ -359,15 +427,14 @@ impl Persistence { filter_tenant_id: TenantId, ) -> DatabaseResult> { use crate::schema::tenant_shards::dsl::*; - self.with_measured_conn( - DatabaseOperation::LoadTenant, - move |conn| -> DatabaseResult<_> { + self.with_measured_conn(DatabaseOperation::LoadTenant, move |conn| { + Box::pin(async move { let query = tenant_shards.filter(tenant_id.eq(filter_tenant_id.to_string())); - let result = query.load::(conn)?; + let result = query.load::(conn).await?; Ok(result) - }, - ) + }) + }) .await } @@ -393,19 +460,22 @@ impl Persistence { }) .collect::>(); - self.with_measured_conn( - DatabaseOperation::InsertTenantShards, - move |conn| -> DatabaseResult<()> { + let shards = &shards; + let metadata_health_records = &metadata_health_records; + self.with_measured_conn(DatabaseOperation::InsertTenantShards, move |conn| { + Box::pin(async move { diesel::insert_into(tenant_shards::table) - .values(&shards) - .execute(conn)?; + .values(shards) + .execute(conn) + .await?; diesel::insert_into(metadata_health::table) - .values(&metadata_health_records) - .execute(conn)?; + .values(metadata_health_records) + 
.execute(conn) + .await?; Ok(()) - }, - ) + }) + }) .await } @@ -413,31 +483,31 @@ impl Persistence { /// the tenant from memory on this server. pub(crate) async fn delete_tenant(&self, del_tenant_id: TenantId) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; - self.with_measured_conn( - DatabaseOperation::DeleteTenant, - move |conn| -> DatabaseResult<()> { + self.with_measured_conn(DatabaseOperation::DeleteTenant, move |conn| { + Box::pin(async move { // `metadata_health` status (if exists) is also deleted based on the cascade behavior. diesel::delete(tenant_shards) .filter(tenant_id.eq(del_tenant_id.to_string())) - .execute(conn)?; + .execute(conn) + .await?; Ok(()) - }, - ) + }) + }) .await } pub(crate) async fn delete_node(&self, del_node_id: NodeId) -> DatabaseResult<()> { use crate::schema::nodes::dsl::*; - self.with_measured_conn( - DatabaseOperation::DeleteNode, - move |conn| -> DatabaseResult<()> { + self.with_measured_conn(DatabaseOperation::DeleteNode, move |conn| { + Box::pin(async move { diesel::delete(nodes) .filter(node_id.eq(del_node_id.0 as i64)) - .execute(conn)?; + .execute(conn) + .await?; Ok(()) - }, - ) + }) + }) .await } @@ -454,34 +524,41 @@ impl Persistence { use crate::schema::tenant_shards::dsl::*; let updated = self .with_measured_conn(DatabaseOperation::ReAttach, move |conn| { - let rows_updated = diesel::update(tenant_shards) - .filter(generation_pageserver.eq(input_node_id.0 as i64)) - .set(generation.eq(generation + 1)) - .execute(conn)?; + Box::pin(async move { + let rows_updated = diesel::update(tenant_shards) + .filter(generation_pageserver.eq(input_node_id.0 as i64)) + .set(generation.eq(generation + 1)) + .execute(conn) + .await?; - tracing::info!("Incremented {} tenants' generations", rows_updated); + tracing::info!("Incremented {} tenants' generations", rows_updated); - // TODO: UPDATE+SELECT in one query + // TODO: UPDATE+SELECT in one query - let updated = tenant_shards - .filter(generation_pageserver.eq(input_node_id.0 as i64)) - .select(TenantShardPersistence::as_select()) - .load(conn)?; + let updated = tenant_shards + .filter(generation_pageserver.eq(input_node_id.0 as i64)) + .select(TenantShardPersistence::as_select()) + .load(conn) + .await?; - // If the node went through a drain and restart phase before re-attaching, - // then reset it's node scheduling policy to active. - diesel::update(nodes) - .filter(node_id.eq(input_node_id.0 as i64)) - .filter( - scheduling_policy - .eq(String::from(NodeSchedulingPolicy::PauseForRestart)) - .or(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Draining))) - .or(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Filling))), - ) - .set(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Active))) - .execute(conn)?; + // If the node went through a drain and restart phase before re-attaching, + // then reset it's node scheduling policy to active. 
+ diesel::update(nodes) + .filter(node_id.eq(input_node_id.0 as i64)) + .filter( + scheduling_policy + .eq(String::from(NodeSchedulingPolicy::PauseForRestart)) + .or(scheduling_policy + .eq(String::from(NodeSchedulingPolicy::Draining))) + .or(scheduling_policy + .eq(String::from(NodeSchedulingPolicy::Filling))), + ) + .set(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Active))) + .execute(conn) + .await?; - Ok(updated) + Ok(updated) + }) }) .await?; @@ -518,19 +595,22 @@ impl Persistence { use crate::schema::tenant_shards::dsl::*; let updated = self .with_measured_conn(DatabaseOperation::IncrementGeneration, move |conn| { - let updated = diesel::update(tenant_shards) - .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) - .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) - .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) - .set(( - generation.eq(generation + 1), - generation_pageserver.eq(node_id.0 as i64), - )) - // TODO: only returning() the generation column - .returning(TenantShardPersistence::as_returning()) - .get_result(conn)?; + Box::pin(async move { + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) + .set(( + generation.eq(generation + 1), + generation_pageserver.eq(node_id.0 as i64), + )) + // TODO: only returning() the generation column + .returning(TenantShardPersistence::as_returning()) + .get_result(conn) + .await?; - Ok(updated) + Ok(updated) + }) }) .await?; @@ -562,12 +642,15 @@ impl Persistence { use crate::schema::tenant_shards::dsl::*; let rows = self .with_measured_conn(DatabaseOperation::TenantGenerations, move |conn| { - let result = tenant_shards - .filter(tenant_id.eq(filter_tenant_id.to_string())) - .select(TenantShardPersistence::as_select()) - .order(shard_number) - .load(conn)?; - Ok(result) + Box::pin(async move { + let result = tenant_shards + .filter(tenant_id.eq(filter_tenant_id.to_string())) + .select(TenantShardPersistence::as_select()) + .order(shard_number) + .load(conn) + .await?; + Ok(result) + }) }) .await?; @@ -615,15 +698,18 @@ impl Persistence { break; } + let in_clause = &in_clause; let chunk_rows = self .with_measured_conn(DatabaseOperation::ShardGenerations, move |conn| { - // diesel doesn't support multi-column IN queries, so we compose raw SQL. No escaping is required because - // the inputs are strongly typed and cannot carry any user-supplied raw string content. - let result : Vec = diesel::sql_query( - format!("SELECT * from tenant_shards where (tenant_id, shard_number, shard_count) in ({in_clause});").as_str() - ).load(conn)?; + Box::pin(async move { + // diesel doesn't support multi-column IN queries, so we compose raw SQL. No escaping is required because + // the inputs are strongly typed and cannot carry any user-supplied raw string content. 
+ let result : Vec = diesel::sql_query( + format!("SELECT * from tenant_shards where (tenant_id, shard_number, shard_count) in ({in_clause});").as_str() + ).load(conn).await?; - Ok(result) + Ok(result) + }) }) .await?; rows.extend(chunk_rows.into_iter()) @@ -657,51 +743,58 @@ impl Persistence { ) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; + let tenant = &tenant; + let input_placement_policy = &input_placement_policy; + let input_config = &input_config; + let input_generation = &input_generation; + let input_scheduling_policy = &input_scheduling_policy; self.with_measured_conn(DatabaseOperation::UpdateTenantShard, move |conn| { - let query = match tenant { - TenantFilter::Shard(tenant_shard_id) => diesel::update(tenant_shards) - .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) - .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) - .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) - .into_boxed(), - TenantFilter::Tenant(input_tenant_id) => diesel::update(tenant_shards) - .filter(tenant_id.eq(input_tenant_id.to_string())) - .into_boxed(), - }; + Box::pin(async move { + let query = match tenant { + TenantFilter::Shard(tenant_shard_id) => diesel::update(tenant_shards) + .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) + .into_boxed(), + TenantFilter::Tenant(input_tenant_id) => diesel::update(tenant_shards) + .filter(tenant_id.eq(input_tenant_id.to_string())) + .into_boxed(), + }; - // Clear generation_pageserver if we are moving into a state where we won't have - // any attached pageservers. - let input_generation_pageserver = match input_placement_policy { - None | Some(PlacementPolicy::Attached(_)) => None, - Some(PlacementPolicy::Detached | PlacementPolicy::Secondary) => Some(None), - }; + // Clear generation_pageserver if we are moving into a state where we won't have + // any attached pageservers. 
+ let input_generation_pageserver = match input_placement_policy { + None | Some(PlacementPolicy::Attached(_)) => None, + Some(PlacementPolicy::Detached | PlacementPolicy::Secondary) => Some(None), + }; - #[derive(AsChangeset)] - #[diesel(table_name = crate::schema::tenant_shards)] - struct ShardUpdate { - generation: Option, - placement_policy: Option, - config: Option, - scheduling_policy: Option, - generation_pageserver: Option>, - } + #[derive(AsChangeset)] + #[diesel(table_name = crate::schema::tenant_shards)] + struct ShardUpdate { + generation: Option, + placement_policy: Option, + config: Option, + scheduling_policy: Option, + generation_pageserver: Option>, + } - let update = ShardUpdate { - generation: input_generation.map(|g| g.into().unwrap() as i32), - placement_policy: input_placement_policy - .as_ref() - .map(|p| serde_json::to_string(&p).unwrap()), - config: input_config - .as_ref() - .map(|c| serde_json::to_string(&c).unwrap()), - scheduling_policy: input_scheduling_policy - .map(|p| serde_json::to_string(&p).unwrap()), - generation_pageserver: input_generation_pageserver, - }; + let update = ShardUpdate { + generation: input_generation.map(|g| g.into().unwrap() as i32), + placement_policy: input_placement_policy + .as_ref() + .map(|p| serde_json::to_string(&p).unwrap()), + config: input_config + .as_ref() + .map(|c| serde_json::to_string(&c).unwrap()), + scheduling_policy: input_scheduling_policy + .map(|p| serde_json::to_string(&p).unwrap()), + generation_pageserver: input_generation_pageserver, + }; - query.set(update).execute(conn)?; + query.set(update).execute(conn).await?; - Ok(()) + Ok(()) + }) }) .await?; @@ -715,23 +808,27 @@ impl Persistence { ) -> DatabaseResult)>> { use crate::schema::tenant_shards::dsl::*; + let preferred_azs = preferred_azs.as_slice(); self.with_measured_conn(DatabaseOperation::SetPreferredAzs, move |conn| { - let mut shards_updated = Vec::default(); + Box::pin(async move { + let mut shards_updated = Vec::default(); - for (tenant_shard_id, preferred_az) in preferred_azs.iter() { - let updated = diesel::update(tenant_shards) - .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) - .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) - .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) - .set(preferred_az_id.eq(preferred_az.as_ref().map(|az| az.0.clone()))) - .execute(conn)?; + for (tenant_shard_id, preferred_az) in preferred_azs.iter() { + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) + .set(preferred_az_id.eq(preferred_az.as_ref().map(|az| az.0.clone()))) + .execute(conn) + .await?; - if updated == 1 { - shards_updated.push((*tenant_shard_id, preferred_az.clone())); + if updated == 1 { + shards_updated.push((*tenant_shard_id, preferred_az.clone())); + } } - } - Ok(shards_updated) + Ok(shards_updated) + }) }) .await } @@ -739,17 +836,21 @@ impl Persistence { pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> { use crate::schema::tenant_shards::dsl::*; self.with_measured_conn(DatabaseOperation::Detach, move |conn| { - let updated = diesel::update(tenant_shards) - .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) - .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) - .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) - .set(( - 
generation_pageserver.eq(Option::::None), - placement_policy.eq(serde_json::to_string(&PlacementPolicy::Detached).unwrap()), - )) - .execute(conn)?; + Box::pin(async move { + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) + .set(( + generation_pageserver.eq(Option::::None), + placement_policy + .eq(serde_json::to_string(&PlacementPolicy::Detached).unwrap()), + )) + .execute(conn) + .await?; - Ok(updated) + Ok(updated) + }) }) .await?; @@ -768,14 +869,16 @@ impl Persistence { parent_to_children: Vec<(TenantShardId, Vec)>, ) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; - self.with_measured_conn(DatabaseOperation::BeginShardSplit, move |conn| -> DatabaseResult<()> { + let parent_to_children = parent_to_children.as_slice(); + self.with_measured_conn(DatabaseOperation::BeginShardSplit, move |conn| { + Box::pin(async move { // Mark parent shards as splitting let updated = diesel::update(tenant_shards) .filter(tenant_id.eq(split_tenant_id.to_string())) .filter(shard_count.eq(old_shard_count.literal() as i32)) .set((splitting.eq(1),)) - .execute(conn)?; + .execute(conn).await?; if u8::try_from(updated) .map_err(|_| DatabaseError::Logical( format!("Overflow existing shard count {} while splitting", updated)) @@ -788,7 +891,7 @@ impl Persistence { } // FIXME: spurious clone to sidestep closure move rules - let parent_to_children = parent_to_children.clone(); + let parent_to_children = parent_to_children.to_vec(); // Insert child shards for (parent_shard_id, children) in parent_to_children { @@ -796,7 +899,7 @@ impl Persistence { .filter(tenant_id.eq(parent_shard_id.tenant_id.to_string())) .filter(shard_number.eq(parent_shard_id.shard_number.0 as i32)) .filter(shard_count.eq(parent_shard_id.shard_count.literal() as i32)) - .load::(conn)?; + .load::(conn).await?; let parent = if parent.len() != 1 { return Err(DatabaseError::Logical(format!( "Parent shard {parent_shard_id} not found" @@ -811,12 +914,13 @@ impl Persistence { debug_assert!(shard.splitting == SplitState::Splitting); diesel::insert_into(tenant_shards) .values(shard) - .execute(conn)?; + .execute(conn).await?; } } Ok(()) }) + }) .await } @@ -828,25 +932,26 @@ impl Persistence { old_shard_count: ShardCount, ) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; - self.with_measured_conn( - DatabaseOperation::CompleteShardSplit, - move |conn| -> DatabaseResult<()> { + self.with_measured_conn(DatabaseOperation::CompleteShardSplit, move |conn| { + Box::pin(async move { // Drop parent shards diesel::delete(tenant_shards) .filter(tenant_id.eq(split_tenant_id.to_string())) .filter(shard_count.eq(old_shard_count.literal() as i32)) - .execute(conn)?; + .execute(conn) + .await?; // Clear sharding flag let updated = diesel::update(tenant_shards) .filter(tenant_id.eq(split_tenant_id.to_string())) .set((splitting.eq(0),)) - .execute(conn)?; + .execute(conn) + .await?; debug_assert!(updated > 0); Ok(()) - }, - ) + }) + }) .await } @@ -858,15 +963,15 @@ impl Persistence { new_shard_count: ShardCount, ) -> DatabaseResult { use crate::schema::tenant_shards::dsl::*; - self.with_measured_conn( - DatabaseOperation::AbortShardSplit, - move |conn| -> DatabaseResult { + self.with_measured_conn(DatabaseOperation::AbortShardSplit, move |conn| { + Box::pin(async move { // Clear the splitting state on parent shards let updated = 
diesel::update(tenant_shards) .filter(tenant_id.eq(split_tenant_id.to_string())) .filter(shard_count.ne(new_shard_count.literal() as i32)) .set((splitting.eq(0),)) - .execute(conn)?; + .execute(conn) + .await?; // Parent shards are already gone: we cannot abort. if updated == 0 { @@ -886,11 +991,12 @@ impl Persistence { diesel::delete(tenant_shards) .filter(tenant_id.eq(split_tenant_id.to_string())) .filter(shard_count.eq(new_shard_count.literal() as i32)) - .execute(conn)?; + .execute(conn) + .await?; Ok(AbortShardSplitStatus::Aborted) - }, - ) + }) + }) .await } @@ -906,25 +1012,28 @@ impl Persistence { ) -> DatabaseResult<()> { use crate::schema::metadata_health::dsl::*; - self.with_measured_conn( - DatabaseOperation::UpdateMetadataHealth, - move |conn| -> DatabaseResult<_> { + let healthy_records = healthy_records.as_slice(); + let unhealthy_records = unhealthy_records.as_slice(); + self.with_measured_conn(DatabaseOperation::UpdateMetadataHealth, move |conn| { + Box::pin(async move { diesel::insert_into(metadata_health) - .values(&healthy_records) + .values(healthy_records) .on_conflict((tenant_id, shard_number, shard_count)) .do_update() .set((healthy.eq(true), last_scrubbed_at.eq(now))) - .execute(conn)?; + .execute(conn) + .await?; diesel::insert_into(metadata_health) - .values(&unhealthy_records) + .values(unhealthy_records) .on_conflict((tenant_id, shard_number, shard_count)) .do_update() .set((healthy.eq(false), last_scrubbed_at.eq(now))) - .execute(conn)?; + .execute(conn) + .await?; Ok(()) - }, - ) + }) + }) .await } @@ -933,15 +1042,13 @@ impl Persistence { pub(crate) async fn list_metadata_health_records( &self, ) -> DatabaseResult> { - self.with_measured_conn( - DatabaseOperation::ListMetadataHealth, - move |conn| -> DatabaseResult<_> { - Ok( - crate::schema::metadata_health::table - .load::(conn)?, - ) - }, - ) + self.with_measured_conn(DatabaseOperation::ListMetadataHealth, move |conn| { + Box::pin(async { + Ok(crate::schema::metadata_health::table + .load::(conn) + .await?) + }) + }) .await } @@ -953,10 +1060,15 @@ impl Persistence { use crate::schema::metadata_health::dsl::*; self.with_measured_conn( DatabaseOperation::ListMetadataHealthUnhealthy, - move |conn| -> DatabaseResult<_> { - Ok(crate::schema::metadata_health::table - .filter(healthy.eq(false)) - .load::(conn)?) + move |conn| { + Box::pin(async { + DatabaseResult::Ok( + crate::schema::metadata_health::table + .filter(healthy.eq(false)) + .load::(conn) + .await?, + ) + }) }, ) .await @@ -970,15 +1082,14 @@ impl Persistence { ) -> DatabaseResult> { use crate::schema::metadata_health::dsl::*; - self.with_measured_conn( - DatabaseOperation::ListMetadataHealthOutdated, - move |conn| -> DatabaseResult<_> { + self.with_measured_conn(DatabaseOperation::ListMetadataHealthOutdated, move |conn| { + Box::pin(async move { let query = metadata_health.filter(last_scrubbed_at.lt(earlier)); - let res = query.load::(conn)?; + let res = query.load::(conn).await?; Ok(res) - }, - ) + }) + }) .await } @@ -986,12 +1097,13 @@ impl Persistence { /// It is an error for the table to contain more than one entry. pub(crate) async fn get_leader(&self) -> DatabaseResult> { let mut leader: Vec = self - .with_measured_conn( - DatabaseOperation::GetLeader, - move |conn| -> DatabaseResult<_> { - Ok(crate::schema::controllers::table.load::(conn)?) - }, - ) + .with_measured_conn(DatabaseOperation::GetLeader, move |conn| { + Box::pin(async move { + Ok(crate::schema::controllers::table + .load::(conn) + .await?) 
+ }) + }) .await?; if leader.len() > 1 { @@ -1014,26 +1126,33 @@ impl Persistence { use crate::schema::controllers::dsl::*; let updated = self - .with_measured_conn( - DatabaseOperation::UpdateLeader, - move |conn| -> DatabaseResult { + .with_measured_conn(DatabaseOperation::UpdateLeader, move |conn| { + let prev = prev.clone(); + let new = new.clone(); + Box::pin(async move { let updated = match &prev { - Some(prev) => diesel::update(controllers) - .filter(address.eq(prev.address.clone())) - .filter(started_at.eq(prev.started_at)) - .set(( - address.eq(new.address.clone()), - started_at.eq(new.started_at), - )) - .execute(conn)?, - None => diesel::insert_into(controllers) - .values(new.clone()) - .execute(conn)?, + Some(prev) => { + diesel::update(controllers) + .filter(address.eq(prev.address.clone())) + .filter(started_at.eq(prev.started_at)) + .set(( + address.eq(new.address.clone()), + started_at.eq(new.started_at), + )) + .execute(conn) + .await? + } + None => { + diesel::insert_into(controllers) + .values(new.clone()) + .execute(conn) + .await? + } }; Ok(updated) - }, - ) + }) + }) .await?; if updated == 0 { @@ -1048,12 +1167,13 @@ impl Persistence { /// At startup, populate the list of nodes which our shards may be placed on pub(crate) async fn list_safekeepers(&self) -> DatabaseResult> { let safekeepers: Vec = self - .with_measured_conn( - DatabaseOperation::ListNodes, - move |conn| -> DatabaseResult<_> { - Ok(crate::schema::safekeepers::table.load::(conn)?) - }, - ) + .with_measured_conn(DatabaseOperation::ListNodes, move |conn| { + Box::pin(async move { + Ok(crate::schema::safekeepers::table + .load::(conn) + .await?) + }) + }) .await?; tracing::info!("list_safekeepers: loaded {} nodes", safekeepers.len()); @@ -1066,11 +1186,14 @@ impl Persistence { id: i64, ) -> Result { use crate::schema::safekeepers::dsl::{id as id_column, safekeepers}; - self.with_conn(move |conn| -> DatabaseResult { - Ok(safekeepers - .filter(id_column.eq(&id)) - .select(SafekeeperPersistence::as_select()) - .get_result(conn)?) + self.with_conn(move |conn| { + Box::pin(async move { + Ok(safekeepers + .filter(id_column.eq(&id)) + .select(SafekeeperPersistence::as_select()) + .get_result(conn) + .await?) 
+ }) }) .await } @@ -1081,26 +1204,30 @@ impl Persistence { ) -> Result<(), DatabaseError> { use crate::schema::safekeepers::dsl::*; - self.with_conn(move |conn| -> DatabaseResult<()> { - let bind = record - .as_insert_or_update() - .map_err(|e| DatabaseError::Logical(format!("{e}")))?; + self.with_conn(move |conn| { + let record = record.clone(); + Box::pin(async move { + let bind = record + .as_insert_or_update() + .map_err(|e| DatabaseError::Logical(format!("{e}")))?; - let inserted_updated = diesel::insert_into(safekeepers) - .values(&bind) - .on_conflict(id) - .do_update() - .set(&bind) - .execute(conn)?; + let inserted_updated = diesel::insert_into(safekeepers) + .values(&bind) + .on_conflict(id) + .do_update() + .set(&bind) + .execute(conn) + .await?; - if inserted_updated != 1 { - return Err(DatabaseError::Logical(format!( - "unexpected number of rows ({})", - inserted_updated - ))); - } + if inserted_updated != 1 { + return Err(DatabaseError::Logical(format!( + "unexpected number of rows ({})", + inserted_updated + ))); + } - Ok(()) + Ok(()) + }) }) .await } @@ -1112,31 +1239,73 @@ impl Persistence { ) -> Result<(), DatabaseError> { use crate::schema::safekeepers::dsl::*; - self.with_conn(move |conn| -> DatabaseResult<()> { - #[derive(Insertable, AsChangeset)] - #[diesel(table_name = crate::schema::safekeepers)] - struct UpdateSkSchedulingPolicy<'a> { - id: i64, - scheduling_policy: &'a str, - } - let scheduling_policy_ = String::from(scheduling_policy_); + self.with_conn(move |conn| { + Box::pin(async move { + #[derive(Insertable, AsChangeset)] + #[diesel(table_name = crate::schema::safekeepers)] + struct UpdateSkSchedulingPolicy<'a> { + id: i64, + scheduling_policy: &'a str, + } + let scheduling_policy_ = String::from(scheduling_policy_); - let rows_affected = diesel::update(safekeepers.filter(id.eq(id_))) - .set(scheduling_policy.eq(scheduling_policy_)) - .execute(conn)?; + let rows_affected = diesel::update(safekeepers.filter(id.eq(id_))) + .set(scheduling_policy.eq(scheduling_policy_)) + .execute(conn) + .await?; - if rows_affected != 1 { - return Err(DatabaseError::Logical(format!( - "unexpected number of rows ({rows_affected})", - ))); - } + if rows_affected != 1 { + return Err(DatabaseError::Logical(format!( + "unexpected number of rows ({rows_affected})", + ))); + } - Ok(()) + Ok(()) + }) }) .await } } +pub(crate) fn load_certs() -> anyhow::Result> { + let der_certs = rustls_native_certs::load_native_certs(); + + if !der_certs.errors.is_empty() { + anyhow::bail!("could not parse certificates: {:?}", der_certs.errors); + } + + let mut store = rustls::RootCertStore::empty(); + store.add_parsable_certificates(der_certs.certs); + Ok(Arc::new(store)) +} + +/// Loads the root certificates and constructs a client config suitable for connecting. +/// This function is blocking. +pub fn client_config_with_root_certs() -> anyhow::Result { + Ok( + rustls::ClientConfig::builder_with_provider(Arc::new(ring::default_provider())) + .with_safe_default_protocol_versions() + .expect("ring should support the default protocol versions") + .with_root_certificates(load_certs()?) + .with_no_client_auth(), + ) +} + +fn establish_connection_rustls(config: &str) -> BoxFuture> { + let fut = async { + // We first set up the way we want rustls to work. 
+ let rustls_config = client_config_with_root_certs() + .map_err(|err| ConnectionError::BadConnection(format!("{err:?}")))?; + let tls = tokio_postgres_rustls::MakeRustlsConnect::new(rustls_config); + let (client, conn) = tokio_postgres::connect(config, tls) + .await + .map_err(|e| ConnectionError::BadConnection(e.to_string()))?; + + AsyncPgConnection::try_from_client_and_connection(client, conn).await + }; + fut.boxed() +} + /// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably #[derive( QueryableByName, Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq, From b1e451091ad45d3e274dc4119e94f0d0307650b5 Mon Sep 17 00:00:00 2001 From: OBBO67 <35974943+OBBO67@users.noreply.github.com> Date: Mon, 3 Feb 2025 11:54:07 +0000 Subject: [PATCH 349/583] pageserver: clean up references to timeline delete marker, uninit marker (#5718) (#10627) ## Problem Since [#5580](https://github.com/neondatabase/neon/pull/5580) the delete and uninit file markers are no longer needed. ## Summary of changes Remove the remaining code for the delete and uninit markers. Additionally removes the `ends_with_suffix` function as it is no longer required. Closes [#5718](https://github.com/neondatabase/neon/issues/5718). --- pageserver/src/lib.rs | 27 --------------------------- pageserver/src/tenant.rs | 7 +------ 2 files changed, 1 insertion(+), 33 deletions(-) diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index ff6af3566c..f43cd08cf7 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -263,14 +263,6 @@ pub(crate) const TENANT_HEATMAP_BASENAME: &str = "heatmap-v1.json"; /// data directory at pageserver startup can be automatically removed. pub(crate) const TEMP_FILE_SUFFIX: &str = "___temp"; -/// A marker file to mark that a timeline directory was not fully initialized. -/// If a timeline directory with this marker is encountered at pageserver startup, -/// the timeline directory and the marker file are both removed. -/// Full path: `tenants//timelines/___uninit`. -pub(crate) const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit"; - -pub(crate) const TIMELINE_DELETE_MARK_SUFFIX: &str = "___delete"; - pub fn is_temporary(path: &Utf8Path) -> bool { match path.file_name() { Some(name) => name.ends_with(TEMP_FILE_SUFFIX), @@ -278,25 +270,6 @@ pub fn is_temporary(path: &Utf8Path) -> bool { } } -fn ends_with_suffix(path: &Utf8Path, suffix: &str) -> bool { - match path.file_name() { - Some(name) => name.ends_with(suffix), - None => false, - } -} - -// FIXME: DO NOT ADD new query methods like this, which will have a next step of parsing timelineid -// from the directory name. Instead create type "UninitMark(TimelineId)" and only parse it once -// from the name. - -pub(crate) fn is_uninit_mark(path: &Utf8Path) -> bool { - ends_with_suffix(path, TIMELINE_UNINIT_MARK_SUFFIX) -} - -pub(crate) fn is_delete_mark(path: &Utf8Path) -> bool { - ends_with_suffix(path, TIMELINE_DELETE_MARK_SUFFIX) -} - /// During pageserver startup, we need to order operations not to exhaust tokio worker threads by /// blocking. 
/// diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 657cc78e2c..1914a95562 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -95,7 +95,6 @@ use crate::context::{DownloadBehavior, RequestContext}; use crate::deletion_queue::DeletionQueueClient; use crate::deletion_queue::DeletionQueueError; use crate::import_datadir; -use crate::is_uninit_mark; use crate::l0_flush::L0FlushGlobalState; use crate::metrics::CONCURRENT_INITDBS; use crate::metrics::INITDB_RUN_TIME; @@ -1793,11 +1792,7 @@ impl Tenant { let entry = entry.context("read timeline dir entry")?; let entry_path = entry.path(); - let purge = if crate::is_temporary(entry_path) - // TODO: remove uninit mark code (https://github.com/neondatabase/neon/issues/5718) - || is_uninit_mark(entry_path) - || crate::is_delete_mark(entry_path) - { + let purge = if crate::is_temporary(entry_path) { true } else { match TimelineId::try_from(entry_path.file_name()) { From b1bc33eb4d23c77ce0b4a2c9c8d8453e1f7cd5ee Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Mon, 3 Feb 2025 12:44:47 +0000 Subject: [PATCH 350/583] Fix logical_replication_sync test fixture (#10531) Fixes flaky test_lr_with_slow_safekeeper test #10242 Fix query to `pg_catalog.pg_stat_subscription` catalog to handle table synchronization and parallel LR correctly. --- test_runner/fixtures/neon_fixtures.py | 56 +++++++++++++------ .../performance/test_logical_replication.py | 4 +- test_runner/regress/test_compute_catalog.py | 7 ++- test_runner/regress/test_layer_bloating.py | 2 +- .../regress/test_logical_replication.py | 28 +++++----- .../test_physical_and_logical_replicaiton.py | 4 +- .../regress/test_subscriber_branching.py | 3 +- 7 files changed, 65 insertions(+), 39 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 7e3cc19829..8909f7f249 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -4996,13 +4996,35 @@ def check_restored_datadir_content( assert (mismatch, error) == ([], []) +# wait for subscriber to catch up with publisher def logical_replication_sync( subscriber: PgProtocol, publisher: PgProtocol, + # pass subname explicitly to avoid confusion + # when multiple subscriptions are present + subname: str, sub_dbname: str | None = None, pub_dbname: str | None = None, -) -> Lsn: +): """Wait logical replication subscriber to sync with publisher.""" + + def initial_sync(): + # first check if the subscription is active `s`=`synchronized`, `r` = `ready` + query = f"""SELECT 1 FROM pg_subscription_rel join pg_catalog.pg_subscription + on pg_subscription_rel.srsubid = pg_subscription.oid + WHERE srsubstate NOT IN ('r', 's') and subname='{subname}'""" + + if sub_dbname is not None: + res = subscriber.safe_psql(query, dbname=sub_dbname) + else: + res = subscriber.safe_psql(query) + + assert (res is None) or (len(res) == 0) + + wait_until(initial_sync) + + # wait for the subscription to catch up with current state of publisher + # caller is responsible to call checkpoint before calling this function if pub_dbname is not None: publisher_lsn = Lsn( publisher.safe_psql("SELECT pg_current_wal_flush_lsn()", dbname=pub_dbname)[0][0] @@ -5010,23 +5032,23 @@ def logical_replication_sync( else: publisher_lsn = Lsn(publisher.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) - while True: - if sub_dbname is not None: - res = subscriber.safe_psql( - "select latest_end_lsn from pg_catalog.pg_stat_subscription", dbname=sub_dbname - )[0][0] - 
else: - res = subscriber.safe_psql( - "select latest_end_lsn from pg_catalog.pg_stat_subscription" - )[0][0] + def subscriber_catch_up(): + query = f"select latest_end_lsn from pg_catalog.pg_stat_subscription where latest_end_lsn is NOT NULL and subname='{subname}'" - if res: - log.info(f"subscriber_lsn={res}") - subscriber_lsn = Lsn(res) - log.info(f"Subscriber LSN={subscriber_lsn}, publisher LSN={publisher_lsn}") - if subscriber_lsn >= publisher_lsn: - return subscriber_lsn - time.sleep(0.5) + if sub_dbname is not None: + res = subscriber.safe_psql(query, dbname=sub_dbname) + else: + res = subscriber.safe_psql(query) + + assert res is not None + + res_lsn = res[0][0] + log.info(f"subscriber_lsn={res_lsn}") + subscriber_lsn = Lsn(res_lsn) + log.info(f"Subscriber LSN={subscriber_lsn}, publisher LSN={publisher_lsn}") + assert subscriber_lsn >= publisher_lsn + + wait_until(subscriber_catch_up) def tenant_get_shards( diff --git a/test_runner/performance/test_logical_replication.py b/test_runner/performance/test_logical_replication.py index 9d653d1a1e..fdc56cc496 100644 --- a/test_runner/performance/test_logical_replication.py +++ b/test_runner/performance/test_logical_replication.py @@ -44,13 +44,13 @@ def test_logical_replication(neon_simple_env: NeonEnv, pg_bin: PgBin, vanilla_pg vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1") # Wait logical replication channel to be established - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") pg_bin.run_capture(["pgbench", "-c10", "-T100", "-Mprepared", endpoint.connstr()]) # Wait logical replication to sync start = time.time() - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") log.info(f"Sync with master took {time.time() - start} seconds") sum_master = cast("int", endpoint.safe_psql("select sum(abalance) from pgbench_accounts")[0][0]) diff --git a/test_runner/regress/test_compute_catalog.py b/test_runner/regress/test_compute_catalog.py index f0878b2631..50a922a616 100644 --- a/test_runner/regress/test_compute_catalog.py +++ b/test_runner/regress/test_compute_catalog.py @@ -183,6 +183,7 @@ def test_dropdb_with_subscription(neon_simple_env: NeonEnv): cursor.execute("select pg_catalog.pg_create_logical_replication_slot('mysub', 'pgoutput');") cursor.execute("CREATE TABLE t(a int)") cursor.execute("INSERT INTO t VALUES (1)") + cursor.execute("CHECKPOINT") # connect to the subscriber_db and create a subscription # Note that we need to create subscription with @@ -195,7 +196,11 @@ def test_dropdb_with_subscription(neon_simple_env: NeonEnv): # wait for the subscription to be active logical_replication_sync( - endpoint, endpoint, sub_dbname="subscriber_db", pub_dbname="publisher_db" + endpoint, + endpoint, + "mysub", + sub_dbname="subscriber_db", + pub_dbname="publisher_db", ) # Check that replication is working diff --git a/test_runner/regress/test_layer_bloating.py b/test_runner/regress/test_layer_bloating.py index d9043fef7f..0260704ebf 100644 --- a/test_runner/regress/test_layer_bloating.py +++ b/test_runner/regress/test_layer_bloating.py @@ -63,7 +63,7 @@ def test_layer_bloating(neon_env_builder: NeonEnvBuilder, vanilla_pg): cur.execute("set statement_timeout=0") cur.execute("select create_snapshots(10000)") # Wait logical replication to sync - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, 
timeline) env.pageserver.http_client().timeline_checkpoint(env.initial_tenant, timeline, compact=False) diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py index 8908763109..3a92f0d1d1 100644 --- a/test_runner/regress/test_logical_replication.py +++ b/test_runner/regress/test_logical_replication.py @@ -55,13 +55,13 @@ def test_logical_replication(neon_simple_env: NeonEnv, vanilla_pg: VanillaPostgr vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1") # Wait logical replication channel to be established - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") # insert some data cur.execute("insert into t values (generate_series(1,1000), 0)") # Wait logical replication to sync - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") assert vanilla_pg.safe_psql("select count(*) from t")[0][0] == 1000 # now stop subscriber... @@ -78,7 +78,7 @@ def test_logical_replication(neon_simple_env: NeonEnv, vanilla_pg: VanillaPostgr vanilla_pg.start() # Wait logical replication to sync - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") # Check that subscribers receives all data assert vanilla_pg.safe_psql("select count(*) from t")[0][0] == 2000 @@ -148,7 +148,7 @@ COMMIT; endpoint.start() vanilla_pg.start() - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") eq_q = "select testcolumn1, testcolumn2, testcolumn3 from replication_example order by 1, 2, 3" assert vanilla_pg.safe_psql(eq_q) == endpoint.safe_psql(eq_q) log.info("rewriteheap synced") @@ -285,7 +285,7 @@ FROM generate_series(1, 16384) AS seq; -- Inserts enough rows to exceed 16MB of vanilla_pg.safe_psql("create table t(a int)") connstr = endpoint.connstr().replace("'", "''") vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub") - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") vanilla_pg.stop() @@ -321,13 +321,13 @@ FROM generate_series(1, 16384) AS seq; -- Inserts enough rows to exceed 16MB of sk_http = sk.http_client() sk_http.configure_failpoints([("sk-pause-send", "off")]) - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") assert [r[0] for r in vanilla_pg.safe_psql("select * from t")] == [1, 2] # Check that local reads also work with endpoint.connect().cursor() as cur: cur.execute("insert into t values (3)") - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") assert [r[0] for r in vanilla_pg.safe_psql("select * from t")] == [1, 2, 3] log_path = vanilla_pg.pgdatadir / "pg.log" @@ -365,7 +365,7 @@ def test_restart_endpoint(neon_simple_env: NeonEnv, vanilla_pg: VanillaPostgres) log.info(f"ep connstr is {endpoint.connstr()}, subscriber connstr {vanilla_pg.connstr()}") connstr = endpoint.connstr().replace("'", "''") vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1") - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") vanilla_pg.stop() wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) @@ -375,7 +375,7 @@ def test_restart_endpoint(neon_simple_env: NeonEnv, vanilla_pg: VanillaPostgres) # this should flush current wal page cur.execute("insert into 
replication_example values (3, 4)") vanilla_pg.start() - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") assert vanilla_pg.safe_psql( "select sum(somedata) from replication_example" ) == endpoint.safe_psql("select sum(somedata) from replication_example") @@ -409,18 +409,18 @@ def test_large_records(neon_simple_env: NeonEnv, vanilla_pg: VanillaPostgres): # Test simple insert, update, delete. But with very large values value = random_string(10_000_000) cur.execute(f"INSERT INTO reptbl VALUES (1, '{value}')") - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") assert vanilla_pg.safe_psql("select id, largeval from reptbl") == [(1, value)] # Test delete, and reinsert another value cur.execute("DELETE FROM reptbl WHERE id = 1") cur.execute(f"INSERT INTO reptbl VALUES (2, '{value}')") - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") assert vanilla_pg.safe_psql("select id, largeval from reptbl") == [(2, value)] value = random_string(10_000_000) cur.execute(f"UPDATE reptbl SET largeval='{value}'") - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") assert vanilla_pg.safe_psql("select id, largeval from reptbl") == [(2, value)] endpoint.stop() @@ -428,7 +428,7 @@ def test_large_records(neon_simple_env: NeonEnv, vanilla_pg: VanillaPostgres): cur = endpoint.connect().cursor() value = random_string(10_000_000) cur.execute(f"UPDATE reptbl SET largeval='{value}'") - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") assert vanilla_pg.safe_psql("select id, largeval from reptbl") == [(2, value)] @@ -608,7 +608,7 @@ def test_subscriber_synchronous_commit(neon_simple_env: NeonEnv, vanilla_pg: Van for i in range(0, 1000): pcur.execute("INSERT into t values (%s, random()*100000)", (i,)) # wait until sub receives all data - logical_replication_sync(sub, vanilla_pg) + logical_replication_sync(sub, vanilla_pg, "sub") # Update confirmed_flush_lsn of the slot. If subscriber ack'ed recevied data # as flushed we'll now lose it if subscriber restars. 
That's why # logical_replication_wait_flush_lsn_sync is expected to hang while diff --git a/test_runner/regress/test_physical_and_logical_replicaiton.py b/test_runner/regress/test_physical_and_logical_replicaiton.py index 3f9824ee67..229439106b 100644 --- a/test_runner/regress/test_physical_and_logical_replicaiton.py +++ b/test_runner/regress/test_physical_and_logical_replicaiton.py @@ -43,7 +43,7 @@ def test_physical_and_logical_replication_slot_not_copied(neon_simple_env: NeonE s_cur.execute("select count(*) from t") assert s_cur.fetchall()[0][0] == n_records - logical_replication_sync(vanilla_pg, primary) + logical_replication_sync(vanilla_pg, primary, "sub1") assert vanilla_pg.safe_psql("select count(*) from t")[0][0] == n_records # Check that LR slot is not copied to replica @@ -87,7 +87,7 @@ def test_aux_not_logged_at_replica(neon_simple_env: NeonEnv, vanilla_pg): s_con = secondary.connect() s_cur = s_con.cursor() - logical_replication_sync(vanilla_pg, primary) + logical_replication_sync(vanilla_pg, primary, "sub1") assert vanilla_pg.safe_psql("select count(*) from t")[0][0] == n_records s_cur.execute("select count(*) from t") diff --git a/test_runner/regress/test_subscriber_branching.py b/test_runner/regress/test_subscriber_branching.py index 645572da8e..849d4f024d 100644 --- a/test_runner/regress/test_subscriber_branching.py +++ b/test_runner/regress/test_subscriber_branching.py @@ -3,7 +3,7 @@ from __future__ import annotations import time from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, logical_replication_sync +from fixtures.neon_fixtures import NeonEnv from fixtures.utils import query_scalar, wait_until @@ -208,7 +208,6 @@ def test_subscriber_branching(neon_simple_env: NeonEnv): # wake the sub and ensure that it catches up with the new data sub.start(create_test_user=True) with sub.cursor(dbname="neondb", user="test", password="testpwd") as scur: - logical_replication_sync(sub, pub) wait_until(check_that_changes_propagated) scur.execute("SELECT count(*) FROM t") res = scur.fetchall() From 23ca8b061ba4c3a53b38d3f49d3ae7be1ed696f8 Mon Sep 17 00:00:00 2001 From: Fedor Dikarev Date: Mon, 3 Feb 2025 13:55:48 +0100 Subject: [PATCH 351/583] Use actions/checkout for checkout (#10630) ## Problem 1. First of all it's more correct 2. Current usage allows ` Time-of-Check-Time-of-Use (TOCTOU) 'Pwn Request' vulnerabilities`. Please check security slack channel or reach me for more details. I will update PR description after merge. ## Summary of changes 1. 
Use `actions/checkout` with `ref: ${{ github.event.pull_request.head.sha }}` Discovered by and Co-author: @varunsh-coder --- .github/workflows/approved-for-ci-run.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/approved-for-ci-run.yml b/.github/workflows/approved-for-ci-run.yml index 0a0898d30c..fc2f36c74b 100644 --- a/.github/workflows/approved-for-ci-run.yml +++ b/.github/workflows/approved-for-ci-run.yml @@ -94,7 +94,9 @@ jobs: echo "LABELS_TO_ADD=${LABELS_TO_ADD}" >> ${GITHUB_OUTPUT} echo "LABELS_TO_REMOVE=${LABELS_TO_REMOVE}" >> ${GITHUB_OUTPUT} - - run: gh pr checkout "${PR_NUMBER}" + - uses: actions/checkout@v4 + with: + ref: ${{ github.event.pull_request.head.sha }} - run: git checkout -b "${BRANCH}" From e617a3a075f4debfa2b50a280a50475f3abed32c Mon Sep 17 00:00:00 2001 From: Em Sharnoff Date: Mon, 3 Feb 2025 05:34:11 -0800 Subject: [PATCH 352/583] vm-monitor: Improve error display (#10542) Logging errors with the debug format specifier causes multi-line errors, which are sometimes a pain to deal with. Instead, we should use anyhow's alternate display format, which shows the same information on a single line. Also adjusted a couple of error messages that were stale. Fixes neondatabase/cloud#14710. --- libs/vm_monitor/src/filecache.rs | 6 +++--- libs/vm_monitor/src/lib.rs | 14 +++++++------- libs/vm_monitor/src/runner.rs | 12 ++++++++---- 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/libs/vm_monitor/src/filecache.rs b/libs/vm_monitor/src/filecache.rs index fe71e11197..4f5bf1c1e3 100644 --- a/libs/vm_monitor/src/filecache.rs +++ b/libs/vm_monitor/src/filecache.rs @@ -177,8 +177,8 @@ impl FileCacheState { crate::spawn_with_cancel( token, |res| { - if let Err(error) = res { - error!(%error, "postgres error") + if let Err(e) = res { + error!(error = format_args!("{e:#}"), "postgres error"); } }, conn, @@ -205,7 +205,7 @@ impl FileCacheState { { Ok(rows) => Ok(rows), Err(e) => { - error!(error = ?e, "postgres error: {e} -> retrying"); + error!(error = format_args!("{e:#}"), "postgres error -> retrying"); let client = FileCacheState::connect(&self.conn_str, self.token.clone()) .await diff --git a/libs/vm_monitor/src/lib.rs b/libs/vm_monitor/src/lib.rs index 1b13c8e0b2..0cd97d4ca1 100644 --- a/libs/vm_monitor/src/lib.rs +++ b/libs/vm_monitor/src/lib.rs @@ -191,15 +191,12 @@ async fn start_monitor( .await; let mut monitor = match monitor { Ok(Ok(monitor)) => monitor, - Ok(Err(error)) => { - error!(?error, "failed to create monitor"); + Ok(Err(e)) => { + error!(error = format_args!("{e:#}"), "failed to create monitor"); return; } Err(_) => { - error!( - ?timeout, - "creating monitor timed out (probably waiting to receive protocol range)" - ); + error!(?timeout, "creating monitor timed out"); return; } }; @@ -207,6 +204,9 @@ async fn start_monitor( match monitor.run().await { Ok(()) => info!("monitor was killed due to new connection"), - Err(e) => error!(error = ?e, "monitor terminated unexpectedly"), + Err(e) => error!( + error = format_args!("{e:#}"), + "monitor terminated unexpectedly" + ), } } diff --git a/libs/vm_monitor/src/runner.rs b/libs/vm_monitor/src/runner.rs index 8605314ba9..8839f5803f 100644 --- a/libs/vm_monitor/src/runner.rs +++ b/libs/vm_monitor/src/runner.rs @@ -370,12 +370,16 @@ impl Runner { }), InboundMsgKind::InvalidMessage { error } => { warn!( - %error, id, "received notification of an invalid message we sent" + error = format_args!("{error:#}"), + id, "received notification of an invalid message we sent" ); 
Ok(None) } InboundMsgKind::InternalError { error } => { - warn!(error, id, "agent experienced an internal error"); + warn!( + error = format_args!("{error:#}"), + id, "agent experienced an internal error" + ); Ok(None) } InboundMsgKind::HealthCheck {} => { @@ -476,7 +480,7 @@ impl Runner { // gives the outermost cause, and the debug impl // pretty-prints the error, whereas {:#} contains all the // causes, but is compact (no newlines). - warn!(error = format!("{e:#}"), "error handling message"); + warn!(error = format_args!("{e:#}"), "error handling message"); OutboundMsg::new( OutboundMsgKind::InternalError { error: e.to_string(), @@ -492,7 +496,7 @@ impl Runner { .context("failed to send message")?; } Err(e) => warn!( - error = format!("{e}"), + error = format_args!("{e:#}"), msg = ?msg, "received error message" ), From 43682624b5548136f321254e88cd9b7f7453ae61 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 3 Feb 2025 13:41:41 +0000 Subject: [PATCH 353/583] CI(pg-clients): fix logical replication tests (#10623) ## Problem Tests for logical replication (on Staging) have been failing for some time because logical replication is not enabled for them. This issue occurred after switching to an org API key with a different default setting, where logical replication was not enabled by default. ## Summary of changes - Add `enable_logical_replication` input to `actions/neon-project-create` - Enable logical replication in `test-logical-replication` job --- .../actions/neon-project-create/action.yml | 12 +++++++---- .github/workflows/pg-clients.yml | 6 ++++-- test_runner/logical_repl/README.md | 20 +++++++++++-------- 3 files changed, 24 insertions(+), 14 deletions(-) diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml index 11f46bce8e..c9f6b0832e 100644 --- a/.github/actions/neon-project-create/action.yml +++ b/.github/actions/neon-project-create/action.yml @@ -41,7 +41,10 @@ inputs: description: 'Path to directory containing libpq library - it is caller responsibility to provision the libpq library' required: false default: '/tmp/neon/pg_install/v16/lib' - + project_settings: + description: 'A JSON object with project settings' + required: false + default: '{}' outputs: dsn: @@ -73,7 +76,7 @@ runs: \"provisioner\": \"k8s-neonvm\", \"autoscaling_limit_min_cu\": ${MIN_CU}, \"autoscaling_limit_max_cu\": ${MAX_CU}, - \"settings\": { } + \"settings\": ${PROJECT_SETTINGS} } }") @@ -92,12 +95,12 @@ runs: if [ "${SHARD_SPLIT_PROJECT}" = "true" ]; then # determine tenant ID TENANT_ID=`${PSQL} ${dsn} -t -A -c "SHOW neon.tenant_id"` - + echo "Splitting project ${project_id} with tenant_id ${TENANT_ID} into $((SHARD_COUNT)) shards with stripe size $((STRIPE_SIZE))" echo "Sending PUT request to https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/storage/proxy/control/v1/tenant/${TENANT_ID}/shard_split" echo "with body {\"new_shard_count\": $((SHARD_COUNT)), \"new_stripe_size\": $((STRIPE_SIZE))}" - + # we need an ADMIN API KEY to invoke storage controller API for shard splitting (bash -u above checks that the variable is set) curl -X PUT \ "https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/storage/proxy/control/v1/tenant/${TENANT_ID}/shard_split" \ @@ -118,3 +121,4 @@ runs: STRIPE_SIZE: ${{ inputs.stripe_size }} PSQL: ${{ inputs.psql_path }} LD_LIBRARY_PATH: ${{ inputs.libpq_lib_path }} + PROJECT_SETTINGS: ${{ inputs.project_settings }} diff --git a/.github/workflows/pg-clients.yml b/.github/workflows/pg-clients.yml index 
4947907eb0..abc90c7fe1 100644 --- a/.github/workflows/pg-clients.yml +++ b/.github/workflows/pg-clients.yml @@ -12,8 +12,8 @@ on: pull_request: paths: - '.github/workflows/pg-clients.yml' - - 'test_runner/pg_clients/**' - - 'test_runner/logical_repl/**' + - 'test_runner/pg_clients/**/*.py' + - 'test_runner/logical_repl/**/*.py' - 'poetry.lock' workflow_dispatch: @@ -104,6 +104,8 @@ jobs: with: api_key: ${{ secrets.NEON_STAGING_API_KEY }} postgres_version: ${{ env.DEFAULT_PG_VERSION }} + project_settings: >- + {"enable_logical_replication": true} - name: Run tests uses: ./.github/actions/run-python-test-set diff --git a/test_runner/logical_repl/README.md b/test_runner/logical_repl/README.md index 8eca056dda..449e56e21d 100644 --- a/test_runner/logical_repl/README.md +++ b/test_runner/logical_repl/README.md @@ -1,13 +1,18 @@ # Logical replication tests +> [!NOTE] +> Neon project should have logical replication enabled: +> +> https://neon.tech/docs/guides/logical-replication-postgres#enable-logical-replication-in-the-source-neon-project + ## Clickhouse ```bash export BENCHMARK_CONNSTR=postgres://user:pass@ep-abc-xyz-123.us-east-2.aws.neon.build/neondb -docker compose -f clickhouse/docker-compose.yml up -d -pytest -m remote_cluster -k test_clickhouse -docker compose -f clickhouse/docker-compose.yml down +docker compose -f test_runner/logical_repl/clickhouse/docker-compose.yml up -d +./scripts/pytest -m remote_cluster -k test_clickhouse +docker compose -f test_runner/logical_repl/clickhouse/docker-compose.yml down ``` ## Debezium @@ -15,8 +20,7 @@ docker compose -f clickhouse/docker-compose.yml down ```bash export BENCHMARK_CONNSTR=postgres://user:pass@ep-abc-xyz-123.us-east-2.aws.neon.build/neondb -docker compose -f debezium/docker-compose.yml up -d -pytest -m remote_cluster -k test_debezium -docker compose -f debezium/docker-compose.yml down - -``` \ No newline at end of file +docker compose -f test_runner/logical_repl/debezium/docker-compose.yml up -d +./scripts/pytest -m remote_cluster -k test_debezium +docker compose -f test_runner/logical_repl/debezium/docker-compose.yml down +``` From 628a9616c4f0836a8d06dce34f8b4a525a5d7985 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Mon, 3 Feb 2025 15:12:41 +0100 Subject: [PATCH 354/583] fix(proxy): Don't use --is-private-access-proxy to disable IP check (#10633) ## Problem * The behavior of this flag changed. Plus, it's not necessary to disable the IP check as long as there are no IPs listed in the local postgres. ## Summary of changes * Drop the flag from the command in the README.md section. * Change the postgres URL passed to proxy to not use the endpoint hostname. * Also swap postgres creation and proxy startup, so the DB is running when proxy comes up. --- proxy/README.md | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/proxy/README.md b/proxy/README.md index 4b98342d72..ecd54fbbd8 100644 --- a/proxy/README.md +++ b/proxy/README.md @@ -106,17 +106,7 @@ cases where it is hard to use rows represented as objects (e.g. when several fie Proxy determines project name from the subdomain, request to the `round-rice-566201.somedomain.tld` will be routed to the project named `round-rice-566201`. Unfortunately, `/etc/hosts` does not support domain wildcards, so we can use *.localtest.me` which resolves to `127.0.0.1`. 
-Let's create self-signed certificate by running: -```sh -openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key -subj "/CN=*.localtest.me" -``` - -Then we need to build proxy with 'testing' feature and run, e.g.: -```sh -RUST_LOG=proxy cargo run -p proxy --bin proxy --features testing -- --auth-backend postgres --auth-endpoint 'postgresql://proxy:password@endpoint.localtest.me:5432/postgres' --is-private-access-proxy true -c server.crt -k server.key -``` - -We will also need to have a postgres instance. Assuming that we have setted up docker we can set it up as follows: +We will need to have a postgres instance. Assuming that we have set up docker we can set it up as follows: ```sh docker run \ --detach \ @@ -133,8 +123,18 @@ docker exec -it proxy-postgres psql -U postgres -c "CREATE TABLE neon_control_pl docker exec -it proxy-postgres psql -U postgres -c "CREATE ROLE proxy WITH SUPERUSER LOGIN PASSWORD 'password';" ``` +Let's create self-signed certificate by running: +```sh +openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key -subj "/CN=*.localtest.me" +``` + +Then we need to build proxy with 'testing' feature and run, e.g.: +```sh +RUST_LOG=proxy cargo run -p proxy --bin proxy --features testing -- --auth-backend postgres --auth-endpoint 'postgresql://postgres:proxy-postgres@127.0.0.1:5432/postgres' -c server.crt -k server.key +``` + Now from client you can start a new session: ```sh PGSSLROOTCERT=./server.crt psql "postgresql://proxy:password@endpoint.localtest.me:4432/postgres?sslmode=verify-full" -``` \ No newline at end of file +``` From c774f0a14710846c65f6b82b3838c1496d03af8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 3 Feb 2025 19:21:01 +0100 Subject: [PATCH 355/583] storcon db: allow accepting any TLS certificate (#10640) We encountered some TLS validation errors for the storcon since applying #10614. Add an option to downgrade them to logged errors instead to allow us to debug with more peace. cc issue https://github.com/neondatabase/cloud/issues/23583 --- storage_controller/src/persistence.rs | 93 +++++++++++++++++++++++++-- 1 file changed, 88 insertions(+), 5 deletions(-) diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 880f203064..45f3108d6b 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -27,6 +27,8 @@ use pageserver_api::shard::ShardConfigError; use pageserver_api::shard::ShardIdentity; use pageserver_api::shard::ShardStripeSize; use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId}; +use rustls::client::danger::ServerCertVerifier; +use rustls::client::WebPkiServerVerifier; use rustls::crypto::ring; use scoped_futures::ScopedBoxFuture; use serde::{Deserialize, Serialize}; @@ -1281,14 +1283,95 @@ pub(crate) fn load_certs() -> anyhow::Result> { /// Loads the root certificates and constructs a client config suitable for connecting. /// This function is blocking. 
-pub fn client_config_with_root_certs() -> anyhow::Result { - Ok( +fn client_config_with_root_certs() -> anyhow::Result { + let client_config = rustls::ClientConfig::builder_with_provider(Arc::new(ring::default_provider())) .with_safe_default_protocol_versions() - .expect("ring should support the default protocol versions") + .expect("ring should support the default protocol versions"); + static DO_CERT_CHECKS: std::sync::OnceLock = std::sync::OnceLock::new(); + let do_cert_checks = + DO_CERT_CHECKS.get_or_init(|| std::env::var("STORCON_CERT_CHECKS").is_ok()); + Ok(if *do_cert_checks { + client_config .with_root_certificates(load_certs()?) - .with_no_client_auth(), - ) + .with_no_client_auth() + } else { + use rustls::client::danger::{HandshakeSignatureValid, ServerCertVerified}; + #[derive(Debug)] + struct AcceptAll(Arc); + impl ServerCertVerifier for AcceptAll { + fn verify_server_cert( + &self, + end_entity: &rustls::pki_types::CertificateDer<'_>, + intermediates: &[rustls::pki_types::CertificateDer<'_>], + server_name: &rustls::pki_types::ServerName<'_>, + ocsp_response: &[u8], + now: rustls::pki_types::UnixTime, + ) -> Result { + let r = self.0.verify_server_cert( + end_entity, + intermediates, + server_name, + ocsp_response, + now, + ); + if let Err(err) = r { + tracing::info!( + ?server_name, + "ignoring db connection TLS validation error: {err:?}" + ); + return Ok(ServerCertVerified::assertion()); + } + r + } + fn verify_tls12_signature( + &self, + message: &[u8], + cert: &rustls::pki_types::CertificateDer<'_>, + dss: &rustls::DigitallySignedStruct, + ) -> Result + { + let r = self.0.verify_tls12_signature(message, cert, dss); + if let Err(err) = r { + tracing::info!( + "ignoring db connection 1.2 signature TLS validation error: {err:?}" + ); + return Ok(HandshakeSignatureValid::assertion()); + } + r + } + fn verify_tls13_signature( + &self, + message: &[u8], + cert: &rustls::pki_types::CertificateDer<'_>, + dss: &rustls::DigitallySignedStruct, + ) -> Result + { + let r = self.0.verify_tls13_signature(message, cert, dss); + if let Err(err) = r { + tracing::info!( + "ignoring db connection 1.3 signature TLS validation error: {err:?}" + ); + return Ok(HandshakeSignatureValid::assertion()); + } + r + } + fn supported_verify_schemes(&self) -> Vec { + self.0.supported_verify_schemes() + } + } + let verifier = AcceptAll( + WebPkiServerVerifier::builder_with_provider( + load_certs()?, + Arc::new(ring::default_provider()), + ) + .build()?, + ); + client_config + .dangerous() + .with_custom_certificate_verifier(Arc::new(verifier)) + .with_no_client_auth() + }) } fn establish_connection_rustls(config: &str) -> BoxFuture> { From 715e20343a64a6194feffba0e67d4f6a47d4c83a Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 3 Feb 2025 19:01:16 +0000 Subject: [PATCH 356/583] storage controller: improve scheduling of tenants created in PlacementPolicy::Secondary (#10590) ## Problem I noticed when onboarding lots of tenants that the AZ scheduling violation stat was climbing, before falling later as optimisations happened. This was happening because we first add the tenant with PlacementPolicy::Secondary, and then later go to PlacementPolicy::Attached, and the scheduler's behavior led to a bad AZ choice: 1. Create a secondary location in the non-preferred AZ 2. Upgrade to Attached where we promote that non-preferred-AZ location to attached and then create another secondary 3. 
Optimiser later realises we're in the wrong AZ and moves us ## Summary of changes - Extend some logging to give more information about AZs - When scheduling secondary location in PlacementPolicy::Secondary, select it as if we were attached: in this mode, our business goal is to have a warm pageserver location that we can make available as attached quickly if needed, therefore we want it to be in the preferred AZ. - Make optimize_secondary logic the same, so that it will consider a secondary location in the preferred AZ to be optimal when in PlacementPolicy::Secondary - When transitioning to from PlacementPolicy::Attached(N) to PlacementPolicy::Secondary, instead of arbitrarily picking a location to keep, prefer to keep the location in the preferred AZ --- storage_controller/src/scheduler.rs | 5 +- storage_controller/src/tenant_shard.rs | 155 ++++++++++++++++-- .../regress/test_storage_controller.py | 7 + 3 files changed, 155 insertions(+), 12 deletions(-) diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs index f5cab9dd57..f9e72862ae 100644 --- a/storage_controller/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -774,8 +774,9 @@ impl Scheduler { if !matches!(context.mode, ScheduleMode::Speculative) { tracing::info!( - "scheduler selected node {node_id} (elegible nodes {:?}, hard exclude: {hard_exclude:?}, soft exclude: {context:?})", - scores.iter().map(|i| i.node_id().0).collect::>() + "scheduler selected node {node_id} (elegible nodes {:?}, hard exclude: {hard_exclude:?}, soft exclude: {context:?}, preferred_az: {:?})", + scores.iter().map(|i| i.node_id().0).collect::>(), + preferred_az, ); } diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index d344e27e31..302104dc97 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -710,7 +710,11 @@ impl TenantShard { modified = true; } else if self.intent.secondary.is_empty() { // Populate secondary by scheduling a fresh node - let node_id = scheduler.schedule_shard::( + // + // We use [`AttachedShardTag`] because when a secondary location is the only one + // a shard has, we expect that its next use will be as an attached location: we want + // the tenant to be ready to warm up and run fast in their preferred AZ. + let node_id = scheduler.schedule_shard::( &[], &self.intent.preferred_az_id, context, @@ -719,9 +723,17 @@ impl TenantShard { modified = true; } while self.intent.secondary.len() > 1 { - // We have no particular preference for one secondary location over another: just - // arbitrarily drop from the end - self.intent.pop_secondary(scheduler); + // If we have multiple secondaries (e.g. when transitioning from Attached to Secondary and + // having just demoted our attached location), then we should prefer to keep the location + // in our preferred AZ. Tenants in Secondary mode want to be in the preferred AZ so that + // they have a warm location to become attached when transitioning back into Attached. 
+ + let mut candidates = self.intent.get_secondary().clone(); + // Sort to get secondaries outside preferred AZ last + candidates + .sort_by_key(|n| scheduler.get_node_az(n).as_ref() != self.preferred_az()); + let secondary_to_remove = candidates.pop().unwrap(); + self.intent.remove_secondary(scheduler, secondary_to_remove); modified = true; } } @@ -1079,12 +1091,31 @@ impl TenantShard { None => vec![], }; - let replacement = self.find_better_location::( - scheduler, - &schedule_context, - *secondary, - &exclude, - ); + let replacement = match &self.policy { + PlacementPolicy::Attached(_) => { + // Secondaries for an attached shard should be scheduled using `SecondaryShardTag` + // to avoid placing them in the preferred AZ. + self.find_better_location::( + scheduler, + &schedule_context, + *secondary, + &exclude, + ) + } + PlacementPolicy::Secondary => { + // In secondary-only mode, we want our secondary locations in the preferred AZ, + // so that they're ready to take over as an attached location when we transition + // into PlacementPolicy::Attached. + self.find_better_location::( + scheduler, + &schedule_context, + *secondary, + &exclude, + ) + } + PlacementPolicy::Detached => None, + }; + assert!(replacement != Some(*secondary)); if let Some(replacement) = replacement { // We have found a candidate and confirmed that its score is preferable @@ -2687,4 +2718,108 @@ pub(crate) mod tests { } Ok(()) } + + /// Check how the shard's scheduling behaves when in PlacementPolicy::Secondary mode. + #[test] + fn tenant_secondary_scheduling() -> anyhow::Result<()> { + let az_a = AvailabilityZone("az-a".to_string()); + let nodes = make_test_nodes( + 3, + &[ + az_a.clone(), + AvailabilityZone("az-b".to_string()), + AvailabilityZone("az-c".to_string()), + ], + ); + + let mut scheduler = Scheduler::new(nodes.values()); + let mut context = ScheduleContext::default(); + + let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Secondary); + tenant_shard.intent.preferred_az_id = Some(az_a.clone()); + tenant_shard + .schedule(&mut scheduler, &mut context) + .expect("we have enough nodes, scheduling should work"); + assert_eq!(tenant_shard.intent.secondary.len(), 1); + assert!(tenant_shard.intent.attached.is_none()); + + // Should have scheduled into the preferred AZ + assert_eq!( + scheduler + .get_node_az(&tenant_shard.intent.secondary[0]) + .as_ref(), + tenant_shard.preferred_az() + ); + + // Optimizer should agree + assert_eq!( + tenant_shard.optimize_attachment(&mut scheduler, &context), + None + ); + assert_eq!( + tenant_shard.optimize_secondary(&mut scheduler, &context), + None + ); + + // Switch to PlacementPolicy::Attached + tenant_shard.policy = PlacementPolicy::Attached(1); + tenant_shard + .schedule(&mut scheduler, &mut context) + .expect("we have enough nodes, scheduling should work"); + assert_eq!(tenant_shard.intent.secondary.len(), 1); + assert!(tenant_shard.intent.attached.is_some()); + // Secondary should now be in non-preferred AZ + assert_ne!( + scheduler + .get_node_az(&tenant_shard.intent.secondary[0]) + .as_ref(), + tenant_shard.preferred_az() + ); + // Attached should be in preferred AZ + assert_eq!( + scheduler + .get_node_az(&tenant_shard.intent.attached.unwrap()) + .as_ref(), + tenant_shard.preferred_az() + ); + + // Optimizer should agree + assert_eq!( + tenant_shard.optimize_attachment(&mut scheduler, &context), + None + ); + assert_eq!( + tenant_shard.optimize_secondary(&mut scheduler, &context), + None + ); + + // Switch back to PlacementPolicy::Secondary + 
tenant_shard.policy = PlacementPolicy::Secondary; + tenant_shard + .schedule(&mut scheduler, &mut context) + .expect("we have enough nodes, scheduling should work"); + assert_eq!(tenant_shard.intent.secondary.len(), 1); + assert!(tenant_shard.intent.attached.is_none()); + // When we picked a location to keep, we should have kept the one in the preferred AZ + assert_eq!( + scheduler + .get_node_az(&tenant_shard.intent.secondary[0]) + .as_ref(), + tenant_shard.preferred_az() + ); + + // Optimizer should agree + assert_eq!( + tenant_shard.optimize_attachment(&mut scheduler, &context), + None + ); + assert_eq!( + tenant_shard.optimize_secondary(&mut scheduler, &context), + None + ); + + tenant_shard.intent.clear(&mut scheduler); + + Ok(()) + } } diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 350fe31099..11a4d09202 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -373,6 +373,7 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up but imports the generation number. """ + neon_env_builder.num_azs = 3 env, origin_ps, tenant_id, generation = prepare_onboarding_env(neon_env_builder) virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) @@ -409,6 +410,9 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up "node_secondary" ][0] + # Check that the secondary's scheduling is stable + assert env.storage_controller.reconcile_all() == 0 + # Call into storage controller to onboard the tenant generation += 1 r = virtual_ps_http.tenant_location_conf( @@ -460,6 +464,9 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up ) assert len(r["shards"]) == 1 + # Check that onboarding did not result in an unstable scheduling state + assert env.storage_controller.reconcile_all() == 0 + # We should see the tenant is now attached to the pageserver managed # by the sharding service origin_tenants = origin_ps.http_client().tenant_list() From 06b45fd0fd38515cd431aab0d9baae7e13a52058 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 3 Feb 2025 20:23:12 +0100 Subject: [PATCH 357/583] utils/logging: add `critical!` macro and metric (#10641) ## Problem We don't currently have good alerts for critical errors, e.g. data loss/corruption. Touches #10094. ## Summary of changes Add a `critical!` macro and corresponding `libmetrics_tracing_event_count{level="critical"}` metric. This will: * Emit an `ERROR` log message with prefix `"CRITICAL:"` and a backtrace. * Increment `libmetrics_tracing_event_count{level="critical"}`, and indirectly `level="error"`. * Trigger a pageable alert (via the metric above). * In debug builds, panic the process. I'll add uses of the macro separately. --- libs/utils/src/logging.rs | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs index e205d60d74..753f05b6fd 100644 --- a/libs/utils/src/logging.rs +++ b/libs/utils/src/logging.rs @@ -5,6 +5,24 @@ use metrics::{IntCounter, IntCounterVec}; use once_cell::sync::Lazy; use strum_macros::{EnumString, VariantNames}; +/// Logs a critical error, similarly to `tracing::error!`. This will: +/// +/// * Emit an ERROR log message with prefix "CRITICAL:" and a backtrace. +/// * Increment libmetrics_tracing_event_count{level="critical"}, and indirectly level="error". 
+/// * Trigger a pageable alert (via the metric above). +/// * In debug builds, panic the process. +#[macro_export] +macro_rules! critical { + ($($arg:tt)*) => { + if cfg!(debug_assertions) { + panic!($($arg)*); + } + $crate::logging::TRACING_EVENT_COUNT_METRIC.inc_critical(); + let backtrace = std::backtrace::Backtrace::capture(); + tracing::error!("CRITICAL: {}\n{backtrace}", format!($($arg)*)); + }; +} + #[derive(EnumString, strum_macros::Display, VariantNames, Eq, PartialEq, Debug, Clone, Copy)] #[strum(serialize_all = "snake_case")] pub enum LogFormat { @@ -25,7 +43,10 @@ impl LogFormat { } } -struct TracingEventCountMetric { +pub struct TracingEventCountMetric { + /// CRITICAL is not a `tracing` log level. Instead, we increment it in the `critical!` macro, + /// and also emit it as a regular error. These are thus double-counted, but that seems fine. + critical: IntCounter, error: IntCounter, warn: IntCounter, info: IntCounter, @@ -33,7 +54,7 @@ struct TracingEventCountMetric { trace: IntCounter, } -static TRACING_EVENT_COUNT_METRIC: Lazy = Lazy::new(|| { +pub static TRACING_EVENT_COUNT_METRIC: Lazy = Lazy::new(|| { let vec = metrics::register_int_counter_vec!( "libmetrics_tracing_event_count", "Number of tracing events, by level", @@ -46,6 +67,7 @@ static TRACING_EVENT_COUNT_METRIC: Lazy = Lazy::new(|| impl TracingEventCountMetric { fn new(vec: IntCounterVec) -> Self { Self { + critical: vec.with_label_values(&["critical"]), error: vec.with_label_values(&["error"]), warn: vec.with_label_values(&["warn"]), info: vec.with_label_values(&["info"]), @@ -54,6 +76,11 @@ impl TracingEventCountMetric { } } + // Allow public access from `critical!` macro. + pub fn inc_critical(&self) { + self.critical.inc(); + } + fn inc_for_level(&self, level: tracing::Level) { let counter = match level { tracing::Level::ERROR => &self.error, From d80cbb244312aacfacbc5b4b8b26efbe63752a56 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 3 Feb 2025 19:42:40 +0000 Subject: [PATCH 358/583] build(deps): bump openssl from 0.10.66 to 0.10.70 in /test_runner/pg_clients/rust/tokio-postgres in the cargo group across 1 directory (#10642) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- test_runner/pg_clients/rust/tokio-postgres/Cargo.lock | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock index 354fc15745..0b138bf167 100644 --- a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock +++ b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock @@ -421,9 +421,9 @@ checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" [[package]] name = "openssl" -version = "0.10.66" +version = "0.10.70" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9529f4786b70a3e8c61e11179af17ab6188ad8d0ded78c5529441ed39d4bd9c1" +checksum = "61cfb4e166a8bb8c9b55c500bc2308550148ece889be90f609377e58140f42c6" dependencies = [ "bitflags 2.6.0", "cfg-if", @@ -453,9 +453,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "openssl-sys" -version = "0.9.103" +version = "0.9.105" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f9e8deee91df40a943c71b917e5874b951d32a802526c85721ce3b776c929d6" +checksum = "8b22d5b84be05a8d6947c7cb71f7c849aa0f112acd4bf51c2a7c1c988ac0a9dc" 
dependencies = [ "cc", "libc", From c1be84197eb6f0f8c793bd251428501097b7c580 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Mon, 3 Feb 2025 15:55:47 -0500 Subject: [PATCH 359/583] feat(pageserver): preempt image layer generation if L0 piles up (#10572) ## Problem Image layer generation could block L0 compactions for a long time. ## Summary of changes * Refactored the return value of `create_image_layers_for_*` functions to make it self-explainable. * Preempt image layer generation in `Try` mode if L0 piles up. Note that we might potentially run into a state that only the beginning part of the keyspace gets image coverage. In that case, we either need to implement something to prioritize some keyspaces with image coverage, or tune the image_creation_threshold to ensure that the frequency of image creation could keep up with L0 compaction. --------- Signed-off-by: Alex Chi Z Co-authored-by: Erik Grinaker --- control_plane/src/pageserver.rs | 5 + libs/pageserver_api/src/config.rs | 9 + libs/pageserver_api/src/models.rs | 8 + pageserver/src/tenant.rs | 3 + pageserver/src/tenant/config.rs | 12 ++ .../storage_layer/batch_split_writer.rs | 4 + pageserver/src/tenant/timeline.rs | 197 +++++++++++++----- pageserver/src/tenant/timeline/compaction.rs | 30 ++- .../regress/test_attach_tenant_config.py | 1 + 9 files changed, 209 insertions(+), 60 deletions(-) diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 383c174684..dd37bfc407 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -388,6 +388,11 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'image_creation_check_threshold' as integer")?, + image_creation_preempt_threshold: settings + .remove("image_creation_preempt_threshold") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'image_creation_preempt_threshold' as integer")?, pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()), walreceiver_connect_timeout: settings .remove("walreceiver_connect_timeout") diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 422da0dc95..a0b5feea94 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -323,6 +323,10 @@ pub struct TenantConfigToml { // Expresed in multiples of checkpoint distance. pub image_layer_creation_check_threshold: u8, + // How many multiples of L0 `compaction_threshold` will preempt image layer creation and do L0 compaction. + // Set to 0 to disable preemption. + pub image_creation_preempt_threshold: usize, + /// The length for an explicit LSN lease request. /// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval. #[serde(with = "humantime_serde")] @@ -547,6 +551,10 @@ pub mod tenant_conf_defaults { // Relevant: https://github.com/neondatabase/neon/issues/3394 pub const DEFAULT_GC_PERIOD: &str = "1 hr"; pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3; + // If there are more than threshold * compaction_threshold (that is 3 * 10 in the default config) L0 layers, image + // layer creation will end immediately. Set to 0 to disable. The target default will be 3 once we + // want to enable this feature. 
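// A minimal sketch of how this knob is interpreted inside Timeline::create_image_layers
// (signature simplified; the real check also requires that the current pass has already
// staged at least one image layer): with compaction_threshold = 10, a preempt threshold of 3
// pauses image layer creation once 30 or more L0 layers have accumulated, and 0 disables it.
fn should_preempt_image_creation(
    l0_layer_count: usize,
    image_creation_preempt_threshold: usize,
    compaction_threshold: usize,
) -> bool {
    let limit = image_creation_preempt_threshold * compaction_threshold;
    limit != 0 && l0_layer_count >= limit
}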
+ pub const DEFAULT_IMAGE_CREATION_PREEMPT_THRESHOLD: usize = 0; pub const DEFAULT_PITR_INTERVAL: &str = "7 days"; pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds"; pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds"; @@ -605,6 +613,7 @@ impl Default for TenantConfigToml { lazy_slru_download: false, timeline_get_throttle: crate::models::ThrottleConfig::disabled(), image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD, + image_creation_preempt_threshold: DEFAULT_IMAGE_CREATION_PREEMPT_THRESHOLD, lsn_lease_length: LsnLease::DEFAULT_LENGTH, lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS, timeline_offloading: false, diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 43447c67bd..19beb37ab3 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -498,6 +498,8 @@ pub struct TenantConfigPatch { #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub image_layer_creation_check_threshold: FieldPatch, #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub image_creation_preempt_threshold: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub lsn_lease_length: FieldPatch, #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub lsn_lease_length_for_ts: FieldPatch, @@ -544,6 +546,7 @@ pub struct TenantConfig { pub lazy_slru_download: Option, pub timeline_get_throttle: Option, pub image_layer_creation_check_threshold: Option, + pub image_creation_preempt_threshold: Option, pub lsn_lease_length: Option, pub lsn_lease_length_for_ts: Option, pub timeline_offloading: Option, @@ -581,6 +584,7 @@ impl TenantConfig { mut lazy_slru_download, mut timeline_get_throttle, mut image_layer_creation_check_threshold, + mut image_creation_preempt_threshold, mut lsn_lease_length, mut lsn_lease_length_for_ts, mut timeline_offloading, @@ -635,6 +639,9 @@ impl TenantConfig { patch .image_layer_creation_check_threshold .apply(&mut image_layer_creation_check_threshold); + patch + .image_creation_preempt_threshold + .apply(&mut image_creation_preempt_threshold); patch.lsn_lease_length.apply(&mut lsn_lease_length); patch .lsn_lease_length_for_ts @@ -679,6 +686,7 @@ impl TenantConfig { lazy_slru_download, timeline_get_throttle, image_layer_creation_check_threshold, + image_creation_preempt_threshold, lsn_lease_length, lsn_lease_length_for_ts, timeline_offloading, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 1914a95562..80a61eba92 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -5486,6 +5486,9 @@ pub(crate) mod harness { image_layer_creation_check_threshold: Some( tenant_conf.image_layer_creation_check_threshold, ), + image_creation_preempt_threshold: Some( + tenant_conf.image_creation_preempt_threshold, + ), lsn_lease_length: Some(tenant_conf.lsn_lease_length), lsn_lease_length_for_ts: Some(tenant_conf.lsn_lease_length_for_ts), timeline_offloading: Some(tenant_conf.timeline_offloading), diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 139ed27bd2..972837dc44 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -357,6 +357,9 @@ pub struct TenantConfOpt { #[serde(skip_serializing_if = "Option::is_none")] pub image_layer_creation_check_threshold: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub image_creation_preempt_threshold: Option, + #[serde(skip_serializing_if = "Option::is_none")] #[serde(with = "humantime_serde")] 
#[serde(default)] @@ -453,6 +456,9 @@ impl TenantConfOpt { image_layer_creation_check_threshold: self .image_layer_creation_check_threshold .unwrap_or(global_conf.image_layer_creation_check_threshold), + image_creation_preempt_threshold: self + .image_creation_preempt_threshold + .unwrap_or(global_conf.image_creation_preempt_threshold), lsn_lease_length: self .lsn_lease_length .unwrap_or(global_conf.lsn_lease_length), @@ -504,6 +510,7 @@ impl TenantConfOpt { mut lazy_slru_download, mut timeline_get_throttle, mut image_layer_creation_check_threshold, + mut image_creation_preempt_threshold, mut lsn_lease_length, mut lsn_lease_length_for_ts, mut timeline_offloading, @@ -578,6 +585,9 @@ impl TenantConfOpt { patch .image_layer_creation_check_threshold .apply(&mut image_layer_creation_check_threshold); + patch + .image_creation_preempt_threshold + .apply(&mut image_creation_preempt_threshold); patch .lsn_lease_length .map(|v| humantime::parse_duration(&v))? @@ -626,6 +636,7 @@ impl TenantConfOpt { lazy_slru_download, timeline_get_throttle, image_layer_creation_check_threshold, + image_creation_preempt_threshold, lsn_lease_length, lsn_lease_length_for_ts, timeline_offloading, @@ -689,6 +700,7 @@ impl From for models::TenantConfig { lazy_slru_download: value.lazy_slru_download, timeline_get_throttle: value.timeline_get_throttle, image_layer_creation_check_threshold: value.image_layer_creation_check_threshold, + image_creation_preempt_threshold: value.image_creation_preempt_threshold, lsn_lease_length: value.lsn_lease_length.map(humantime), lsn_lease_length_for_ts: value.lsn_lease_length_for_ts.map(humantime), timeline_offloading: value.timeline_offloading, diff --git a/pageserver/src/tenant/storage_layer/batch_split_writer.rs b/pageserver/src/tenant/storage_layer/batch_split_writer.rs index 22d8b81bcc..7da51c27df 100644 --- a/pageserver/src/tenant/storage_layer/batch_split_writer.rs +++ b/pageserver/src/tenant/storage_layer/batch_split_writer.rs @@ -166,6 +166,10 @@ impl BatchLayerWriter { // END: catch every error and do the recovery in the above section Ok(generated_layers) } + + pub fn pending_layer_num(&self) -> usize { + self.generated_layer_writers.len() + } } /// An image writer that takes images and produces multiple image layers. diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index d6a8eaa4d9..d65b382e50 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -189,6 +189,14 @@ pub enum ImageLayerCreationMode { Initial, } +#[derive(Clone, Debug, Default)] +pub enum LastImageLayerCreationStatus { + Incomplete, // TODO: record the last key being processed + Complete, + #[default] + Initial, +} + impl std::fmt::Display for ImageLayerCreationMode { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{:?}", self) @@ -347,6 +355,8 @@ pub struct Timeline { // garbage collecting data that is still needed by the child timelines. pub(crate) gc_info: std::sync::RwLock, + pub(crate) last_image_layer_creation_status: ArcSwap, + // It may change across major versions so for simplicity // keep it after running initdb for a timeline. 
// It is needed in checks when we want to error on some operations @@ -936,9 +946,16 @@ pub(crate) enum ShutdownMode { Hard, } -struct ImageLayerCreationOutcome { - unfinished_image_layer: Option, - next_start_key: Key, +enum ImageLayerCreationOutcome { + /// We generated an image layer + Generated { + unfinished_image_layer: ImageLayerWriter, + }, + /// The key range is empty + Empty, + /// (Only used in metadata image layer creation), after reading the metadata keys, we decide to skip + /// the image layer creation. + Skip, } /// Public interface functions @@ -2349,6 +2366,18 @@ impl Timeline { ) } + fn get_image_creation_preempt_threshold(&self) -> usize { + let tenant_conf = self.tenant_conf.load(); + tenant_conf + .tenant_conf + .image_creation_preempt_threshold + .unwrap_or( + self.conf + .default_tenant_conf + .image_creation_preempt_threshold, + ) + } + /// Resolve the effective WAL receiver protocol to use for this tenant. /// /// Priority order is: @@ -2499,6 +2528,10 @@ impl Timeline { gc_info: std::sync::RwLock::new(GcInfo::default()), + last_image_layer_creation_status: ArcSwap::new(Arc::new( + LastImageLayerCreationStatus::default(), + )), + latest_gc_cutoff_lsn: Rcu::new(metadata.latest_gc_cutoff_lsn()), initdb_lsn: metadata.initdb_lsn(), @@ -4042,15 +4075,20 @@ impl Timeline { } let mut layers_to_upload = Vec::new(); - layers_to_upload.extend( - self.create_image_layers( + let (generated_image_layers, is_complete) = self + .create_image_layers( &partitions, self.initdb_lsn, ImageLayerCreationMode::Initial, ctx, + LastImageLayerCreationStatus::Initial, ) - .await?, + .await?; + debug_assert!( + matches!(is_complete, LastImageLayerCreationStatus::Complete), + "init image generation mode must fully cover the keyspace" ); + layers_to_upload.extend(generated_image_layers); (layers_to_upload, None) } else { @@ -4370,7 +4408,6 @@ impl Timeline { lsn: Lsn, ctx: &RequestContext, img_range: Range, - start: Key, io_concurrency: IoConcurrency, ) -> Result { let mut wrote_keys = false; @@ -4458,26 +4495,23 @@ impl Timeline { lsn }, ); - Ok(ImageLayerCreationOutcome { - unfinished_image_layer: Some(image_layer_writer), - next_start_key: img_range.end, + Ok(ImageLayerCreationOutcome::Generated { + unfinished_image_layer: image_layer_writer, }) } else { - // Special case: the image layer may be empty if this is a sharded tenant and the - // partition does not cover any keys owned by this shard. In this case, to ensure - // we don't leave gaps between image layers, leave `start` where it is, so that the next - // layer we write will cover the key range that we just scanned. tracing::debug!("no data in range {}-{}", img_range.start, img_range.end); - Ok(ImageLayerCreationOutcome { - unfinished_image_layer: None, - next_start_key: start, - }) + Ok(ImageLayerCreationOutcome::Empty) } } /// Create an image layer for metadata keys. This function produces one image layer for all metadata /// keys for now. Because metadata keys cannot exceed basebackup size limit, the image layer for it /// would not be too large to fit in a single image layer. + /// + /// Creating image layers for metadata keys are different from relational keys. Firstly, instead of + /// iterating each key and get an image for each of them, we do a `vectored_get` scan over the sparse + /// keyspace to get all images in one run. Secondly, we use a different image layer generation metrics + /// for metadata keys than relational keys, which is the number of delta files visited during the scan. 
#[allow(clippy::too_many_arguments)] async fn create_image_layer_for_metadata_keys( self: &Arc, @@ -4487,12 +4521,13 @@ impl Timeline { ctx: &RequestContext, img_range: Range, mode: ImageLayerCreationMode, - start: Key, io_concurrency: IoConcurrency, ) -> Result { // Metadata keys image layer creation. let mut reconstruct_state = ValuesReconstructState::new(io_concurrency); let begin = Instant::now(); + // Directly use `get_vectored_impl` to skip the max_vectored_read_key limit check. Note that the keyspace should + // not contain too many keys, otherwise this takes a lot of memory. let data = self .get_vectored_impl(partition.clone(), lsn, &mut reconstruct_state, ctx) .await?; @@ -4517,10 +4552,7 @@ impl Timeline { ); if !trigger_generation && mode == ImageLayerCreationMode::Try { - return Ok(ImageLayerCreationOutcome { - unfinished_image_layer: None, - next_start_key: img_range.end, - }); + return Ok(ImageLayerCreationOutcome::Skip); } if self.cancel.is_cancelled() { return Err(CreateImageLayersError::Cancelled); @@ -4551,20 +4583,12 @@ impl Timeline { lsn } ); - Ok(ImageLayerCreationOutcome { - unfinished_image_layer: Some(image_layer_writer), - next_start_key: img_range.end, + Ok(ImageLayerCreationOutcome::Generated { + unfinished_image_layer: image_layer_writer, }) } else { - // Special case: the image layer may be empty if this is a sharded tenant and the - // partition does not cover any keys owned by this shard. In this case, to ensure - // we don't leave gaps between image layers, leave `start` where it is, so that the next - // layer we write will cover the key range that we just scanned. tracing::debug!("no data in range {}-{}", img_range.start, img_range.end); - Ok(ImageLayerCreationOutcome { - unfinished_image_layer: None, - next_start_key: start, - }) + Ok(ImageLayerCreationOutcome::Empty) } } @@ -4620,6 +4644,8 @@ impl Timeline { decision } + /// Returns the image layers generated and an enum indicating whether the process is fully completed. + /// true = we have generate all image layers, false = we preempt the process for L0 compaction. #[tracing::instrument(skip_all, fields(%lsn, %mode))] async fn create_image_layers( self: &Arc, @@ -4627,7 +4653,8 @@ impl Timeline { lsn: Lsn, mode: ImageLayerCreationMode, ctx: &RequestContext, - ) -> Result, CreateImageLayersError> { + last_status: LastImageLayerCreationStatus, + ) -> Result<(Vec, LastImageLayerCreationStatus), CreateImageLayersError> { let timer = self.metrics.create_images_time_histo.start_timer(); // We need to avoid holes between generated image layers. @@ -4641,10 +4668,23 @@ impl Timeline { // image layers <100000000..100000099> and <200000000..200000199> are not completely covering it. 
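// A compressed sketch of the contiguity rule the loop below enforces, with Key simplified to
// an integer: how the running `start` key advances for each partition outcome so that the
// generated image layers never leave accidental holes.
enum OutcomeSketch {
    Generated, // an image layer was written for this partition
    Empty,     // this shard owns no keys in the partition
    Skip,      // metadata partition intentionally left without an image layer
}

fn advance_start(start: u64, img_range_end: u64, outcome: OutcomeSketch) -> u64 {
    match outcome {
        // A layer was written for [start, img_range_end): continue right after it.
        OutcomeSketch::Generated => img_range_end,
        // Nothing here: keep `start`, so the next layer we do write also covers this range.
        OutcomeSketch::Empty => start,
        // Deliberately skipped: move past the range, no image layer is wanted for it.
        OutcomeSketch::Skip => img_range_end,
    }
}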
let mut start = Key::MIN; - let check_for_image_layers = self.should_check_if_image_layers_required(lsn); + let check_for_image_layers = if let LastImageLayerCreationStatus::Incomplete = last_status { + info!( + "resuming image layer creation: last_status={:?}", + last_status + ); + true + } else { + self.should_check_if_image_layers_required(lsn) + }; let mut batch_image_writer = BatchLayerWriter::new(self.conf).await?; + let mut all_generated = true; + + let mut partition_processed = 0; + let total_partitions = partitioning.parts.len(); + for partition in partitioning.parts.iter() { if self.cancel.is_cancelled() { return Err(CreateImageLayersError::Cancelled); @@ -4717,17 +4757,13 @@ impl Timeline { .map_err(|_| CreateImageLayersError::Cancelled)?, ); - let ImageLayerCreationOutcome { - unfinished_image_layer, - next_start_key, - } = if !compact_metadata { + let outcome = if !compact_metadata { self.create_image_layer_for_rel_blocks( partition, image_layer_writer, lsn, ctx, img_range.clone(), - start, io_concurrency, ) .await? @@ -4739,18 +4775,58 @@ impl Timeline { ctx, img_range.clone(), mode, - start, io_concurrency, ) .await? }; - start = next_start_key; - if let Some(unfinished_image_layer) = unfinished_image_layer { - batch_image_writer.add_unfinished_image_writer( + match outcome { + ImageLayerCreationOutcome::Empty => { + // No data in this partition, so we don't need to create an image layer (for now). + // The next image layer should cover this key range, so we don't advance the `start` + // key. + } + ImageLayerCreationOutcome::Generated { unfinished_image_layer, - img_range, - lsn, - ); + } => { + batch_image_writer.add_unfinished_image_writer( + unfinished_image_layer, + img_range.clone(), + lsn, + ); + // The next image layer should be generated right after this one. + start = img_range.end; + } + ImageLayerCreationOutcome::Skip => { + // We don't need to create an image layer for this partition. + // The next image layer should NOT cover this range, otherwise + // the keyspace becomes empty (reads don't go past image layers). + start = img_range.end; + } + } + + partition_processed += 1; + + if let ImageLayerCreationMode::Try = mode { + // We have at least made some progress + if batch_image_writer.pending_layer_num() >= 1 { + // The `Try` mode is currently only used on the compaction path. We want to avoid + // image layer generation taking too long time and blocking L0 compaction. So in this + // mode, we also inspect the current number of L0 layers and skip image layer generation + // if there are too many of them. + let num_of_l0_layers = { + let layers = self.layers.read().await; + layers.layer_map()?.level0_deltas().len() + }; + let image_preempt_threshold = self.get_image_creation_preempt_threshold() + * self.get_compaction_threshold(); + if image_preempt_threshold != 0 && num_of_l0_layers >= image_preempt_threshold { + tracing::info!( + "preempt image layer generation at {start} at {lsn}: too many L0 layers {num_of_l0_layers}", + ); + all_generated = false; + break; + } + } } } @@ -4765,14 +4841,35 @@ impl Timeline { .open_mut()? 
.track_new_image_layers(&image_layers, &self.metrics); drop_wlock(guard); - timer.stop_and_record(); + let duration = timer.stop_and_record(); // Creating image layers may have caused some previously visible layers to be covered if !image_layers.is_empty() { self.update_layer_visibility().await?; } - Ok(image_layers) + let total_layer_size = image_layers + .iter() + .map(|l| l.metadata().file_size) + .sum::(); + + info!( + "created {} image layers ({} bytes) in {}s, processed {} out of {} partitions", + image_layers.len(), + total_layer_size, + duration.as_secs_f64(), + partition_processed, + total_partitions + ); + + Ok(( + image_layers, + if all_generated { + LastImageLayerCreationStatus::Complete + } else { + LastImageLayerCreationStatus::Incomplete + }, + )) } /// Wait until the background initial logical size calculation is complete, or diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 7244e946cb..9bd61bbac5 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -11,7 +11,7 @@ use std::sync::Arc; use super::layer_manager::LayerManager; use super::{ CompactFlags, CompactOptions, CreateImageLayersError, DurationRecorder, ImageLayerCreationMode, - RecordedDuration, Timeline, + LastImageLayerCreationStatus, RecordedDuration, Timeline, }; use anyhow::{anyhow, bail, Context}; @@ -709,7 +709,7 @@ impl Timeline { .extend(sparse_partitioning.into_dense().parts); // 3. Create new image layers for partitions that have been modified "enough". - let image_layers = self + let (image_layers, outcome) = self .create_image_layers( &partitioning, lsn, @@ -722,10 +722,22 @@ impl Timeline { ImageLayerCreationMode::Try }, &image_ctx, + self.last_image_layer_creation_status + .load() + .as_ref() + .clone(), ) .await?; + self.last_image_layer_creation_status + .store(Arc::new(outcome.clone())); + self.upload_new_image_layers(image_layers)?; + if let LastImageLayerCreationStatus::Incomplete = outcome { + // Yield and do not do any other kind of compaction. + info!("skipping shard ancestor compaction due to pending image layer generation tasks (preempted by L0 compaction)."); + return Ok(true); + } partitioning.parts.len() } Err(err) => { @@ -3232,11 +3244,7 @@ impl TimelineAdaptor { ranges: self.get_keyspace(key_range, lsn, ctx).await?, }; // TODO set proper (stateful) start. 
The create_image_layer_for_rel_blocks function mostly - let start = Key::MIN; - let ImageLayerCreationOutcome { - unfinished_image_layer, - next_start_key: _, - } = self + let outcome = self .timeline .create_image_layer_for_rel_blocks( &keyspace, @@ -3244,13 +3252,15 @@ impl TimelineAdaptor { lsn, ctx, key_range.clone(), - start, IoConcurrency::sequential(), ) .await?; - if let Some(image_layer_writer) = unfinished_image_layer { - let (desc, path) = image_layer_writer.finish(ctx).await?; + if let ImageLayerCreationOutcome::Generated { + unfinished_image_layer, + } = outcome + { + let (desc, path) = unfinished_image_layer.finish(ctx).await?; let image_layer = Layer::finish_creating(self.timeline.conf, &self.timeline, desc, &path)?; self.new_images.push(image_layer); diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index e88d245c8f..a4b9eabf8e 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -184,6 +184,7 @@ def test_fully_custom_config(positive_env: NeonEnv): "gc_compaction_enabled": True, "gc_compaction_initial_threshold_kb": 1024000, "gc_compaction_ratio_percent": 200, + "image_creation_preempt_threshold": 5, } vps_http = env.storage_controller.pageserver_api() From e219d48bfe9991f40eb0d6c4f8713c9f8dc9eb05 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Mon, 3 Feb 2025 16:56:55 -0500 Subject: [PATCH 360/583] refactor(pageserver): clearify compaction return value (#10643) ## Problem ## Summary of changes Make the return value of the set of compaction functions less confusing. Signed-off-by: Alex Chi Z --- pageserver/src/tenant.rs | 35 +++++++------ pageserver/src/tenant/tasks.rs | 5 +- pageserver/src/tenant/timeline.rs | 13 ++--- pageserver/src/tenant/timeline/compaction.rs | 55 +++++++++++++------- 4 files changed, 67 insertions(+), 41 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 80a61eba92..c1b408ed72 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -46,6 +46,7 @@ use std::sync::atomic::AtomicBool; use std::sync::Weak; use std::time::SystemTime; use storage_broker::BrokerClientChannel; +use timeline::compaction::CompactionOutcome; use timeline::compaction::GcCompactionQueue; use timeline::import_pgdata; use timeline::offload::offload_timeline; @@ -2907,10 +2908,10 @@ impl Tenant { self: &Arc, cancel: &CancellationToken, ctx: &RequestContext, - ) -> Result { + ) -> Result { // Don't start doing work during shutdown, or when broken, we do not need those in the logs if !self.is_active() { - return Ok(false); + return Ok(CompactionOutcome::Done); } { @@ -2924,7 +2925,7 @@ impl Tenant { // to AttachedSingle state. 
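// A sketch of how callers are expected to treat the CompactionOutcome returned from here,
// mirroring the compaction_loop change in tasks.rs below (`period` is the tenant's
// configured compaction period):
fn next_sleep(outcome: CompactionOutcome, period: std::time::Duration) -> std::time::Duration {
    match outcome {
        // Work is still queued (e.g. an L0 backlog): schedule the next iteration immediately.
        CompactionOutcome::Pending => std::time::Duration::ZERO,
        // Nothing left to do right now: wait out the normal compaction period.
        CompactionOutcome::Done => period,
    }
}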
if !conf.location.may_upload_layers_hint() { info!("Skipping compaction in location state {:?}", conf.location); - return Ok(false); + return Ok(CompactionOutcome::Done); } } @@ -2967,7 +2968,7 @@ impl Tenant { // Before doing any I/O work, check our circuit breaker if self.compaction_circuit_breaker.lock().unwrap().is_broken() { info!("Skipping compaction due to previous failures"); - return Ok(false); + return Ok(CompactionOutcome::Done); } let mut has_pending_task = false; @@ -2975,10 +2976,10 @@ impl Tenant { for (timeline_id, timeline, (can_compact, can_offload)) in &timelines_to_compact_or_offload { // pending_task_left == None: cannot compact, maybe still pending tasks - // pending_task_left == Some(true): compaction task left - // pending_task_left == Some(false): no compaction task left + // pending_task_left == Some(Pending): compaction task left + // pending_task_left == Some(Done): no compaction task left let pending_task_left = if *can_compact { - let has_pending_l0_compaction_task = timeline + let compaction_outcome = timeline .compact(cancel, EnumSet::empty(), ctx) .instrument(info_span!("compact_timeline", %timeline_id)) .await @@ -2996,27 +2997,27 @@ impl Tenant { .fail(&CIRCUIT_BREAKERS_BROKEN, e); } })?; - if has_pending_l0_compaction_task { - Some(true) + if let CompactionOutcome::Pending = compaction_outcome { + Some(CompactionOutcome::Pending) } else { let queue = { let guard = self.scheduled_compaction_tasks.lock().unwrap(); guard.get(timeline_id).cloned() }; if let Some(queue) = queue { - let has_pending_tasks = queue + let outcome = queue .iteration(cancel, ctx, &self.gc_block, timeline) .await?; - Some(has_pending_tasks) + Some(outcome) } else { - Some(false) + Some(CompactionOutcome::Done) } } } else { None }; - has_pending_task |= pending_task_left.unwrap_or(false); - if pending_task_left == Some(false) && *can_offload { + has_pending_task |= pending_task_left == Some(CompactionOutcome::Pending); + if pending_task_left == Some(CompactionOutcome::Done) && *can_offload { pausable_failpoint!("before-timeline-auto-offload"); match offload_timeline(self, timeline) .instrument(info_span!("offload_timeline", %timeline_id)) @@ -3036,7 +3037,11 @@ impl Tenant { .unwrap() .success(&CIRCUIT_BREAKERS_UNBROKEN); - Ok(has_pending_task) + Ok(if has_pending_task { + CompactionOutcome::Pending + } else { + CompactionOutcome::Done + }) } /// Cancel scheduled compaction tasks diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 3725e2f7fc..b6b64d02dd 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -11,6 +11,7 @@ use crate::metrics::TENANT_TASK_EVENTS; use crate::task_mgr; use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME}; use crate::tenant::throttle::Stats; +use crate::tenant::timeline::compaction::CompactionOutcome; use crate::tenant::timeline::CompactionError; use crate::tenant::{Tenant, TenantState}; use rand::Rng; @@ -206,10 +207,10 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { .run(tenant.compaction_iteration(&cancel, &ctx)) .await; match output { - Ok(has_pending_task) => { + Ok(outcome) => { error_run_count = 0; // schedule the next compaction immediately in case there is a pending compaction task - sleep_duration = if has_pending_task { + sleep_duration = if let CompactionOutcome::Pending = outcome { Duration::ZERO } else { period diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index d65b382e50..11c0bbdfe5 100644 --- 
a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -18,6 +18,7 @@ use arc_swap::{ArcSwap, ArcSwapOption}; use bytes::Bytes; use camino::Utf8Path; use chrono::{DateTime, Utc}; +use compaction::CompactionOutcome; use enumset::EnumSet; use fail::fail_point; use futures::{stream::FuturesUnordered, StreamExt}; @@ -1679,7 +1680,7 @@ impl Timeline { cancel: &CancellationToken, flags: EnumSet, ctx: &RequestContext, - ) -> Result { + ) -> Result { self.compact_with_options( cancel, CompactOptions { @@ -1701,7 +1702,7 @@ impl Timeline { cancel: &CancellationToken, options: CompactOptions, ctx: &RequestContext, - ) -> Result { + ) -> Result { // most likely the cancellation token is from background task, but in tests it could be the // request task as well. @@ -1721,8 +1722,8 @@ impl Timeline { // compaction task goes over it's period (20s) which is quite often in production. let (_guard, _permit) = tokio::select! { tuple = prepare => { tuple }, - _ = self.cancel.cancelled() => return Ok(false), - _ = cancel.cancelled() => return Ok(false), + _ = self.cancel.cancelled() => return Ok(CompactionOutcome::Done), + _ = cancel.cancelled() => return Ok(CompactionOutcome::Done), }; let last_record_lsn = self.get_last_record_lsn(); @@ -1730,13 +1731,13 @@ impl Timeline { // Last record Lsn could be zero in case the timeline was just created if !last_record_lsn.is_valid() { warn!("Skipping compaction for potentially just initialized timeline, it has invalid last record lsn: {last_record_lsn}"); - return Ok(false); + return Ok(CompactionOutcome::Done); } let result = match self.get_compaction_algorithm_settings().kind { CompactionAlgorithm::Tiered => { self.compact_tiered(cancel, ctx).await?; - Ok(false) + Ok(CompactionOutcome::Done) } CompactionAlgorithm::Legacy => self.compact_legacy(cancel, options, ctx).await, }; diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 9bd61bbac5..7dd37d7232 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -262,13 +262,13 @@ impl GcCompactionQueue { ctx: &RequestContext, gc_block: &GcBlock, timeline: &Arc, - ) -> Result { + ) -> Result { let _one_op_at_a_time_guard = self.consumer_lock.lock().await; let has_pending_tasks; let (id, item) = { let mut guard = self.inner.lock().unwrap(); let Some((id, item)) = guard.queued.pop_front() else { - return Ok(false); + return Ok(CompactionOutcome::Done); }; guard.running = Some((id, item.clone())); has_pending_tasks = !guard.queued.is_empty(); @@ -323,7 +323,11 @@ impl GcCompactionQueue { let mut guard = self.inner.lock().unwrap(); guard.running = None; } - Ok(has_pending_tasks) + Ok(if has_pending_tasks { + CompactionOutcome::Pending + } else { + CompactionOutcome::Done + }) } #[allow(clippy::type_complexity)] @@ -589,6 +593,17 @@ impl CompactionStatistics { } } +#[derive(Default, Debug, Clone, Copy, PartialEq, Eq)] +pub enum CompactionOutcome { + #[default] + /// No layers need to be compacted after this round. Compaction doesn't need + /// to be immediately scheduled. + Done, + /// Still has pending layers to be compacted after this round. Ideally, the scheduler + /// should immediately schedule another compaction. 
+ Pending, +} + impl Timeline { /// TODO: cancellation /// @@ -598,7 +613,7 @@ impl Timeline { cancel: &CancellationToken, options: CompactOptions, ctx: &RequestContext, - ) -> Result { + ) -> Result { if options .flags .contains(CompactFlags::EnhancedGcBottomMostCompaction) @@ -606,7 +621,7 @@ impl Timeline { self.compact_with_gc(cancel, options, ctx) .await .map_err(CompactionError::Other)?; - return Ok(false); + return Ok(CompactionOutcome::Done); } if options.flags.contains(CompactFlags::DryRun) { @@ -666,9 +681,9 @@ impl Timeline { // Define partitioning schema if needed // 1. L0 Compact - let fully_compacted = { + let l0_compaction_outcome = { let timer = self.metrics.compact_time_histo.start_timer(); - let fully_compacted = self + let l0_compaction_outcome = self .compact_level0( target_file_size, options.flags.contains(CompactFlags::ForceL0Compaction), @@ -676,15 +691,15 @@ impl Timeline { ) .await?; timer.stop_and_record(); - fully_compacted + l0_compaction_outcome }; - if !fully_compacted { + if let CompactionOutcome::Pending = l0_compaction_outcome { // Yield and do not do any other kind of compaction. True means // that we have pending L0 compaction tasks and the compaction scheduler // will prioritize compacting this tenant/timeline again. info!("skipping image layer generation and shard ancestor compaction due to L0 compaction did not include all layers."); - return Ok(true); + return Ok(CompactionOutcome::Pending); } // 2. Repartition and create image layers if necessary @@ -736,7 +751,7 @@ impl Timeline { if let LastImageLayerCreationStatus::Incomplete = outcome { // Yield and do not do any other kind of compaction. info!("skipping shard ancestor compaction due to pending image layer generation tasks (preempted by L0 compaction)."); - return Ok(true); + return Ok(CompactionOutcome::Pending); } partitioning.parts.len() } @@ -765,7 +780,7 @@ impl Timeline { self.compact_shard_ancestors(rewrite_max, ctx).await?; } - Ok(false) + Ok(CompactionOutcome::Done) } /// Check for layers that are elegible to be rewritten: @@ -1022,11 +1037,11 @@ impl Timeline { target_file_size: u64, force_compaction_ignore_threshold: bool, ctx: &RequestContext, - ) -> Result { + ) -> Result { let CompactLevel0Phase1Result { new_layers, deltas_to_compact, - fully_compacted, + outcome, } = { let phase1_span = info_span!("compact_level0_phase1"); let ctx = ctx.attached_child(); @@ -1055,12 +1070,12 @@ impl Timeline { if new_layers.is_empty() && deltas_to_compact.is_empty() { // nothing to do - return Ok(true); + return Ok(CompactionOutcome::Done); } self.finish_compact_batch(&new_layers, &Vec::new(), &deltas_to_compact) .await?; - Ok(fully_compacted) + Ok(outcome) } /// Level0 files first phase of compaction, explained in the [`Self::compact_legacy`] comment. @@ -1602,7 +1617,11 @@ impl Timeline { .into_iter() .map(|x| x.drop_eviction_guard()) .collect::>(), - fully_compacted, + outcome: if fully_compacted { + CompactionOutcome::Done + } else { + CompactionOutcome::Pending + }, }) } } @@ -1613,7 +1632,7 @@ struct CompactLevel0Phase1Result { deltas_to_compact: Vec, // Whether we have included all L0 layers, or selected only part of them due to the // L0 compaction size limit. - fully_compacted: bool, + outcome: CompactionOutcome, } #[derive(Default)] From 8107140f7f8145148f8b9ffd007046c81c724c25 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 4 Feb 2025 11:35:43 +0100 Subject: [PATCH 361/583] Refactor compute dockerfile (#10371) Refactor how extensions are built in compute Dockerfile 1. 
Rename some of the extension layers, so that names correspond more precisely to the upstream repository name and the source directory name. For example, instead of "pg-jsonschema-pg-build", spell it "pg_jsonschema-build". Some of the layer names had the extra "pg-" part, and some didn't; harmonize on not having it. And use an underscore if the upstream project name uses an underscore. 2. Each extension now consists of two dockerfile targets: [extension]-src and [extension]-build. By convention, the -src target downloads the sources and applies any neon-specific patches if necessary. The source tarball is downloaded and extracted under /ext-src. For example, the 'pgvector' extension creates the following files and directory: /ext-src/pgvector.tar.gz # original tarball /ext-src/pgvector.patch # neon-specific patch, copied from patches/ dir /ext-src/pgvector-src/ # extracted tarball, with patch applied This separation avoids re-downloading the sources every time the extension is recompiled. The 'extension-tests' target also uses the [extension]-src layers, by copying the /ext-src/ dirs from all the extensions together into one image. This refactoring came about when I was experimenting with different ways of splitting up the Dockerfile so that each extension would be in a separate file. That's not part of this PR yet, but this is a good step in modularizing the extensions. --- .github/workflows/build_and_test.yml | 2 +- compute/compute-node.Dockerfile | 896 ++++++++++++++++++--------- 2 files changed, 599 insertions(+), 299 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 1274543429..5a4bdecb99 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -682,7 +682,7 @@ jobs: push: true pull: true file: compute/compute-node.Dockerfile - target: neon-pg-ext-test + target: extension-tests cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }} tags: | neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.tag.outputs.build-tag}}-${{ matrix.version.debian }}-${{ matrix.arch }} diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 32226c56a5..ea29630001 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1,3 +1,81 @@ +# +# This Dockerfile builds the compute image. It is built multiple times to produce +# different images for each PostgreSQL major version. +# +# We use Debian as the base for all the steps. The production images use Debian bookworm +# for v17, and Debian bullseye for older PostgreSQL versions. +# +# ## Intermediary layers +# +# build-tools: This contains Rust compiler toolchain and other tools needed at compile +# time. This is also used for the storage builds. This image is defined in +# build-tools.Dockerfile. +# +# build-deps: Contains C compiler, other build tools, and compile-time dependencies +# needed to compile PostgreSQL and most extensions. (Some extensions need +# extra tools and libraries that are not included in this image. They are +# installed in the extension-specific build stages.) +# +# pg-build: Result of compiling PostgreSQL. The PostgreSQL binaries are copied from +# this to the final image. This is also used as the base for compiling all +# the extensions. +# +# compute-tools: This contains compute_ctl, the launcher program that starts Postgres +# in Neon. 
It also contains a few other tools that are built from the +# sources from this repository and used in compute VMs: 'fast_import' and +# 'local_proxy' +# +# ## Extensions +# +# By convention, the build of each extension consists of two layers: +# +# {extension}-src: Contains the source tarball, possible neon-specific patches, and +# the extracted tarball with the patches applied. All of these are +# under the /ext-src/ directory. +# +# {extension}-build: Contains the installed extension files, under /usr/local/pgsql +# (in addition to the PostgreSQL binaries inherited from the pg-build +# image). A few extensions need extra libraries or other files +# installed elsewhere in the filesystem. They are installed by ONBUILD +# directives. +# +# These are merged together into two layers: +# +# all-extensions: All the extension -build layers merged together +# +# extension-tests: All the extension -src layers merged together. This is used by the +# extension tests. The tests are executed against the compiled image, +# but the tests need test scripts, expected result files etc. from the +# original sources, which are not included in the binary image. +# +# ## Extra components +# +# These are extra included in the compute image, but are not directly used by PostgreSQL +# itself. +# +# pgbouncer: pgbouncer and its configuration +# +# sql_exporter: Metrics exporter daemon. +# +# postgres_exporter: Another metrics exporter daemon, for different sets of metrics. +# +# The configuration files for the metrics exporters are under etc/ directory. We use +# a templating system to handle variations between different PostgreSQL versions, +# building slightly different config files for each PostgreSQL version. +# +# +# ## Final image +# +# The final image puts together the PostgreSQL binaries (pg-build), the compute tools +# (compute-tools), all the extensions (all-extensions) and the extra components into +# one image. +# +# VM image: The final image built by this dockerfile isn't actually the final image that +# we use in computes VMs. There's an extra step that adds some files and makes other +# small adjustments, and builds the QCOV2 filesystem image suitable for using in a VM. +# That step is done by the 'vm-builder' tool. See the vm-compute-node-image job in the +# build_and_test.yml github workflow for how that's done. + ARG PG_VERSION ARG REPOSITORY=neondatabase ARG IMAGE=build-tools @@ -122,17 +200,9 @@ ENV PATH="/usr/local/pgsql/bin:$PATH" # Build PostGIS from the upstream PostGIS mirror. # ######################################################################################### -FROM pg-build AS postgis-build +FROM build-deps AS postgis-src ARG DEBIAN_VERSION ARG PG_VERSION -RUN apt update && \ - apt install --no-install-recommends --no-install-suggests -y \ - gdal-bin libboost-dev libboost-thread-dev libboost-filesystem-dev \ - libboost-system-dev libboost-iostreams-dev libboost-program-options-dev libboost-timer-dev \ - libcgal-dev libgdal-dev libgmp-dev libmpfr-dev libopenscenegraph-dev libprotobuf-c-dev \ - protobuf-c-compiler xsltproc \ - && apt clean && rm -rf /var/lib/apt/lists/* - # Postgis 3.5.0 requires SFCGAL 1.4+ # @@ -141,6 +211,7 @@ RUN apt update && \ # and also we must check backward compatibility with older versions of PostGIS. 
# # Use new version only for v17 +WORKDIR /ext-src RUN case "${DEBIAN_VERSION}" in \ "bookworm") \ export SFCGAL_VERSION=1.4.1 \ @@ -154,15 +225,12 @@ RUN case "${DEBIAN_VERSION}" in \ echo "unexpected PostgreSQL version" && exit 1 \ ;; \ esac && \ - mkdir -p /sfcgal && \ wget https://gitlab.com/sfcgal/SFCGAL/-/archive/v${SFCGAL_VERSION}/SFCGAL-v${SFCGAL_VERSION}.tar.gz -O SFCGAL.tar.gz && \ echo "${SFCGAL_CHECKSUM} SFCGAL.tar.gz" | sha256sum --check && \ - mkdir sfcgal-src && cd sfcgal-src && tar xzf ../SFCGAL.tar.gz --strip-components=1 -C . && \ - cmake -DCMAKE_BUILD_TYPE=Release -GNinja . && ninja -j $(getconf _NPROCESSORS_ONLN) && \ - DESTDIR=/sfcgal ninja install -j $(getconf _NPROCESSORS_ONLN) && \ - ninja clean && cp -R /sfcgal/* / + mkdir sfcgal-src && cd sfcgal-src && tar xzf ../SFCGAL.tar.gz --strip-components=1 -C . # Postgis 3.5.0 supports v17 +WORKDIR /ext-src RUN case "${PG_VERSION}" in \ "v17") \ export POSTGIS_VERSION=3.5.0 \ @@ -178,8 +246,27 @@ RUN case "${PG_VERSION}" in \ esac && \ wget https://download.osgeo.org/postgis/source/postgis-${POSTGIS_VERSION}.tar.gz -O postgis.tar.gz && \ echo "${POSTGIS_CHECKSUM} postgis.tar.gz" | sha256sum --check && \ - mkdir postgis-src && cd postgis-src && tar xzf ../postgis.tar.gz --strip-components=1 -C . && \ - ./autogen.sh && \ + mkdir postgis-src && cd postgis-src && tar xzf ../postgis.tar.gz --strip-components=1 -C . + +# This is reused for pgrouting +FROM pg-build AS postgis-build-deps +RUN apt update && \ + apt install --no-install-recommends --no-install-suggests -y \ + gdal-bin libboost-dev libboost-thread-dev libboost-filesystem-dev \ + libboost-system-dev libboost-iostreams-dev libboost-program-options-dev libboost-timer-dev \ + libcgal-dev libgdal-dev libgmp-dev libmpfr-dev libopenscenegraph-dev libprotobuf-c-dev \ + protobuf-c-compiler xsltproc \ + && apt clean && rm -rf /var/lib/apt/lists/* + +FROM postgis-build-deps AS postgis-build +COPY --from=postgis-src /ext-src/ /ext-src/ +WORKDIR /ext-src/sfcgal-src +RUN cmake -DCMAKE_BUILD_TYPE=Release -GNinja . && ninja -j $(getconf _NPROCESSORS_ONLN) && \ + DESTDIR=/sfcgal ninja install -j $(getconf _NPROCESSORS_ONLN) && \ + ninja clean && cp -R /sfcgal/* / + +WORKDIR /ext-src/postgis-src +RUN ./autogen.sh && \ ./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ @@ -202,12 +289,23 @@ RUN case "${PG_VERSION}" in \ cp /usr/local/pgsql/share/extension/address_standardizer.control /extensions/postgis && \ cp /usr/local/pgsql/share/extension/address_standardizer_data_us.control /extensions/postgis +######################################################################################### +# +# Layer "pgrouting-build" +# Build pgrouting. Note: This depends on the postgis-build-deps layer built above +# +######################################################################################### + # Uses versioned libraries, i.e. libpgrouting-3.4 # and may introduce function signature changes between releases # i.e. 
release 3.5.0 has new signature for pg_dijkstra function # # Use new version only for v17 # last release v3.6.2 - Mar 30, 2024 +FROM build-deps AS pgrouting-src +ARG DEBIAN_VERSION +ARG PG_VERSION +WORKDIR /ext-src RUN case "${PG_VERSION}" in \ "v17") \ export PGROUTING_VERSION=3.6.2 \ @@ -223,8 +321,12 @@ RUN case "${PG_VERSION}" in \ esac && \ wget https://github.com/pgRouting/pgrouting/archive/v${PGROUTING_VERSION}.tar.gz -O pgrouting.tar.gz && \ echo "${PGROUTING_CHECKSUM} pgrouting.tar.gz" | sha256sum --check && \ - mkdir pgrouting-src && cd pgrouting-src && tar xzf ../pgrouting.tar.gz --strip-components=1 -C . && \ - mkdir build && cd build && \ + mkdir pgrouting-src && cd pgrouting-src && tar xzf ../pgrouting.tar.gz --strip-components=1 -C . + +FROM postgis-build-deps AS pgrouting-build +COPY --from=pgrouting-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pgrouting-src +RUN mkdir build && cd build && \ cmake -GNinja -DCMAKE_BUILD_TYPE=Release .. && \ ninja -j $(getconf _NPROCESSORS_ONLN) && \ ninja -j $(getconf _NPROCESSORS_ONLN) install && \ @@ -236,15 +338,11 @@ RUN case "${PG_VERSION}" in \ # Build plv8 # ######################################################################################### -FROM pg-build AS plv8-build +FROM build-deps AS plv8-src ARG PG_VERSION +WORKDIR /ext-src -COPY compute/patches/plv8-3.1.10.patch /plv8-3.1.10.patch - -RUN apt update && \ - apt install --no-install-recommends --no-install-suggests -y \ - ninja-build python3-dev libncurses5 binutils clang \ - && apt clean && rm -rf /var/lib/apt/lists/* +COPY compute/patches/plv8-3.1.10.patch . # plv8 3.2.3 supports v17 # last release v3.2.3 - Sep 7, 2024 @@ -268,9 +366,20 @@ RUN case "${PG_VERSION}" in \ git clone --recurse-submodules --depth 1 --branch ${PLV8_TAG} https://github.com/plv8/plv8.git plv8-src && \ tar -czf plv8.tar.gz --exclude .git plv8-src && \ cd plv8-src && \ - if [[ "${PG_VERSION}" < "v17" ]]; then patch -p1 < /plv8-3.1.10.patch; fi && \ + if [[ "${PG_VERSION}" < "v17" ]]; then patch -p1 < /ext-src/plv8-3.1.10.patch; fi + +FROM pg-build AS plv8-build +ARG PG_VERSION +RUN apt update && \ + apt install --no-install-recommends --no-install-suggests -y \ + ninja-build python3-dev libncurses5 binutils clang \ + && apt clean && rm -rf /var/lib/apt/lists/* + +COPY --from=plv8-src /ext-src/ /ext-src/ +WORKDIR /ext-src/plv8-src +RUN \ # generate and copy upgrade scripts - mkdir -p upgrade && ./generate_upgrade.sh ${PLV8_TAG#v} && \ + make generate_upgrades && \ cp upgrade/* /usr/local/pgsql/share/extension/ && \ make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \ rm -rf /plv8-* && \ @@ -298,16 +407,28 @@ RUN case "${PG_VERSION}" in \ # Build h3_pg # ######################################################################################### -FROM pg-build AS h3-pg-build +FROM build-deps AS h3-pg-src ARG PG_VERSION +WORKDIR /ext-src # not version-specific # last release v4.1.0 - Jan 18, 2023 RUN mkdir -p /h3/usr/ && \ wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz && \ echo "ec99f1f5974846bde64f4513cf8d2ea1b8d172d2218ab41803bf6a63532272bc h3.tar.gz" | sha256sum --check && \ - mkdir h3-src && cd h3-src && tar xzf ../h3.tar.gz --strip-components=1 -C . && \ - mkdir build && cd build && \ + mkdir h3-src && cd h3-src && tar xzf ../h3.tar.gz --strip-components=1 -C . 
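# The h3-pg stages here follow the same {extension}-src / {extension}-build split described
# in the header comment at the top of this file. A minimal sketch of that shape for a
# hypothetical extension "foo" (name, URL, version and checksum are illustrative only):
#
#   FROM build-deps AS foo-src
#   WORKDIR /ext-src
#   RUN wget https://example.com/foo-1.0.tar.gz -O foo.tar.gz && \
#       echo "<sha256>  foo.tar.gz" | sha256sum --check && \
#       mkdir foo-src && cd foo-src && tar xzf ../foo.tar.gz --strip-components=1 -C .
#
#   FROM pg-build AS foo-build
#   COPY --from=foo-src /ext-src/ /ext-src/
#   WORKDIR /ext-src/foo-src
#   RUN make -j $(getconf _NPROCESSORS_ONLN) && \
#       make -j $(getconf _NPROCESSORS_ONLN) install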
+ +# not version-specific +# last release v4.1.3 - Jul 26, 2023 +WORKDIR /ext-src +RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \ + echo "5c17f09a820859ffe949f847bebf1be98511fb8f1bd86f94932512c00479e324 h3-pg.tar.gz" | sha256sum --check && \ + mkdir h3-pg-src && cd h3-pg-src && tar xzf ../h3-pg.tar.gz --strip-components=1 -C . + +FROM pg-build AS h3-pg-build +COPY --from=h3-pg-src /ext-src/ /ext-src/ +WORKDIR /ext-src/h3-src +RUN mkdir build && cd build && \ cmake .. -GNinja -DBUILD_BENCHMARKS=0 -DCMAKE_BUILD_TYPE=Release \ -DBUILD_FUZZERS=0 -DBUILD_FILTERS=0 -DBUILD_GENERATORS=0 -DBUILD_TESTING=0 \ && ninja -j $(getconf _NPROCESSORS_ONLN) && \ @@ -315,11 +436,8 @@ RUN mkdir -p /h3/usr/ && \ cp -R /h3/usr / && \ rm -rf build -# not version-specific -# last release v4.1.3 - Jul 26, 2023 -RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \ - echo "5c17f09a820859ffe949f847bebf1be98511fb8f1bd86f94932512c00479e324 h3-pg.tar.gz" | sha256sum --check && \ - mkdir h3-pg-src && cd h3-pg-src && tar xzf ../h3-pg.tar.gz --strip-components=1 -C . && \ +WORKDIR /ext-src/h3-pg-src +RUN ls -l && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control && \ @@ -327,19 +445,24 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3 ######################################################################################### # -# Layer "unit-pg-build" +# Layer "postgresql-unit-build" # compile unit extension # ######################################################################################### -FROM pg-build AS unit-pg-build +FROM build-deps AS postgresql-unit-src ARG PG_VERSION # not version-specific # last release 7.9 - Sep 15, 2024 +WORKDIR /ext-src RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.9.tar.gz -O postgresql-unit.tar.gz && \ echo "e46de6245dcc8b2c2ecf29873dbd43b2b346773f31dd5ce4b8315895a052b456 postgresql-unit.tar.gz" | sha256sum --check && \ - mkdir postgresql-unit-src && cd postgresql-unit-src && tar xzf ../postgresql-unit.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir postgresql-unit-src && cd postgresql-unit-src && tar xzf ../postgresql-unit.tar.gz --strip-components=1 -C . + +FROM pg-build AS postgresql-unit-build +COPY --from=postgresql-unit-src /ext-src/ /ext-src/ +WORKDIR /ext-src/postgresql-unit-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ # unit extension's "create extension" script relies on absolute install path to fill some reference tables. # We move the extension from '/usr/local/pgsql/' to '/usr/local/' after it is build. So we need to adjust the path. @@ -350,14 +473,15 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.9.tar.gz - ######################################################################################### # -# Layer "vector-pg-build" +# Layer "pgvector-build" # compile pgvector extension # ######################################################################################### -FROM pg-build AS vector-pg-build +FROM build-deps AS pgvector-src ARG PG_VERSION -COPY compute/patches/pgvector.patch /pgvector.patch +WORKDIR /ext-src +COPY compute/patches/pgvector.patch . # By default, pgvector Makefile uses `-march=native`. 
We don't want that, # because we build the images on different machines than where we run them. @@ -370,74 +494,94 @@ RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.8.0.tar.gz -O mkdir pgvector-src && cd pgvector-src && tar xzf ../pgvector.tar.gz --strip-components=1 -C . && \ wget https://github.com/pgvector/pgvector/raw/refs/tags/v0.7.4/sql/vector.sql -O ./sql/vector--0.7.4.sql && \ echo "10218d05dc02299562252a9484775178b14a1d8edb92a2d1672ef488530f7778 ./sql/vector--0.7.4.sql" | sha256sum --check && \ - patch -p1 < /pgvector.patch && \ - make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" && \ + patch -p1 < /ext-src/pgvector.patch + +FROM pg-build AS pgvector-build +COPY --from=pgvector-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pgvector-src +RUN make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" && \ make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control ######################################################################################### # -# Layer "pgjwt-pg-build" +# Layer "pgjwt-build" # compile pgjwt extension # ######################################################################################### -FROM pg-build AS pgjwt-pg-build +FROM build-deps AS pgjwt-src ARG PG_VERSION # not version-specific # doesn't use releases, last commit f3d82fd - Mar 2, 2023 +WORKDIR /ext-src RUN wget https://github.com/michelp/pgjwt/archive/f3d82fd30151e754e19ce5d6a06c71c20689ce3d.tar.gz -O pgjwt.tar.gz && \ echo "dae8ed99eebb7593b43013f6532d772b12dfecd55548d2673f2dfd0163f6d2b9 pgjwt.tar.gz" | sha256sum --check && \ - mkdir pgjwt-src && cd pgjwt-src && tar xzf ../pgjwt.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) install && \ + mkdir pgjwt-src && cd pgjwt-src && tar xzf ../pgjwt.tar.gz --strip-components=1 -C . + +FROM pg-build AS pgjwt-build +COPY --from=pgjwt-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pgjwt-src +RUN make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgjwt.control ######################################################################################### # -# Layer "hypopg-pg-build" +# Layer "hypopg-build" # compile hypopg extension # ######################################################################################### -FROM pg-build AS hypopg-pg-build +FROM build-deps AS hypopg-src ARG PG_VERSION # HypoPG 1.4.1 supports v17 # last release 1.4.1 - Apr 28, 2024 +WORKDIR /ext-src RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.1.tar.gz -O hypopg.tar.gz && \ echo "9afe6357fd389d8d33fad81703038ce520b09275ec00153c6c89282bcdedd6bc hypopg.tar.gz" | sha256sum --check && \ - mkdir hypopg-src && cd hypopg-src && tar xzf ../hypopg.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir hypopg-src && cd hypopg-src && tar xzf ../hypopg.tar.gz --strip-components=1 -C . 
+ +FROM pg-build AS hypopg-build +COPY --from=hypopg-src /ext-src/ /ext-src/ +WORKDIR /ext-src/hypopg-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/hypopg.control ######################################################################################### # -# Layer "pg-hashids-pg-build" +# Layer "pg_hashids-build" # compile pg_hashids extension # ######################################################################################### -FROM pg-build AS pg-hashids-pg-build +FROM build-deps AS pg_hashids-src ARG PG_VERSION # not version-specific # last release v1.2.1 -Jan 12, 2018 +WORKDIR /ext-src RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz -O pg_hashids.tar.gz && \ echo "74576b992d9277c92196dd8d816baa2cc2d8046fe102f3dcd7f3c3febed6822a pg_hashids.tar.gz" | sha256sum --check && \ - mkdir pg_hashids-src && cd pg_hashids-src && tar xzf ../pg_hashids.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \ + mkdir pg_hashids-src && cd pg_hashids-src && tar xzf ../pg_hashids.tar.gz --strip-components=1 -C . + +FROM pg-build AS pg_hashids-build +COPY --from=pg_hashids-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_hashids-src +RUN make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \ make -j $(getconf _NPROCESSORS_ONLN) install USE_PGXS=1 && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_hashids.control ######################################################################################### # -# Layer "rum-pg-build" +# Layer "rum-build" # compile rum extension # ######################################################################################### -FROM pg-build AS rum-pg-build +FROM build-deps AS rum-src ARG PG_VERSION -COPY compute/patches/rum.patch /rum.patch +WORKDIR /ext-src +COPY compute/patches/rum.patch . # supports v17 since https://github.com/postgrespro/rum/commit/cb1edffc57736cd2a4455f8d0feab0d69928da25 # doesn't use releases since 1.3.13 - Sep 19, 2022 @@ -445,110 +589,140 @@ COPY compute/patches/rum.patch /rum.patch RUN wget https://github.com/postgrespro/rum/archive/cb1edffc57736cd2a4455f8d0feab0d69928da25.tar.gz -O rum.tar.gz && \ echo "65e0a752e99f4c3226400c9b899f997049e93503db8bf5c8072efa136d32fd83 rum.tar.gz" | sha256sum --check && \ mkdir rum-src && cd rum-src && tar xzf ../rum.tar.gz --strip-components=1 -C . 
&& \ - patch -p1 < /rum.patch && \ - make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \ + patch -p1 < /ext-src/rum.patch + +FROM pg-build AS rum-build +COPY --from=rum-src /ext-src/ /ext-src/ +WORKDIR /ext-src/rum-src +RUN make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \ make -j $(getconf _NPROCESSORS_ONLN) install USE_PGXS=1 && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/rum.control ######################################################################################### # -# Layer "pgtap-pg-build" +# Layer "pgtap-build" # compile pgTAP extension # ######################################################################################### -FROM pg-build AS pgtap-pg-build +FROM build-deps AS pgtap-src ARG PG_VERSION # pgtap 1.3.3 supports v17 # last release v1.3.3 - Apr 8, 2024 +WORKDIR /ext-src RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.3.3.tar.gz -O pgtap.tar.gz && \ echo "325ea79d0d2515bce96bce43f6823dcd3effbd6c54cb2a4d6c2384fffa3a14c7 pgtap.tar.gz" | sha256sum --check && \ - mkdir pgtap-src && cd pgtap-src && tar xzf ../pgtap.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir pgtap-src && cd pgtap-src && tar xzf ../pgtap.tar.gz --strip-components=1 -C . + +FROM pg-build AS pgtap-build +COPY --from=pgtap-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pgtap-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgtap.control ######################################################################################### # -# Layer "ip4r-pg-build" +# Layer "ip4r-build" # compile ip4r extension # ######################################################################################### -FROM pg-build AS ip4r-pg-build +FROM build-deps AS ip4r-src ARG PG_VERSION # not version-specific # last release v2.4.2 - Jul 29, 2023 +WORKDIR /ext-src RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O ip4r.tar.gz && \ echo "0f7b1f159974f49a47842a8ab6751aecca1ed1142b6d5e38d81b064b2ead1b4b ip4r.tar.gz" | sha256sum --check && \ - mkdir ip4r-src && cd ip4r-src && tar xzf ../ip4r.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir ip4r-src && cd ip4r-src && tar xzf ../ip4r.tar.gz --strip-components=1 -C . + +FROM pg-build AS ip4r-build +COPY --from=ip4r-src /ext-src/ /ext-src/ +WORKDIR /ext-src/ip4r-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/ip4r.control ######################################################################################### # -# Layer "prefix-pg-build" +# Layer "prefix-build" # compile Prefix extension # ######################################################################################### -FROM pg-build AS prefix-pg-build +FROM build-deps AS prefix-src ARG PG_VERSION # not version-specific # last release v1.2.10 - Jul 5, 2023 +WORKDIR /ext-src RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O prefix.tar.gz && \ echo "4342f251432a5f6fb05b8597139d3ccde8dcf87e8ca1498e7ee931ca057a8575 prefix.tar.gz" | sha256sum --check && \ - mkdir prefix-src && cd prefix-src && tar xzf ../prefix.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir prefix-src && cd prefix-src && tar xzf ../prefix.tar.gz --strip-components=1 -C . 
+ +FROM pg-build AS prefix-build +COPY --from=prefix-src /ext-src/ /ext-src/ +WORKDIR /ext-src/prefix-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/prefix.control ######################################################################################### # -# Layer "hll-pg-build" +# Layer "hll-build" # compile hll extension # ######################################################################################### -FROM pg-build AS hll-pg-build +FROM build-deps AS hll-src ARG PG_VERSION # not version-specific # last release v2.18 - Aug 29, 2023 +WORKDIR /ext-src RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar.gz -O hll.tar.gz && \ echo "e2f55a6f4c4ab95ee4f1b4a2b73280258c5136b161fe9d059559556079694f0e hll.tar.gz" | sha256sum --check && \ - mkdir hll-src && cd hll-src && tar xzf ../hll.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir hll-src && cd hll-src && tar xzf ../hll.tar.gz --strip-components=1 -C . + +FROM pg-build AS hll-build +COPY --from=hll-src /ext-src/ /ext-src/ +WORKDIR /ext-src/hll-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/hll.control ######################################################################################### # -# Layer "plpgsql-check-pg-build" +# Layer "plpgsql_check-build" # compile plpgsql_check extension # ######################################################################################### -FROM pg-build AS plpgsql-check-pg-build +FROM build-deps AS plpgsql_check-src ARG PG_VERSION # plpgsql_check v2.7.11 supports v17 # last release v2.7.11 - Sep 16, 2024 +WORKDIR /ext-src RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.7.11.tar.gz -O plpgsql_check.tar.gz && \ echo "208933f8dbe8e0d2628eb3851e9f52e6892b8e280c63700c0f1ce7883625d172 plpgsql_check.tar.gz" | sha256sum --check && \ - mkdir plpgsql_check-src && cd plpgsql_check-src && tar xzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \ + mkdir plpgsql_check-src && cd plpgsql_check-src && tar xzf ../plpgsql_check.tar.gz --strip-components=1 -C . + +FROM pg-build AS plpgsql_check-build +COPY --from=plpgsql_check-src /ext-src/ /ext-src/ +WORKDIR /ext-src/plpgsql_check-src +RUN make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \ make -j $(getconf _NPROCESSORS_ONLN) install USE_PGXS=1 && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plpgsql_check.control ######################################################################################### # -# Layer "timescaledb-pg-build" +# Layer "timescaledb-build" # compile timescaledb extension # ######################################################################################### -FROM pg-build AS timescaledb-pg-build +FROM build-deps AS timescaledb-src ARG PG_VERSION +WORKDIR /ext-src RUN case "${PG_VERSION}" in \ "v14" | "v15") \ export TIMESCALEDB_VERSION=2.10.1 \ @@ -565,8 +739,12 @@ RUN case "${PG_VERSION}" in \ esac && \ wget https://github.com/timescale/timescaledb/archive/refs/tags/${TIMESCALEDB_VERSION}.tar.gz -O timescaledb.tar.gz && \ echo "${TIMESCALEDB_CHECKSUM} timescaledb.tar.gz" | sha256sum --check && \ - mkdir timescaledb-src && cd timescaledb-src && tar xzf ../timescaledb.tar.gz --strip-components=1 -C . 
&& \ - ./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON -DCMAKE_BUILD_TYPE=Release && \ + mkdir timescaledb-src && cd timescaledb-src && tar xzf ../timescaledb.tar.gz --strip-components=1 -C . + +FROM pg-build AS timescaledb-build +COPY --from=timescaledb-src /ext-src/ /ext-src/ +WORKDIR /ext-src/timescaledb-src +RUN ./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON -DCMAKE_BUILD_TYPE=Release && \ cd build && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make install -j $(getconf _NPROCESSORS_ONLN) && \ @@ -574,14 +752,15 @@ RUN case "${PG_VERSION}" in \ ######################################################################################### # -# Layer "pg-hint-plan-pg-build" +# Layer "pg_hint_plan-build" # compile pg_hint_plan extension # ######################################################################################### -FROM pg-build AS pg-hint-plan-pg-build +FROM build-deps AS pg_hint_plan-src ARG PG_VERSION # version-specific, has separate releases for each version +WORKDIR /ext-src RUN case "${PG_VERSION}" in \ "v14") \ export PG_HINT_PLAN_VERSION=14_1_4_1 \ @@ -605,50 +784,51 @@ RUN case "${PG_VERSION}" in \ esac && \ wget https://github.com/ossc-db/pg_hint_plan/archive/refs/tags/REL${PG_HINT_PLAN_VERSION}.tar.gz -O pg_hint_plan.tar.gz && \ echo "${PG_HINT_PLAN_CHECKSUM} pg_hint_plan.tar.gz" | sha256sum --check && \ - mkdir pg_hint_plan-src && cd pg_hint_plan-src && tar xzf ../pg_hint_plan.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir pg_hint_plan-src && cd pg_hint_plan-src && tar xzf ../pg_hint_plan.tar.gz --strip-components=1 -C . + +FROM pg-build AS pg_hint_plan-build +COPY --from=pg_hint_plan-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_hint_plan-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make install -j $(getconf _NPROCESSORS_ONLN) && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_hint_plan.control ######################################################################################### # -# Layer "pg-cron-pg-build" +# Layer "pg_cron-build" # compile pg_cron extension # ######################################################################################### -FROM pg-build AS pg-cron-pg-build +FROM build-deps AS pg_cron-src ARG PG_VERSION # This is an experimental extension that we do not support on prod yet. # !Do not remove! # We set it in shared_preload_libraries and computes will fail to start if library is not found. +WORKDIR /ext-src +COPY compute/patches/pg_cron.patch . RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.4.tar.gz -O pg_cron.tar.gz && \ echo "52d1850ee7beb85a4cb7185731ef4e5a90d1de216709d8988324b0d02e76af61 pg_cron.tar.gz" | sha256sum --check && \ mkdir pg_cron-src && cd pg_cron-src && tar xzf ../pg_cron.tar.gz --strip-components=1 -C . 
&& \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + patch < /ext-src/pg_cron.patch + +FROM pg-build AS pg_cron-build +COPY --from=pg_cron-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_cron-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_cron.control ######################################################################################### # -# Layer "rdkit-pg-build" +# Layer "rdkit-build" # compile rdkit extension # ######################################################################################### -FROM pg-build AS rdkit-pg-build +FROM build-deps AS rdkit-src ARG PG_VERSION -RUN apt update && \ - apt install --no-install-recommends --no-install-suggests -y \ - libboost-iostreams1.74-dev \ - libboost-regex1.74-dev \ - libboost-serialization1.74-dev \ - libboost-system1.74-dev \ - libeigen3-dev \ - libboost-all-dev \ - && apt clean && rm -rf /var/lib/apt/lists/* - # rdkit Release_2024_09_1 supports v17 # last release Release_2024_09_1 - Sep 27, 2024 # @@ -656,12 +836,7 @@ RUN apt update && \ # because Release_2024_09_1 has some backward incompatible changes # https://github.com/rdkit/rdkit/releases/tag/Release_2024_09_1 -# XXX: /usr/local/pgsql/bin is already in PATH, and that should be enough to find -# pg_config. For some reason the rdkit cmake script doesn't work with just that, -# however. By also adding /usr/local/pgsql, it works, which is weird because there -# are no executables in that directory. -ENV PATH="/usr/local/pgsql:$PATH" - +WORKDIR /ext-src RUN case "${PG_VERSION}" in \ "v17") \ export RDKIT_VERSION=Release_2024_09_1 \ @@ -677,8 +852,28 @@ RUN case "${PG_VERSION}" in \ esac && \ wget https://github.com/rdkit/rdkit/archive/refs/tags/${RDKIT_VERSION}.tar.gz -O rdkit.tar.gz && \ echo "${RDKIT_CHECKSUM} rdkit.tar.gz" | sha256sum --check && \ - mkdir rdkit-src && cd rdkit-src && tar xzf ../rdkit.tar.gz --strip-components=1 -C . && \ - cmake \ + mkdir rdkit-src && cd rdkit-src && tar xzf ../rdkit.tar.gz --strip-components=1 -C . + +FROM pg-build AS rdkit-build +RUN apt update && \ + apt install --no-install-recommends --no-install-suggests -y \ + libboost-iostreams1.74-dev \ + libboost-regex1.74-dev \ + libboost-serialization1.74-dev \ + libboost-system1.74-dev \ + libeigen3-dev \ + libboost-all-dev \ + && apt clean && rm -rf /var/lib/apt/lists/* + +COPY --from=rdkit-src /ext-src/ /ext-src/ +WORKDIR /ext-src/rdkit-src + +# XXX: /usr/local/pgsql/bin is already in PATH, and that should be enough to find +# pg_config. For some reason the rdkit cmake script doesn't work with just that, +# however. By also adding /usr/local/pgsql, it works, which is weird because there +# are no executables in that directory. 
+ENV PATH="/usr/local/pgsql:$PATH" +RUN cmake \ -D RDK_BUILD_CAIRO_SUPPORT=OFF \ -D RDK_BUILD_INCHI_SUPPORT=ON \ -D RDK_BUILD_AVALON_SUPPORT=ON \ @@ -710,47 +905,57 @@ RUN case "${PG_VERSION}" in \ ######################################################################################### # -# Layer "pg-uuidv7-pg-build" +# Layer "pg_uuidv7-build" # compile pg_uuidv7 extension # ######################################################################################### -FROM pg-build AS pg-uuidv7-pg-build +FROM build-deps AS pg_uuidv7-src ARG PG_VERSION # not version-specific # last release v1.6.0 - Oct 9, 2024 +WORKDIR /ext-src RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.6.0.tar.gz -O pg_uuidv7.tar.gz && \ echo "0fa6c710929d003f6ce276a7de7a864e9d1667b2d78be3dc2c07f2409eb55867 pg_uuidv7.tar.gz" | sha256sum --check && \ - mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xzf ../pg_uuidv7.tar.gz --strip-components=1 -C . + +FROM pg-build AS pg_uuidv7-build +COPY --from=pg_uuidv7-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_uuidv7-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_uuidv7.control ######################################################################################### # -# Layer "pg-roaringbitmap-pg-build" +# Layer "pg_roaringbitmap-build" # compile pg_roaringbitmap extension # ######################################################################################### -FROM pg-build AS pg-roaringbitmap-pg-build +FROM build-deps AS pg_roaringbitmap-src ARG PG_VERSION # not version-specific # last release v0.5.4 - Jun 28, 2022 +WORKDIR /ext-src RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \ echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \ - mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . 
+ +FROM pg-build AS pg_roaringbitmap-build +COPY --from=pg_roaringbitmap-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_roaringbitmap-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/roaringbitmap.control ######################################################################################### # -# Layer "pg-semver-pg-build" +# Layer "pg_semver-build" # compile pg_semver extension # ######################################################################################### -FROM pg-build AS pg-semver-pg-build +FROM build-deps AS pg_semver-src ARG PG_VERSION # Release 0.40.0 breaks backward compatibility with previous versions @@ -758,6 +963,7 @@ ARG PG_VERSION # Use new version only for v17 # # last release v0.40.0 - Jul 22, 2024 +WORKDIR /ext-src RUN case "${PG_VERSION}" in \ "v17") \ export SEMVER_VERSION=0.40.0 \ @@ -773,22 +979,27 @@ RUN case "${PG_VERSION}" in \ esac && \ wget https://github.com/theory/pg-semver/archive/refs/tags/v${SEMVER_VERSION}.tar.gz -O pg_semver.tar.gz && \ echo "${SEMVER_CHECKSUM} pg_semver.tar.gz" | sha256sum --check && \ - mkdir pg_semver-src && cd pg_semver-src && tar xzf ../pg_semver.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir pg_semver-src && cd pg_semver-src && tar xzf ../pg_semver.tar.gz --strip-components=1 -C . + +FROM pg-build AS pg_semver-build +COPY --from=pg_semver-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_semver-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/semver.control ######################################################################################### # -# Layer "pg-embedding-pg-build" +# Layer "pg_embedding-build" # compile pg_embedding extension # ######################################################################################### -FROM pg-build AS pg-embedding-pg-build +FROM build-deps AS pg_embedding-src +ARG PG_VERSION # This is our extension, support stopped in favor of pgvector # TODO: deprecate it -ARG PG_VERSION +WORKDIR /ext-src RUN case "${PG_VERSION}" in \ "v14" | "v15") \ export PG_EMBEDDING_VERSION=0.3.5 \ @@ -799,29 +1010,44 @@ RUN case "${PG_VERSION}" in \ esac && \ wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/${PG_EMBEDDING_VERSION}.tar.gz -O pg_embedding.tar.gz && \ echo "${PG_EMBEDDING_CHECKSUM} pg_embedding.tar.gz" | sha256sum --check && \ - mkdir pg_embedding-src && cd pg_embedding-src && tar xzf ../pg_embedding.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ - make -j $(getconf _NPROCESSORS_ONLN) install + mkdir pg_embedding-src && cd pg_embedding-src && tar xzf ../pg_embedding.tar.gz --strip-components=1 -C . + +FROM pg-build AS pg_embedding-build +COPY --from=pg_embedding-src /ext-src/ /ext-src/ +WORKDIR /ext-src/ +RUN if [ -d pg_embedding-src ]; then \ + cd pg_embedding-src && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install; \ + fi ######################################################################################### # -# Layer "pg-anon-pg-build" +# Layer "pg_anon-build" # compile anon extension # ######################################################################################### -FROM pg-build AS pg-anon-pg-build +FROM build-deps AS pg_anon-src ARG PG_VERSION # This is an experimental extension, never got to real production. 
# !Do not remove! It can be present in shared_preload_libraries and compute will fail to start if library is not found. +WORKDIR /ext-src RUN case "${PG_VERSION}" in "v17") \ echo "postgresql_anonymizer does not yet support PG17" && exit 0;; \ esac && \ wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \ echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9 pg_anon.tar.gz" | sha256sum --check && \ - mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) install && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control + mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . + +FROM pg-build AS pg_anon-build +COPY --from=pg_anon-src /ext-src/ /ext-src/ +WORKDIR /ext-src +RUN if [ -d pg_anon-src ]; then \ + cd pg_anon-src && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control; \ + fi ######################################################################################### # @@ -887,50 +1113,57 @@ USER root ######################################################################################### # -# Layers "pg-onnx-build" and "pgrag-pg-build" +# Layers "pg-onnx-build" and "pgrag-build" # Compile "pgrag" extensions # ######################################################################################### -FROM rust-extensions-build-pgrx12 AS pg-onnx-build +FROM build-deps AS pgrag-src +ARG PG_VERSION +WORKDIR /ext-src +RUN wget https://github.com/microsoft/onnxruntime/archive/refs/tags/v1.18.1.tar.gz -O onnxruntime.tar.gz && \ + mkdir onnxruntime-src && cd onnxruntime-src && tar xzf ../onnxruntime.tar.gz --strip-components=1 -C . && \ + echo "#nothing to test here" > neon-test.sh + +RUN wget https://github.com/neondatabase-labs/pgrag/archive/refs/tags/v0.0.0.tar.gz -O pgrag.tar.gz && \ + echo "2cbe394c1e74fc8bcad9b52d5fbbfb783aef834ca3ce44626cfd770573700bb4 pgrag.tar.gz" | sha256sum --check && \ + mkdir pgrag-src && cd pgrag-src && tar xzf ../pgrag.tar.gz --strip-components=1 -C . + +FROM rust-extensions-build-pgrx12 AS pgrag-build +COPY --from=pgrag-src /ext-src/ /ext-src/ + +# Install build-time dependencies # cmake 3.26 or higher is required, so installing it using pip (bullseye-backports has cmake 3.25). # Install it using virtual environment, because Python 3.11 (the default version on Debian 12 (Bookworm)) complains otherwise +WORKDIR /ext-src/onnxruntime-src RUN apt update && apt install --no-install-recommends --no-install-suggests -y \ - python3 python3-pip python3-venv && \ + python3 python3-pip python3-venv protobuf-compiler && \ apt clean && rm -rf /var/lib/apt/lists/* && \ python3 -m venv venv && \ . venv/bin/activate && \ - python3 -m pip install cmake==3.30.5 && \ - wget https://github.com/microsoft/onnxruntime/archive/refs/tags/v1.18.1.tar.gz -O onnxruntime.tar.gz && \ - mkdir onnxruntime-src && cd onnxruntime-src && tar xzf ../onnxruntime.tar.gz --strip-components=1 -C . && \ + python3 -m pip install cmake==3.30.5 + +RUN . 
venv/bin/activate && \ ./build.sh --config Release --parallel --cmake_generator Ninja \ --skip_submodule_sync --skip_tests --allow_running_as_root - -FROM pg-onnx-build AS pgrag-pg-build - -RUN apt update && apt install --no-install-recommends --no-install-suggests -y protobuf-compiler \ - && apt clean && rm -rf /var/lib/apt/lists/* && \ - wget https://github.com/neondatabase-labs/pgrag/archive/refs/tags/v0.0.0.tar.gz -O pgrag.tar.gz && \ - echo "2cbe394c1e74fc8bcad9b52d5fbbfb783aef834ca3ce44626cfd770573700bb4 pgrag.tar.gz" | sha256sum --check && \ - mkdir pgrag-src && cd pgrag-src && tar xzf ../pgrag.tar.gz --strip-components=1 -C . && \ - \ - cd exts/rag && \ +WORKDIR /ext-src/pgrag-src +RUN cd exts/rag && \ sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release && \ - echo "trusted = true" >> /usr/local/pgsql/share/extension/rag.control && \ - \ - cd ../rag_bge_small_en_v15 && \ + echo "trusted = true" >> /usr/local/pgsql/share/extension/rag.control + +RUN cd exts/rag_bge_small_en_v15 && \ sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ - ORT_LIB_LOCATION=/home/nonroot/onnxruntime-src/build/Linux \ + ORT_LIB_LOCATION=/ext-src/onnxruntime-src/build/Linux \ REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/bge_small_en_v15.onnx \ cargo pgrx install --release --features remote_onnx && \ - echo "trusted = true" >> /usr/local/pgsql/share/extension/rag_bge_small_en_v15.control && \ - \ - cd ../rag_jina_reranker_v1_tiny_en && \ + echo "trusted = true" >> /usr/local/pgsql/share/extension/rag_bge_small_en_v15.control + +RUN cd exts/rag_jina_reranker_v1_tiny_en && \ sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ - ORT_LIB_LOCATION=/home/nonroot/onnxruntime-src/build/Linux \ + ORT_LIB_LOCATION=/ext-src/onnxruntime-src/build/Linux \ REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/jina_reranker_v1_tiny_en.onnx \ cargo pgrx install --release --features remote_onnx && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/rag_jina_reranker_v1_tiny_en.control @@ -938,17 +1171,23 @@ RUN apt update && apt install --no-install-recommends --no-install-suggests -y p ######################################################################################### # -# Layer "pg-jsonschema-pg-build" +# Layer "pg_jsonschema-build" # Compile "pg_jsonschema" extension # ######################################################################################### -FROM rust-extensions-build-pgrx12 AS pg-jsonschema-pg-build +FROM build-deps AS pg_jsonschema-src ARG PG_VERSION # last release v0.3.3 - Oct 16, 2024 +WORKDIR /ext-src RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.3.tar.gz -O pg_jsonschema.tar.gz && \ echo "40c2cffab4187e0233cb8c3bde013be92218c282f95f4469c5282f6b30d64eac pg_jsonschema.tar.gz" | sha256sum --check && \ - mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \ + mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xzf ../pg_jsonschema.tar.gz --strip-components=1 -C . + +FROM rust-extensions-build-pgrx12 AS pg_jsonschema-build +COPY --from=pg_jsonschema-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_jsonschema-src +RUN \ # see commit 252b3685a27a0f4c31a0f91e983c6314838e89e8 # `unsafe-postgres` feature allows to build pgx extensions # against postgres forks that decided to change their ABI name (like us). 
@@ -961,55 +1200,69 @@ RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.3.tar. ######################################################################################### # -# Layer "pg-graphql-pg-build" +# Layer "pg_graphql-build" # Compile "pg_graphql" extension # ######################################################################################### -FROM rust-extensions-build-pgrx12 AS pg-graphql-pg-build +FROM build-deps AS pg_graphql-src ARG PG_VERSION # last release v1.5.9 - Oct 16, 2024 +WORKDIR /ext-src +COPY compute/patches/pg_graphql.patch . RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.5.9.tar.gz -O pg_graphql.tar.gz && \ echo "cf768385a41278be1333472204fc0328118644ae443182cf52f7b9b23277e497 pg_graphql.tar.gz" | sha256sum --check && \ mkdir pg_graphql-src && cd pg_graphql-src && tar xzf ../pg_graphql.tar.gz --strip-components=1 -C . && \ - sed -i 's/pgrx = "=0.12.6"/pgrx = { version = "=0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx = "=0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ sed -i 's/pgrx-tests = "=0.12.6"/pgrx-tests = "=0.12.9"/g' Cargo.toml && \ - cargo pgrx install --release && \ + patch -p1 < /ext-src/pg_graphql.patch + + +FROM rust-extensions-build-pgrx12 AS pg_graphql-build +COPY --from=pg_graphql-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_graphql-src +RUN cargo pgrx install --release && \ # it's needed to enable extension because it uses untrusted C language sed -i 's/superuser = false/superuser = true/g' /usr/local/pgsql/share/extension/pg_graphql.control && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_graphql.control ######################################################################################### # -# Layer "pg-tiktoken-build" +# Layer "pg_tiktoken-build" # Compile "pg_tiktoken" extension # ######################################################################################### -FROM rust-extensions-build-pgrx12 AS pg-tiktoken-pg-build +FROM build-deps AS pg_tiktoken-src ARG PG_VERSION # doesn't use releases # 9118dd4549b7d8c0bbc98e04322499f7bf2fa6f7 - on Oct 29, 2024 +WORKDIR /ext-src RUN wget https://github.com/kelvich/pg_tiktoken/archive/9118dd4549b7d8c0bbc98e04322499f7bf2fa6f7.tar.gz -O pg_tiktoken.tar.gz && \ echo "a5bc447e7920ee149d3c064b8b9f0086c0e83939499753178f7d35788416f628 pg_tiktoken.tar.gz" | sha256sum --check && \ mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xzf ../pg_tiktoken.tar.gz --strip-components=1 -C . 
&& \ sed -i 's/pgrx = { version = "=0.12.6",/pgrx = { version = "0.12.9",/g' Cargo.toml && \ - sed -i 's/pgrx-tests = "=0.12.6"/pgrx-tests = "0.12.9"/g' Cargo.toml && \ - cargo pgrx install --release && \ + sed -i 's/pgrx-tests = "=0.12.6"/pgrx-tests = "0.12.9"/g' Cargo.toml + +FROM rust-extensions-build-pgrx12 AS pg_tiktoken-build +COPY --from=pg_tiktoken-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_tiktoken-src +RUN cargo pgrx install --release && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_tiktoken.control ######################################################################################### # -# Layer "pg-pgx-ulid-build" +# Layer "pgx_ulid-build" # Compile "pgx_ulid" extension for v16 and below # ######################################################################################### -FROM rust-extensions-build AS pg-pgx-ulid-build +FROM build-deps AS pgx_ulid-src ARG PG_VERSION +WORKDIR /ext-src RUN case "${PG_VERSION}" in \ "v14" | "v15" | "v16") \ ;; \ @@ -1020,20 +1273,28 @@ RUN case "${PG_VERSION}" in \ wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.5.tar.gz -O pgx_ulid.tar.gz && \ echo "9d1659a2da65af0133d5451c454de31b37364e3502087dadf579f790bc8bef17 pgx_ulid.tar.gz" | sha256sum --check && \ mkdir pgx_ulid-src && cd pgx_ulid-src && tar xzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \ - sed -i 's/pgrx = "^0.11.2"/pgrx = { version = "0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ - cargo pgrx install --release && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/ulid.control + sed -i 's/pgrx = "^0.11.2"/pgrx = { version = "=0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml + +FROM rust-extensions-build AS pgx_ulid-build +COPY --from=pgx_ulid-src /ext-src/ /ext-src/ +WORKDIR /ext-src/ +RUN if [ -d pgx_ulid-src ]; then \ + cd pgx_ulid-src && \ + cargo pgrx install --release && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/ulid.control; \ + fi ######################################################################################### # -# Layer "pg-pgx-ulid-pgrx12-build" +# Layer "pgx_ulid-pgrx12-build" # Compile "pgx_ulid" extension for v17 and up # ######################################################################################### -FROM rust-extensions-build-pgrx12 AS pg-pgx-ulid-pgrx12-build +FROM build-deps AS pgx_ulid-pgrx12-src ARG PG_VERSION +WORKDIR /ext-src RUN case "${PG_VERSION}" in \ "v17") \ ;; \ @@ -1044,23 +1305,32 @@ RUN case "${PG_VERSION}" in \ wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.2.0.tar.gz -O pgx_ulid.tar.gz && \ echo "cef6a9a2e5e7bd1a10a18989286586ee9e6c1c06005a4055cff190de41bf3e9f pgx_ulid.tar.gz" | sha256sum --check && \ mkdir pgx_ulid-src && cd pgx_ulid-src && tar xzf ../pgx_ulid.tar.gz --strip-components=1 -C . 
&& \ - sed -i 's/pgrx = "^0.12.7"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ - cargo pgrx install --release && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgx_ulid.control + sed -i 's/pgrx = "^0.12.7"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml + +FROM rust-extensions-build-pgrx12 AS pgx_ulid-pgrx12-build +ARG PG_VERSION +WORKDIR /ext-src +COPY --from=pgx_ulid-pgrx12-src /ext-src/ /ext-src/ +RUN if [ -d pgx_ulid-src ]; then \ + cd pgx_ulid-src && \ + cargo pgrx install --release && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgx_ulid.control; \ + fi ######################################################################################### # -# Layer "pg-session-jwt-build" +# Layer "pg_session_jwt-build" # Compile "pg_session_jwt" extension # ######################################################################################### -FROM rust-extensions-build-pgrx12 AS pg-session-jwt-build +FROM build-deps AS pg_session_jwt-src ARG PG_VERSION # NOTE: local_proxy depends on the version of pg_session_jwt # Do not update without approve from proxy team # Make sure the version is reflected in proxy/src/serverless/local_conn_pool.rs +WORKDIR /ext-src RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.2.0.tar.gz -O pg_session_jwt.tar.gz && \ echo "5ace028e591f2e000ca10afa5b1ca62203ebff014c2907c0ec3b29c36f28a1bb pg_session_jwt.tar.gz" | sha256sum --check && \ mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . && \ @@ -1068,8 +1338,12 @@ RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.2.0 sed -i 's/version = "0.12.6"/version = "0.12.9"/g' pgrx-tests/Cargo.toml && \ sed -i 's/pgrx = "=0.12.6"/pgrx = { version = "=0.12.9", features = [ "unsafe-postgres" ] }/g' pgrx-tests/Cargo.toml && \ sed -i 's/pgrx-macros = "=0.12.6"/pgrx-macros = "=0.12.9"/g' pgrx-tests/Cargo.toml && \ - sed -i 's/pgrx-pg-config = "=0.12.6"/pgrx-pg-config = "=0.12.9"/g' pgrx-tests/Cargo.toml && \ - cargo pgrx install --release + sed -i 's/pgrx-pg-config = "=0.12.6"/pgrx-pg-config = "=0.12.9"/g' pgrx-tests/Cargo.toml + +FROM rust-extensions-build-pgrx12 AS pg_session_jwt-build +COPY --from=pg_session_jwt-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_session_jwt-src +RUN cargo pgrx install --release ######################################################################################### # @@ -1078,15 +1352,20 @@ RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.2.0 # ######################################################################################### -FROM pg-build AS wal2json-pg-build +FROM build-deps AS wal2json-src ARG PG_VERSION # wal2json wal2json_2_6 supports v17 # last release wal2json_2_6 - Apr 25, 2024 +WORKDIR /ext-src RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_6.tar.gz -O wal2json.tar.gz && \ echo "18b4bdec28c74a8fc98a11c72de38378a760327ef8e5e42e975b0029eb96ba0d wal2json.tar.gz" | sha256sum --check && \ - mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json.tar.gz --strip-components=1 -C . 
+ +FROM pg-build AS wal2json-build +COPY --from=wal2json-src /ext-src/ /ext-src/ +WORKDIR /ext-src/wal2json-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install ######################################################################################### @@ -1095,15 +1374,20 @@ RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_6.tar. # compile pg_ivm extension # ######################################################################################### -FROM pg-build AS pg-ivm-build +FROM build-deps AS pg_ivm-src ARG PG_VERSION # pg_ivm v1.9 supports v17 # last release v1.9 - Jul 31 +WORKDIR /ext-src RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.9.tar.gz -O pg_ivm.tar.gz && \ echo "59e15722939f274650abf637f315dd723c87073496ca77236b044cb205270d8b pg_ivm.tar.gz" | sha256sum --check && \ - mkdir pg_ivm-src && cd pg_ivm-src && tar xzf ../pg_ivm.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir pg_ivm-src && cd pg_ivm-src && tar xzf ../pg_ivm.tar.gz --strip-components=1 -C . + +FROM pg-build AS pg_ivm-build +COPY --from=pg_ivm-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_ivm-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_ivm.control @@ -1113,15 +1397,20 @@ RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.9.tar.gz -O pg_iv # compile pg_partman extension # ######################################################################################### -FROM pg-build AS pg-partman-build +FROM build-deps AS pg_partman-src ARG PG_VERSION # should support v17 https://github.com/pgpartman/pg_partman/discussions/693 # last release 5.1.0 Apr 2, 2024 +WORKDIR /ext-src RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.1.0.tar.gz -O pg_partman.tar.gz && \ echo "3e3a27d7ff827295d5c55ef72f07a49062d6204b3cb0b9a048645d6db9f3cb9f pg_partman.tar.gz" | sha256sum --check && \ - mkdir pg_partman-src && cd pg_partman-src && tar xzf ../pg_partman.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir pg_partman-src && cd pg_partman-src && tar xzf ../pg_partman.tar.gz --strip-components=1 -C . + +FROM pg-build AS pg_partman-build +COPY --from=pg_partman-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_partman-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_partman.control @@ -1131,13 +1420,19 @@ RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.1.0.tar.gz # compile pg_mooncake extension # ######################################################################################### -FROM rust-extensions-build AS pg-mooncake-build +FROM build-deps AS pg_mooncake-src ARG PG_VERSION - +WORKDIR /ext-src RUN wget https://github.com/Mooncake-Labs/pg_mooncake/releases/download/v0.1.1/pg_mooncake-0.1.1.tar.gz -O pg_mooncake.tar.gz && \ echo "a2d16eff7948dde64f072609ca5d2962d6b4d07cb89d45952add473529c55f55 pg_mooncake.tar.gz" | sha256sum --check && \ mkdir pg_mooncake-src && cd pg_mooncake-src && tar xzf ../pg_mooncake.tar.gz --strip-components=1 -C . 
&& \ - make release -j $(getconf _NPROCESSORS_ONLN) && \ + echo "make -f pg_mooncake-src/Makefile.build installcheck TEST_DIR=./test SQL_DIR=./sql SRC_DIR=./src" > neon-test.sh && \ + chmod a+x neon-test.sh + +FROM rust-extensions-build AS pg_mooncake-build +COPY --from=pg_mooncake-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_mooncake-src +RUN make release -j $(getconf _NPROCESSORS_ONLN) && \ make install -j $(getconf _NPROCESSORS_ONLN) && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_mooncake.control @@ -1148,80 +1443,92 @@ RUN wget https://github.com/Mooncake-Labs/pg_mooncake/releases/download/v0.1.1/p # ######################################################################################### -FROM pg-build AS pg-repack-build +FROM build-deps AS pg_repack-src ARG PG_VERSION - +WORKDIR /ext-src RUN wget https://github.com/reorg/pg_repack/archive/refs/tags/ver_1.5.2.tar.gz -O pg_repack.tar.gz && \ echo '4516cad42251ed3ad53ff619733004db47d5755acac83f75924cd94d1c4fb681 pg_repack.tar.gz' | sha256sum --check && \ - mkdir pg_repack-src && cd pg_repack-src && tar xzf ../pg_repack.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir pg_repack-src && cd pg_repack-src && tar xzf ../pg_repack.tar.gz --strip-components=1 -C . + +FROM rust-extensions-build AS pg_repack-build +COPY --from=pg_repack-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_repack-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install ######################################################################################### # -# Layer "neon-pg-ext-build" +# Layer "neon-ext-build" # compile neon extensions # ######################################################################################### -FROM build-deps AS neon-pg-ext-build +FROM pg-build AS neon-ext-build +ARG PG_VERSION + +COPY pgxn/ pgxn/ +RUN make -j $(getconf _NPROCESSORS_ONLN) \ + -C pgxn/neon \ + -s install && \ + make -j $(getconf _NPROCESSORS_ONLN) \ + -C pgxn/neon_utils \ + -s install && \ + make -j $(getconf _NPROCESSORS_ONLN) \ + -C pgxn/neon_test_utils \ + -s install && \ + make -j $(getconf _NPROCESSORS_ONLN) \ + -C pgxn/neon_rmgr \ + -s install + +######################################################################################### +# +# Layer "all-extensions" +# Bundle together all the extensions +# +######################################################################################### +FROM build-deps AS all-extensions ARG PG_VERSION # Public extensions COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=postgis-build /sfcgal/* / +COPY --from=pgrouting-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=h3-pg-build /h3/usr / -COPY --from=unit-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=vector-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pgjwt-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pgrag-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-jsonschema-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-graphql-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-tiktoken-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=hypopg-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-hashids-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=rum-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pgtap-pg-build 
/usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=ip4r-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=prefix-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=hll-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=plpgsql-check-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=timescaledb-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-hint-plan-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-cron-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-pgx-ulid-pgrx12-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-session-jwt-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-uuidv7-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-roaringbitmap-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-semver-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-embedding-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=wal2json-pg-build /usr/local/pgsql /usr/local/pgsql -COPY --from=pg-anon-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-ivm-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-partman-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-mooncake-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-repack-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY pgxn/ pgxn/ +COPY --from=postgresql-unit-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pgvector-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pgjwt-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pgrag-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_jsonschema-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_graphql-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_tiktoken-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=hypopg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_hashids-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=rum-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pgtap-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=ip4r-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=prefix-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=hll-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=plpgsql_check-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=timescaledb-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_hint_plan-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_cron-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pgx_ulid-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pgx_ulid-pgrx12-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_session_jwt-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=rdkit-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_uuidv7-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_roaringbitmap-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_semver-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_embedding-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=wal2json-build /usr/local/pgsql /usr/local/pgsql +COPY --from=pg_anon-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_ivm-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_partman-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_mooncake-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_repack-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN 
make -j $(getconf _NPROCESSORS_ONLN) \ - PG_CONFIG=/usr/local/pgsql/bin/pg_config \ - -C pgxn/neon \ - -s install && \ - make -j $(getconf _NPROCESSORS_ONLN) \ - PG_CONFIG=/usr/local/pgsql/bin/pg_config \ - -C pgxn/neon_utils \ - -s install && \ - make -j $(getconf _NPROCESSORS_ONLN) \ - PG_CONFIG=/usr/local/pgsql/bin/pg_config \ - -C pgxn/neon_test_utils \ - -s install && \ - make -j $(getconf _NPROCESSORS_ONLN) \ - PG_CONFIG=/usr/local/pgsql/bin/pg_config \ - -C pgxn/neon_rmgr \ - -s install +COPY --from=neon-ext-build /usr/local/pgsql/ /usr/local/pgsql/ ######################################################################################### # @@ -1302,8 +1609,8 @@ RUN echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 30 # Clean up postgres folder before inclusion # ######################################################################################### -FROM neon-pg-ext-build AS postgres-cleanup-layer -COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql +FROM neon-ext-build AS postgres-cleanup-layer +COPY --from=all-extensions /usr/local/pgsql /usr/local/pgsql # Remove binaries from /bin/ that we won't use (or would manually copy & install otherwise) RUN cd /usr/local/pgsql/bin && rm -f ecpg raster2pgsql shp2pgsql pgtopo_export pgtopo_import pgsql2shp @@ -1332,66 +1639,59 @@ RUN make PG_VERSION="${PG_VERSION}" -C compute ######################################################################################### # -# Layer neon-pg-ext-test +# Layer extension-tests # ######################################################################################### -FROM neon-pg-ext-build AS neon-pg-ext-test +FROM pg-build AS extension-tests ARG PG_VERSION RUN mkdir /ext-src COPY --from=pg-build /postgres /postgres -#COPY --from=postgis-build /postgis.tar.gz /ext-src/ -#COPY --from=postgis-build /sfcgal/* /usr -COPY --from=plv8-build /plv8.tar.gz /ext-src/ -#COPY --from=h3-pg-build /h3-pg.tar.gz /ext-src/ -COPY --from=unit-pg-build /postgresql-unit.tar.gz /ext-src/ -COPY --from=vector-pg-build /pgvector.tar.gz /ext-src/ -COPY --from=vector-pg-build /pgvector.patch /ext-src/ -COPY --from=pgjwt-pg-build /pgjwt.tar.gz /ext-src -#COPY --from=pgrag-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -#COPY --from=pg-jsonschema-pg-build /home/nonroot/pg_jsonschema.tar.gz /ext-src -COPY --from=pg-graphql-pg-build /home/nonroot/pg_graphql.tar.gz /ext-src -COPY compute/patches/pg_graphql.patch /ext-src -#COPY --from=pg-tiktoken-pg-build /home/nonroot/pg_tiktoken.tar.gz /ext-src -COPY --from=hypopg-pg-build /hypopg.tar.gz /ext-src -COPY --from=pg-hashids-pg-build /pg_hashids.tar.gz /ext-src -COPY --from=rum-pg-build /rum.tar.gz /ext-src -COPY compute/patches/rum.patch /ext-src -#COPY --from=pgtap-pg-build /pgtap.tar.gz /ext-src -COPY --from=ip4r-pg-build /ip4r.tar.gz /ext-src -COPY --from=prefix-pg-build /prefix.tar.gz /ext-src -COPY --from=hll-pg-build /hll.tar.gz /ext-src -COPY --from=plpgsql-check-pg-build /plpgsql_check.tar.gz /ext-src -#COPY --from=timescaledb-pg-build /timescaledb.tar.gz /ext-src -COPY --from=pg-hint-plan-pg-build /pg_hint_plan.tar.gz /ext-src +#COPY --from=postgis-src /ext-src/ /ext-src/ +COPY --from=plv8-src /ext-src/ /ext-src/ +#COPY --from=h3-pg-src /ext-src/ /ext-src/ +COPY --from=postgresql-unit-src /ext-src/ /ext-src/ +COPY --from=pgvector-src /ext-src/ /ext-src/ +COPY --from=pgjwt-src /ext-src/ /ext-src/ +#COPY --from=pgrag-src /ext-src/ /ext-src/ +#COPY --from=pg_jsonschema-src /ext-src/ /ext-src/ +COPY --from=pg_graphql-src /ext-src/ /ext-src/ 
+#COPY --from=pg_tiktoken-src /ext-src/ /ext-src/ +COPY --from=hypopg-src /ext-src/ /ext-src/ +COPY --from=pg_hashids-src /ext-src/ /ext-src/ +COPY --from=rum-src /ext-src/ /ext-src/ +#COPY --from=pgtap-src /ext-src/ /ext-src/ +COPY --from=ip4r-src /ext-src/ /ext-src/ +COPY --from=prefix-src /ext-src/ /ext-src/ +COPY --from=hll-src /ext-src/ /ext-src/ +COPY --from=plpgsql_check-src /ext-src/ /ext-src/ +#COPY --from=timescaledb-src /ext-src/ /ext-src/ +COPY --from=pg_hint_plan-src /ext-src/ /ext-src/ COPY compute/patches/pg_hint_plan_${PG_VERSION}.patch /ext-src -COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src -COPY compute/patches/pg_cron.patch /ext-src -#COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src -#COPY --from=rdkit-pg-build /rdkit.tar.gz /ext-src -COPY --from=pg-uuidv7-pg-build /pg_uuidv7.tar.gz /ext-src -COPY --from=pg-roaringbitmap-pg-build /pg_roaringbitmap.tar.gz /ext-src -COPY --from=pg-semver-pg-build /pg_semver.tar.gz /ext-src -#COPY --from=pg-embedding-pg-build /home/nonroot/pg_embedding-src/ /ext-src -#COPY --from=wal2json-pg-build /wal2json_2_5.tar.gz /ext-src -COPY --from=pg-ivm-build /pg_ivm.tar.gz /ext-src -COPY --from=pg-partman-build /pg_partman.tar.gz /ext-src -RUN cd /ext-src/ && for f in *.tar.gz; \ - do echo $f; dname=$(echo $f | sed 's/\.tar.*//')-src; \ - rm -rf $dname; mkdir $dname; tar xzf $f --strip-components=1 -C $dname \ - || exit 1; rm -f $f; done -RUN cd /ext-src/rum-src && patch -p1 <../rum.patch -RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch RUN cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan_${PG_VERSION}.patch +COPY --from=pg_cron-src /ext-src/ /ext-src/ +#COPY --from=pgx_ulid-src /ext-src/ /ext-src/ +#COPY --from=pgx_ulid-pgrx12-src /ext-src/ /ext-src/ +#COPY --from=pg_session_jwt-src /ext-src/ /ext-src/ +#COPY --from=rdkit-src /ext-src/ /ext-src/ +COPY --from=pg_uuidv7-src /ext-src/ /ext-src/ +COPY --from=pg_roaringbitmap-src /ext-src/ /ext-src/ +COPY --from=pg_semver-src /ext-src/ /ext-src/ +#COPY --from=pg_embedding-src /ext-src/ /ext-src/ +#COPY --from=wal2json-src /ext-src/ /ext-src/ +COPY --from=pg_ivm-src /ext-src/ /ext-src/ +COPY --from=pg_partman-src /ext-src/ /ext-src/ +#COPY --from=pg_mooncake-src /ext-src/ /ext-src/ +#COPY --from=pg_repack-src /ext-src/ /ext-src/ + COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh -RUN patch -p1 Date: Tue, 4 Feb 2025 14:49:44 +0100 Subject: [PATCH 362/583] Add support for pgjwt test (#10611) ## Problem We don't currently test pgjwt, while it is based on pg_prove and can be easily added ## Summary of changes The test for pgjwt was added. 
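For readers unfamiliar with pg_prove: it runs TAP-emitting SQL files (such as pgjwt's `test.sql`) against an already-running server using the standard libpq environment variables, which is all the new `neon-test.sh` below does. A minimal sketch of an equivalent manual invocation; the connection settings are illustrative placeholders, not values taken from this patch:

```bash
# Run the pgjwt TAP suite by hand against a running compute.
# Host/port/database/user below are placeholders -- adjust to your setup.
cd docker-compose/ext-src/pgjwt-src
PGHOST=localhost PGPORT=5432 PGDATABASE=postgres PGUSER=postgres \
  pg_prove --verbose test.sql
```
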
--- docker-compose/docker_compose_test.sh | 1 + docker-compose/ext-src/pgjwt-src/neon-test.sh | 4 ++++ .../ext-src/pgjwt-src/test-upgrade.patch | 15 +++++++++++++++ docker-compose/ext-src/pgjwt-src/test-upgrade.sh | 5 +++++ docker-compose/test_extensions_upgrade.sh | 5 +++-- 5 files changed, 28 insertions(+), 2 deletions(-) create mode 100755 docker-compose/ext-src/pgjwt-src/neon-test.sh create mode 100644 docker-compose/ext-src/pgjwt-src/test-upgrade.patch create mode 100755 docker-compose/ext-src/pgjwt-src/test-upgrade.sh diff --git a/docker-compose/docker_compose_test.sh b/docker-compose/docker_compose_test.sh index e0c537edf3..c4ff86ab66 100755 --- a/docker-compose/docker_compose_test.sh +++ b/docker-compose/docker_compose_test.sh @@ -52,6 +52,7 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do if [ $pg_version -ge 16 ]; then docker cp ext-src $TEST_CONTAINER_NAME:/ + docker exec $TEST_CONTAINER_NAME bash -c "apt update && apt install -y libtap-parser-sourcehandler-pgtap-perl" # This is required for the pg_hint_plan test, to prevent flaky log message causing the test to fail # It cannot be moved to Dockerfile now because the database directory is created after the start of the container echo Adding dummy config diff --git a/docker-compose/ext-src/pgjwt-src/neon-test.sh b/docker-compose/ext-src/pgjwt-src/neon-test.sh new file mode 100755 index 0000000000..95af0be77b --- /dev/null +++ b/docker-compose/ext-src/pgjwt-src/neon-test.sh @@ -0,0 +1,4 @@ +#!/bin/bash +set -ex +cd "$(dirname "${0}")" +pg_prove test.sql \ No newline at end of file diff --git a/docker-compose/ext-src/pgjwt-src/test-upgrade.patch b/docker-compose/ext-src/pgjwt-src/test-upgrade.patch new file mode 100644 index 0000000000..85b3565480 --- /dev/null +++ b/docker-compose/ext-src/pgjwt-src/test-upgrade.patch @@ -0,0 +1,15 @@ +diff --git a/test.sql b/test.sql +index d7a0ca8..f15bc76 100644 +--- a/test.sql ++++ b/test.sql +@@ -9,9 +9,7 @@ + \set ON_ERROR_STOP true + \set QUIET 1 + +-CREATE EXTENSION pgcrypto; +-CREATE EXTENSION pgtap; +-CREATE EXTENSION pgjwt; ++CREATE EXTENSION IF NOT EXISTS pgtap; + + BEGIN; + SELECT plan(23); diff --git a/docker-compose/ext-src/pgjwt-src/test-upgrade.sh b/docker-compose/ext-src/pgjwt-src/test-upgrade.sh new file mode 100755 index 0000000000..b7158d2340 --- /dev/null +++ b/docker-compose/ext-src/pgjwt-src/test-upgrade.sh @@ -0,0 +1,5 @@ +#!/bin/sh +set -ex +cd "$(dirname ${0})" +patch -p1 Date: Tue, 4 Feb 2025 15:01:57 +0100 Subject: [PATCH 363/583] storcon: only allow errrors of the server cert verification (#10644) This PR does a bunch of things: * only allow errors of the server cert verification, not of the TLS handshake. The TLS handshake doesn't cause any errors for us so we can just always require it to be valid. This simplifies the code a little. * As the solution is more permanent than originally anticipated, I think it makes sense to move the `AcceptAll` verifier outside. * log the connstr information. this helps with figuring out which domain names are configured in the connstr, etc. I think it is generally useful to print it. make extra sure that the password is not leaked. 
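In practice the strict/permissive behaviour described above is toggled purely through the environment: the diff below only enforces server certificate validation when `STORCON_DB_CERT_CHECKS` is set, otherwise the `AcceptAll` wrapper logs the verification failure and proceeds. A minimal sketch of how an operator would flip the switch; the deployment context is an assumption, only the variable name comes from this change:

```bash
# Enable strict verification of the database server certificate.
# Any value works; the code only checks that the variable is present.
export STORCON_DB_CERT_CHECKS=1

# Leaving it unset keeps the permissive path: verification errors are
# logged ("ignoring db connection TLS validation error: ...") but the
# connection is still allowed.
```
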
Follow-up of #10640 --- storage_controller/src/persistence.rs | 138 ++++++++++++++------------ 1 file changed, 72 insertions(+), 66 deletions(-) diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 45f3108d6b..c4e5b39589 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -27,7 +27,7 @@ use pageserver_api::shard::ShardConfigError; use pageserver_api::shard::ShardIdentity; use pageserver_api::shard::ShardStripeSize; use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId}; -use rustls::client::danger::ServerCertVerifier; +use rustls::client::danger::{ServerCertVerified, ServerCertVerifier}; use rustls::client::WebPkiServerVerifier; use rustls::crypto::ring; use scoped_futures::ScopedBoxFuture; @@ -194,6 +194,8 @@ impl Persistence { timeout: Duration, ) -> Result<(), diesel::ConnectionError> { let started_at = Instant::now(); + log_postgres_connstr_info(database_url) + .map_err(|e| diesel::ConnectionError::InvalidConnectionUrl(e.to_string()))?; loop { match establish_connection_rustls(database_url).await { Ok(_) => { @@ -1281,6 +1283,51 @@ pub(crate) fn load_certs() -> anyhow::Result> { Ok(Arc::new(store)) } +#[derive(Debug)] +/// A verifier that accepts all certificates (but logs an error still) +struct AcceptAll(Arc); +impl ServerCertVerifier for AcceptAll { + fn verify_server_cert( + &self, + end_entity: &rustls::pki_types::CertificateDer<'_>, + intermediates: &[rustls::pki_types::CertificateDer<'_>], + server_name: &rustls::pki_types::ServerName<'_>, + ocsp_response: &[u8], + now: rustls::pki_types::UnixTime, + ) -> Result { + let r = + self.0 + .verify_server_cert(end_entity, intermediates, server_name, ocsp_response, now); + if let Err(err) = r { + tracing::info!( + ?server_name, + "ignoring db connection TLS validation error: {err:?}" + ); + return Ok(ServerCertVerified::assertion()); + } + r + } + fn verify_tls12_signature( + &self, + message: &[u8], + cert: &rustls::pki_types::CertificateDer<'_>, + dss: &rustls::DigitallySignedStruct, + ) -> Result { + self.0.verify_tls12_signature(message, cert, dss) + } + fn verify_tls13_signature( + &self, + message: &[u8], + cert: &rustls::pki_types::CertificateDer<'_>, + dss: &rustls::DigitallySignedStruct, + ) -> Result { + self.0.verify_tls13_signature(message, cert, dss) + } + fn supported_verify_schemes(&self) -> Vec { + self.0.supported_verify_schemes() + } +} + /// Loads the root certificates and constructs a client config suitable for connecting. /// This function is blocking. fn client_config_with_root_certs() -> anyhow::Result { @@ -1290,76 +1337,12 @@ fn client_config_with_root_certs() -> anyhow::Result { .expect("ring should support the default protocol versions"); static DO_CERT_CHECKS: std::sync::OnceLock = std::sync::OnceLock::new(); let do_cert_checks = - DO_CERT_CHECKS.get_or_init(|| std::env::var("STORCON_CERT_CHECKS").is_ok()); + DO_CERT_CHECKS.get_or_init(|| std::env::var("STORCON_DB_CERT_CHECKS").is_ok()); Ok(if *do_cert_checks { client_config .with_root_certificates(load_certs()?) 
.with_no_client_auth() } else { - use rustls::client::danger::{HandshakeSignatureValid, ServerCertVerified}; - #[derive(Debug)] - struct AcceptAll(Arc); - impl ServerCertVerifier for AcceptAll { - fn verify_server_cert( - &self, - end_entity: &rustls::pki_types::CertificateDer<'_>, - intermediates: &[rustls::pki_types::CertificateDer<'_>], - server_name: &rustls::pki_types::ServerName<'_>, - ocsp_response: &[u8], - now: rustls::pki_types::UnixTime, - ) -> Result { - let r = self.0.verify_server_cert( - end_entity, - intermediates, - server_name, - ocsp_response, - now, - ); - if let Err(err) = r { - tracing::info!( - ?server_name, - "ignoring db connection TLS validation error: {err:?}" - ); - return Ok(ServerCertVerified::assertion()); - } - r - } - fn verify_tls12_signature( - &self, - message: &[u8], - cert: &rustls::pki_types::CertificateDer<'_>, - dss: &rustls::DigitallySignedStruct, - ) -> Result - { - let r = self.0.verify_tls12_signature(message, cert, dss); - if let Err(err) = r { - tracing::info!( - "ignoring db connection 1.2 signature TLS validation error: {err:?}" - ); - return Ok(HandshakeSignatureValid::assertion()); - } - r - } - fn verify_tls13_signature( - &self, - message: &[u8], - cert: &rustls::pki_types::CertificateDer<'_>, - dss: &rustls::DigitallySignedStruct, - ) -> Result - { - let r = self.0.verify_tls13_signature(message, cert, dss); - if let Err(err) = r { - tracing::info!( - "ignoring db connection 1.3 signature TLS validation error: {err:?}" - ); - return Ok(HandshakeSignatureValid::assertion()); - } - r - } - fn supported_verify_schemes(&self) -> Vec { - self.0.supported_verify_schemes() - } - } let verifier = AcceptAll( WebPkiServerVerifier::builder_with_provider( load_certs()?, @@ -1389,6 +1372,29 @@ fn establish_connection_rustls(config: &str) -> BoxFuture().unwrap(); + assert!(format!("{has_pw_cfg:?}").contains("specialuser")); + // Ensure that the password is not leaked by the debug impl + assert!(!format!("{has_pw_cfg:?}").contains("NOT ALLOWED TAG")); +} + +fn log_postgres_connstr_info(config_str: &str) -> anyhow::Result<()> { + let config = config_str + .parse::() + .map_err(|_e| anyhow::anyhow!("Couldn't parse config str"))?; + // We use debug formatting here, and use a unit test to ensure that we don't leak the password. + // To make extra sure the test gets ran, run it every time the function is called + // (this is rather cold code, we can afford it). + #[cfg(not(test))] + test_config_debug_censors_password(); + tracing::info!("database connection config: {config:?}"); + Ok(()) +} + /// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably #[derive( QueryableByName, Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq, From dcf335a25195bdd2d8fa8dd5080ca661ea4396e3 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Tue, 4 Feb 2025 15:50:53 +0100 Subject: [PATCH 364/583] proxy: Switch proxy to JSON logging (#9857) ## Problem We want to switch proxy and ideally all Rust services to structured JSON logging to support better filtering and cross-referencing with tracing. ## Summary of changes * Introduce a custom tracing-subscriber to write the JSON. In a first attempt a customized tracing::fmt::FmtSubscriber was used, but it's very inefficient and can still generate invalid JSON. It's also doesn't allow us to add important fields to the root object. * Make this opt in: the `LOGFMT` env var can be set to `"json"` to enable to new logger at startup. 
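
(Editor's illustration, not part of the patch: a minimal sketch of the overall shape — a custom `tracing_subscriber::Layer` that renders each event as one JSON line on stderr, enabled at startup by `LOGFMT=json`. Apart from `LOGFMT`, all names below are stand-ins. The real layer additionally records span fields, handles duplicate field names, adds process/thread IDs and the OpenTelemetry trace ID, and serializes into a reused thread-local buffer instead of allocating per event, which is why the actual code is much larger.)

```rust
// Sketch only. Assumes the `tracing`, `tracing-subscriber` and `serde_json` crates.
use std::fmt::Debug;
use std::io::Write;

use tracing::field::{Field, Visit};
use tracing::{Event, Subscriber};
use tracing_subscriber::layer::{Context, Layer};
use tracing_subscriber::prelude::*;

/// Collects an event's fields into a JSON map by stringifying their Debug output.
#[derive(Default)]
struct JsonVisitor(serde_json::Map<String, serde_json::Value>);

impl Visit for JsonVisitor {
    fn record_debug(&mut self, field: &Field, value: &dyn Debug) {
        self.0
            .insert(field.name().to_owned(), format!("{value:?}").into());
    }
}

/// Emits one JSON object per event as a single line on stderr.
struct JsonLayer;

impl<S: Subscriber> Layer<S> for JsonLayer {
    fn on_event(&self, event: &Event<'_>, _ctx: Context<'_, S>) {
        let mut visitor = JsonVisitor::default();
        event.record(&mut visitor);
        let line = serde_json::json!({
            "level": event.metadata().level().as_str(),
            "target": event.metadata().target(),
            "fields": visitor.0,
        });
        let _ = writeln!(std::io::stderr().lock(), "{line}");
    }
}

fn main() {
    // Opt-in switch, mirroring the patch: LOGFMT=json enables the JSON layer.
    if matches!(std::env::var("LOGFMT").as_deref(), Ok("json")) {
        tracing_subscriber::registry().with(JsonLayer).init();
    } else {
        tracing_subscriber::fmt().with_ansi(false).init();
    }
    tracing::info!(user = "alice", "connection accepted");
}
```
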
--- Cargo.lock | 55 +++ Cargo.toml | 3 + deny.toml | 1 + proxy/Cargo.toml | 8 + proxy/src/logging.rs | 904 +++++++++++++++++++++++++++++++++++++- workspace_hack/Cargo.toml | 1 + 6 files changed, 964 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0133c83564..de1b1218ca 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -206,6 +206,16 @@ dependencies = [ "syn 2.0.90", ] +[[package]] +name = "assert-json-diff" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47e4f2b81832e72834d7518d8487a0396a28cc408186a2e8854c0f98011faf12" +dependencies = [ + "serde", + "serde_json", +] + [[package]] name = "async-channel" version = "1.9.0" @@ -1010,6 +1020,12 @@ dependencies = [ "generic-array", ] +[[package]] +name = "boxcar" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2721c3c5a6f0e7f7e607125d963fedeb765f545f67adc9d71ed934693881eb42" + [[package]] name = "bstr" version = "1.5.0" @@ -2433,6 +2449,16 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "gettid" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "397256552fed4a9e577850498071831ec8f18ea83368aecc114cab469dcb43e5" +dependencies = [ + "libc", + "winapi", +] + [[package]] name = "gimli" version = "0.31.1" @@ -4212,6 +4238,16 @@ dependencies = [ "workspace_hack", ] +[[package]] +name = "papaya" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc7c76487f7eaa00a0fc1d7f88dc6b295aec478d11b0fc79f857b62c2874124c" +dependencies = [ + "equivalent", + "seize", +] + [[package]] name = "parking" version = "2.1.1" @@ -4839,6 +4875,7 @@ dependencies = [ "ahash", "anyhow", "arc-swap", + "assert-json-diff", "async-compression", "async-trait", "atomic-take", @@ -4846,6 +4883,7 @@ dependencies = [ "aws-sdk-iam", "aws-sigv4", "base64 0.13.1", + "boxcar", "bstr", "bytes", "camino", @@ -4862,6 +4900,7 @@ dependencies = [ "flate2", "framed-websockets", "futures", + "gettid", "hashbrown 0.14.5", "hashlink", "hex", @@ -4884,7 +4923,9 @@ dependencies = [ "measured", "metrics", "once_cell", + "opentelemetry", "p256 0.13.2", + "papaya", "parking_lot 0.12.1", "parquet", "parquet_derive", @@ -4931,6 +4972,9 @@ dependencies = [ "tokio-tungstenite 0.21.0", "tokio-util", "tracing", + "tracing-log", + "tracing-opentelemetry", + "tracing-serde", "tracing-subscriber", "tracing-utils", "try-lock", @@ -5884,6 +5928,16 @@ dependencies = [ "libc", ] +[[package]] +name = "seize" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d84b0c858bdd30cb56f5597f8b3bf702ec23829e652cc636a1e5a7b9de46ae93" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + [[package]] name = "semver" version = "1.0.17" @@ -8145,6 +8199,7 @@ dependencies = [ "tower 0.4.13", "tracing", "tracing-core", + "tracing-log", "url", "zerocopy", "zeroize", diff --git a/Cargo.toml b/Cargo.toml index 267a91d773..76b54ae1d8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -54,6 +54,7 @@ async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] } atomic-take = "1.1.0" backtrace = "0.3.74" flate2 = "1.0.26" +assert-json-diff = "2" async-stream = "0.3" async-trait = "0.1" aws-config = { version = "1.5", default-features = false, features=["rustls", "sso"] } @@ -193,7 +194,9 @@ tower-http = { version = "0.6.2", features = ["request-id", "trace"] } tower-service = "0.3.3" tracing = "0.1" tracing-error = "0.2" +tracing-log = "0.2" 
tracing-opentelemetry = "0.28" +tracing-serde = "0.2.0" tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] } try-lock = "0.2.5" twox-hash = { version = "1.6.3", default-features = false } diff --git a/deny.toml b/deny.toml index df00a34c60..b551405568 100644 --- a/deny.toml +++ b/deny.toml @@ -32,6 +32,7 @@ reason = "the marvin attack only affects private key decryption, not public key # https://embarkstudios.github.io/cargo-deny/checks/licenses/cfg.html [licenses] allow = [ + "0BSD", "Apache-2.0", "BSD-2-Clause", "BSD-3-Clause", diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 35574e945c..d7880ea7b9 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -19,6 +19,7 @@ aws-config.workspace = true aws-sdk-iam.workspace = true aws-sigv4.workspace = true base64.workspace = true +boxcar = "0.2.8" bstr.workspace = true bytes = { workspace = true, features = ["serde"] } camino.workspace = true @@ -42,6 +43,7 @@ hyper0.workspace = true hyper = { workspace = true, features = ["server", "http1", "http2"] } hyper-util = { version = "0.1", features = ["server", "http1", "http2", "tokio"] } http-body-util = { version = "0.1" } +gettid = "0.1.3" indexmap = { workspace = true, features = ["serde"] } ipnet.workspace = true itertools.workspace = true @@ -50,6 +52,8 @@ lasso = { workspace = true, features = ["multi-threaded"] } measured = { workspace = true, features = ["lasso"] } metrics.workspace = true once_cell.workspace = true +opentelemetry = { workspace = true, features = ["trace"] } +papaya = "0.1.8" parking_lot.workspace = true parquet.workspace = true parquet_derive.workspace = true @@ -89,6 +93,9 @@ tokio = { workspace = true, features = ["signal"] } tracing-subscriber.workspace = true tracing-utils.workspace = true tracing.workspace = true +tracing-log.workspace = true +tracing-serde.workspace = true +tracing-opentelemetry.workspace = true try-lock.workspace = true typed-json.workspace = true url.workspace = true @@ -112,6 +119,7 @@ rsa = "0.9" workspace_hack.workspace = true [dev-dependencies] +assert-json-diff.workspace = true camino-tempfile.workspace = true fallible-iterator.workspace = true flate2.workspace = true diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs index 41f10f052f..97c9f5a59c 100644 --- a/proxy/src/logging.rs +++ b/proxy/src/logging.rs @@ -1,10 +1,23 @@ -use tracing::Subscriber; +use std::cell::{Cell, RefCell}; +use std::collections::HashMap; +use std::hash::BuildHasher; +use std::{env, io}; + +use chrono::{DateTime, Utc}; +use opentelemetry::trace::TraceContextExt; +use scopeguard::defer; +use serde::ser::{SerializeMap, Serializer}; +use tracing::span; +use tracing::subscriber::Interest; +use tracing::{callsite, Event, Metadata, Span, Subscriber}; +use tracing_opentelemetry::OpenTelemetrySpanExt; use tracing_subscriber::filter::{EnvFilter, LevelFilter}; use tracing_subscriber::fmt::format::{Format, Full}; use tracing_subscriber::fmt::time::SystemTime; use tracing_subscriber::fmt::{FormatEvent, FormatFields}; +use tracing_subscriber::layer::{Context, Layer}; use tracing_subscriber::prelude::*; -use tracing_subscriber::registry::LookupSpan; +use tracing_subscriber::registry::{LookupSpan, SpanRef}; /// Initialize logging and OpenTelemetry tracing and exporter. /// @@ -15,6 +28,8 @@ use tracing_subscriber::registry::LookupSpan; /// destination, set `OTEL_EXPORTER_OTLP_ENDPOINT=http://jaeger:4318`. 
/// See pub async fn init() -> anyhow::Result { + let logfmt = LogFormat::from_env()?; + let env_filter = EnvFilter::builder() .with_default_directive(LevelFilter::INFO.into()) .from_env_lossy() @@ -29,17 +44,36 @@ pub async fn init() -> anyhow::Result { .expect("this should be a valid filter directive"), ); - let fmt_layer = tracing_subscriber::fmt::layer() - .with_ansi(false) - .with_writer(std::io::stderr) - .with_target(false); - let otlp_layer = tracing_utils::init_tracing("proxy").await; + let json_log_layer = if logfmt == LogFormat::Json { + Some(JsonLoggingLayer { + clock: RealClock, + skipped_field_indices: papaya::HashMap::default(), + writer: StderrWriter { + stderr: std::io::stderr(), + }, + }) + } else { + None + }; + + let text_log_layer = if logfmt == LogFormat::Text { + Some( + tracing_subscriber::fmt::layer() + .with_ansi(false) + .with_writer(std::io::stderr) + .with_target(false), + ) + } else { + None + }; + tracing_subscriber::registry() .with(env_filter) .with(otlp_layer) - .with(fmt_layer) + .with(json_log_layer) + .with(text_log_layer) .try_init()?; Ok(LoggingGuard) @@ -94,3 +128,857 @@ impl Drop for LoggingGuard { tracing_utils::shutdown_tracing(); } } + +// TODO: make JSON the default +#[derive(Copy, Clone, PartialEq, Eq, Default, Debug)] +enum LogFormat { + #[default] + Text = 1, + Json, +} + +impl LogFormat { + fn from_env() -> anyhow::Result { + let logfmt = env::var("LOGFMT"); + Ok(match logfmt.as_deref() { + Err(_) => LogFormat::default(), + Ok("text") => LogFormat::Text, + Ok("json") => LogFormat::Json, + Ok(logfmt) => anyhow::bail!("unknown log format: {logfmt}"), + }) + } +} + +trait MakeWriter { + fn make_writer(&self) -> impl io::Write; +} + +struct StderrWriter { + stderr: io::Stderr, +} + +impl MakeWriter for StderrWriter { + #[inline] + fn make_writer(&self) -> impl io::Write { + self.stderr.lock() + } +} + +// TODO: move into separate module or even separate crate. +trait Clock { + fn now(&self) -> DateTime; +} + +struct RealClock; + +impl Clock for RealClock { + #[inline] + fn now(&self) -> DateTime { + Utc::now() + } +} + +/// Name of the field used by tracing crate to store the event message. +const MESSAGE_FIELD: &str = "message"; + +thread_local! { + /// Protects against deadlocks and double panics during log writing. + /// The current panic handler will use tracing to log panic information. + static REENTRANCY_GUARD: Cell = const { Cell::new(false) }; + /// Thread-local instance with per-thread buffer for log writing. + static EVENT_FORMATTER: RefCell = RefCell::new(EventFormatter::new()); + /// Cached OS thread ID. + static THREAD_ID: u64 = gettid::gettid(); +} + +/// Implements tracing layer to handle events specific to logging. +struct JsonLoggingLayer { + clock: C, + skipped_field_indices: papaya::HashMap, + writer: W, +} + +impl Layer for JsonLoggingLayer +where + S: Subscriber + for<'a> LookupSpan<'a>, +{ + fn on_event(&self, event: &Event<'_>, ctx: Context<'_, S>) { + use std::io::Write; + + // TODO: consider special tracing subscriber to grab timestamp very + // early, before OTel machinery, and add as event extension. 
+ let now = self.clock.now(); + + let res: io::Result<()> = REENTRANCY_GUARD.with(move |entered| { + if entered.get() { + let mut formatter = EventFormatter::new(); + formatter.format(now, event, &ctx, &self.skipped_field_indices)?; + self.writer.make_writer().write_all(formatter.buffer()) + } else { + entered.set(true); + defer!(entered.set(false);); + + EVENT_FORMATTER.with_borrow_mut(move |formatter| { + formatter.reset(); + formatter.format(now, event, &ctx, &self.skipped_field_indices)?; + self.writer.make_writer().write_all(formatter.buffer()) + }) + } + }); + + // In case logging fails we generate a simpler JSON object. + if let Err(err) = res { + if let Ok(mut line) = serde_json::to_vec(&serde_json::json!( { + "timestamp": now.to_rfc3339_opts(chrono::SecondsFormat::Micros, true), + "level": "ERROR", + "message": format_args!("cannot log event: {err:?}"), + "fields": { + "event": format_args!("{event:?}"), + }, + })) { + line.push(b'\n'); + self.writer.make_writer().write_all(&line).ok(); + } + } + } + + /// Registers a SpanFields instance as span extension. + fn on_new_span(&self, attrs: &span::Attributes<'_>, id: &span::Id, ctx: Context<'_, S>) { + let span = ctx.span(id).expect("span must exist"); + let fields = SpanFields::default(); + fields.record_fields(attrs); + // This could deadlock when there's a panic somewhere in the tracing + // event handling and a read or write guard is still held. This includes + // the OTel subscriber. + span.extensions_mut().insert(fields); + } + + fn on_record(&self, id: &span::Id, values: &span::Record<'_>, ctx: Context<'_, S>) { + let span = ctx.span(id).expect("span must exist"); + let ext = span.extensions(); + if let Some(data) = ext.get::() { + data.record_fields(values); + } + } + + /// Called (lazily) whenever a new log call is executed. We quickly check + /// for duplicate field names and record duplicates as skippable. Last one + /// wins. + fn register_callsite(&self, metadata: &'static Metadata<'static>) -> Interest { + if !metadata.is_event() { + // Must not be never because we wouldn't get trace and span data. + return Interest::always(); + } + + let mut field_indices = SkippedFieldIndices::default(); + let mut seen_fields = HashMap::<&'static str, usize>::new(); + for field in metadata.fields() { + use std::collections::hash_map::Entry; + match seen_fields.entry(field.name()) { + Entry::Vacant(entry) => { + // field not seen yet + entry.insert(field.index()); + } + Entry::Occupied(mut entry) => { + // replace currently stored index + let old_index = entry.insert(field.index()); + // ... and append it to list of skippable indices + field_indices.push(old_index); + } + } + } + + if !field_indices.is_empty() { + self.skipped_field_indices + .pin() + .insert(metadata.callsite(), field_indices); + } + + Interest::always() + } +} + +/// Stores span field values recorded during the spans lifetime. +#[derive(Default)] +struct SpanFields { + // TODO: Switch to custom enum with lasso::Spur for Strings? + fields: papaya::HashMap<&'static str, serde_json::Value>, +} + +impl SpanFields { + #[inline] + fn record_fields(&self, fields: R) { + fields.record(&mut SpanFieldsRecorder { + fields: self.fields.pin(), + }); + } +} + +/// Implements a tracing field visitor to convert and store values. 
+struct SpanFieldsRecorder<'m, S, G> { + fields: papaya::HashMapRef<'m, &'static str, serde_json::Value, S, G>, +} + +impl tracing::field::Visit for SpanFieldsRecorder<'_, S, G> { + #[inline] + fn record_f64(&mut self, field: &tracing::field::Field, value: f64) { + self.fields + .insert(field.name(), serde_json::Value::from(value)); + } + + #[inline] + fn record_i64(&mut self, field: &tracing::field::Field, value: i64) { + self.fields + .insert(field.name(), serde_json::Value::from(value)); + } + + #[inline] + fn record_u64(&mut self, field: &tracing::field::Field, value: u64) { + self.fields + .insert(field.name(), serde_json::Value::from(value)); + } + + #[inline] + fn record_i128(&mut self, field: &tracing::field::Field, value: i128) { + if let Ok(value) = i64::try_from(value) { + self.fields + .insert(field.name(), serde_json::Value::from(value)); + } else { + self.fields + .insert(field.name(), serde_json::Value::from(format!("{value}"))); + } + } + + #[inline] + fn record_u128(&mut self, field: &tracing::field::Field, value: u128) { + if let Ok(value) = u64::try_from(value) { + self.fields + .insert(field.name(), serde_json::Value::from(value)); + } else { + self.fields + .insert(field.name(), serde_json::Value::from(format!("{value}"))); + } + } + + #[inline] + fn record_bool(&mut self, field: &tracing::field::Field, value: bool) { + self.fields + .insert(field.name(), serde_json::Value::from(value)); + } + + #[inline] + fn record_bytes(&mut self, field: &tracing::field::Field, value: &[u8]) { + self.fields + .insert(field.name(), serde_json::Value::from(value)); + } + + #[inline] + fn record_str(&mut self, field: &tracing::field::Field, value: &str) { + self.fields + .insert(field.name(), serde_json::Value::from(value)); + } + + #[inline] + fn record_debug(&mut self, field: &tracing::field::Field, value: &dyn std::fmt::Debug) { + self.fields + .insert(field.name(), serde_json::Value::from(format!("{value:?}"))); + } + + #[inline] + fn record_error( + &mut self, + field: &tracing::field::Field, + value: &(dyn std::error::Error + 'static), + ) { + self.fields + .insert(field.name(), serde_json::Value::from(format!("{value}"))); + } +} + +/// List of field indices skipped during logging. Can list duplicate fields or +/// metafields not meant to be logged. +#[derive(Clone, Default)] +struct SkippedFieldIndices { + bits: u64, +} + +impl SkippedFieldIndices { + #[inline] + fn is_empty(&self) -> bool { + self.bits == 0 + } + + #[inline] + fn push(&mut self, index: usize) { + self.bits |= 1u64 + .checked_shl(index as u32) + .expect("field index too large"); + } + + #[inline] + fn contains(&self, index: usize) -> bool { + self.bits + & 1u64 + .checked_shl(index as u32) + .expect("field index too large") + != 0 + } +} + +/// Formats a tracing event and writes JSON to its internal buffer including a newline. 
+// TODO: buffer capacity management, truncate if too large +struct EventFormatter { + logline_buffer: Vec, +} + +impl EventFormatter { + #[inline] + fn new() -> Self { + EventFormatter { + logline_buffer: Vec::new(), + } + } + + #[inline] + fn buffer(&self) -> &[u8] { + &self.logline_buffer + } + + #[inline] + fn reset(&mut self) { + self.logline_buffer.clear(); + } + + fn format( + &mut self, + now: DateTime, + event: &Event<'_>, + ctx: &Context<'_, S>, + skipped_field_indices: &papaya::HashMap, + ) -> io::Result<()> + where + S: Subscriber + for<'a> LookupSpan<'a>, + { + let timestamp = now.to_rfc3339_opts(chrono::SecondsFormat::Micros, true); + + use tracing_log::NormalizeEvent; + let normalized_meta = event.normalized_metadata(); + let meta = normalized_meta.as_ref().unwrap_or_else(|| event.metadata()); + + let skipped_field_indices = skipped_field_indices.pin(); + let skipped_field_indices = skipped_field_indices.get(&meta.callsite()); + + let mut serialize = || { + let mut serializer = serde_json::Serializer::new(&mut self.logline_buffer); + + let mut serializer = serializer.serialize_map(None)?; + + // Timestamp comes first, so raw lines can be sorted by timestamp. + serializer.serialize_entry("timestamp", ×tamp)?; + + // Level next. + serializer.serialize_entry("level", &meta.level().as_str())?; + + // Message next. + serializer.serialize_key("message")?; + let mut message_extractor = + MessageFieldExtractor::new(serializer, skipped_field_indices); + event.record(&mut message_extractor); + let mut serializer = message_extractor.into_serializer()?; + + let mut fields_present = FieldsPresent(false, skipped_field_indices); + event.record(&mut fields_present); + if fields_present.0 { + serializer.serialize_entry( + "fields", + &SerializableEventFields(event, skipped_field_indices), + )?; + } + + let pid = std::process::id(); + if pid != 1 { + serializer.serialize_entry("process_id", &pid)?; + } + + THREAD_ID.with(|tid| serializer.serialize_entry("thread_id", tid))?; + + // TODO: tls cache? name could change + if let Some(thread_name) = std::thread::current().name() { + if !thread_name.is_empty() && thread_name != "tokio-runtime-worker" { + serializer.serialize_entry("thread_name", thread_name)?; + } + } + + if let Some(task_id) = tokio::task::try_id() { + serializer.serialize_entry("task_id", &format_args!("{task_id}"))?; + } + + serializer.serialize_entry("target", meta.target())?; + + if let Some(module) = meta.module_path() { + if module != meta.target() { + serializer.serialize_entry("module", module)?; + } + } + + if let Some(file) = meta.file() { + if let Some(line) = meta.line() { + serializer.serialize_entry("src", &format_args!("{file}:{line}"))?; + } else { + serializer.serialize_entry("src", file)?; + } + } + + { + let otel_context = Span::current().context(); + let otel_spanref = otel_context.span(); + let span_context = otel_spanref.span_context(); + if span_context.is_valid() { + serializer.serialize_entry( + "trace_id", + &format_args!("{}", span_context.trace_id()), + )?; + } + } + + serializer.serialize_entry("spans", &SerializableSpanStack(ctx))?; + + serializer.end() + }; + + serialize().map_err(io::Error::other)?; + self.logline_buffer.push(b'\n'); + Ok(()) + } +} + +/// Extracts the message field that's mixed will other fields. 
+struct MessageFieldExtractor<'a, S: serde::ser::SerializeMap> { + serializer: S, + skipped_field_indices: Option<&'a SkippedFieldIndices>, + state: Option>, +} + +impl<'a, S: serde::ser::SerializeMap> MessageFieldExtractor<'a, S> { + #[inline] + fn new(serializer: S, skipped_field_indices: Option<&'a SkippedFieldIndices>) -> Self { + Self { + serializer, + skipped_field_indices, + state: None, + } + } + + #[inline] + fn into_serializer(mut self) -> Result { + match self.state { + Some(Ok(())) => {} + Some(Err(err)) => return Err(err), + None => self.serializer.serialize_value("")?, + } + Ok(self.serializer) + } + + #[inline] + fn accept_field(&self, field: &tracing::field::Field) -> bool { + self.state.is_none() + && field.name() == MESSAGE_FIELD + && !self + .skipped_field_indices + .is_some_and(|i| i.contains(field.index())) + } +} + +impl tracing::field::Visit for MessageFieldExtractor<'_, S> { + #[inline] + fn record_f64(&mut self, field: &tracing::field::Field, value: f64) { + if self.accept_field(field) { + self.state = Some(self.serializer.serialize_value(&value)); + } + } + + #[inline] + fn record_i64(&mut self, field: &tracing::field::Field, value: i64) { + if self.accept_field(field) { + self.state = Some(self.serializer.serialize_value(&value)); + } + } + + #[inline] + fn record_u64(&mut self, field: &tracing::field::Field, value: u64) { + if self.accept_field(field) { + self.state = Some(self.serializer.serialize_value(&value)); + } + } + + #[inline] + fn record_i128(&mut self, field: &tracing::field::Field, value: i128) { + if self.accept_field(field) { + self.state = Some(self.serializer.serialize_value(&value)); + } + } + + #[inline] + fn record_u128(&mut self, field: &tracing::field::Field, value: u128) { + if self.accept_field(field) { + self.state = Some(self.serializer.serialize_value(&value)); + } + } + + #[inline] + fn record_bool(&mut self, field: &tracing::field::Field, value: bool) { + if self.accept_field(field) { + self.state = Some(self.serializer.serialize_value(&value)); + } + } + + #[inline] + fn record_bytes(&mut self, field: &tracing::field::Field, value: &[u8]) { + if self.accept_field(field) { + self.state = Some(self.serializer.serialize_value(&format_args!("{value:x?}"))); + } + } + + #[inline] + fn record_str(&mut self, field: &tracing::field::Field, value: &str) { + if self.accept_field(field) { + self.state = Some(self.serializer.serialize_value(&value)); + } + } + + #[inline] + fn record_debug(&mut self, field: &tracing::field::Field, value: &dyn std::fmt::Debug) { + if self.accept_field(field) { + self.state = Some(self.serializer.serialize_value(&format_args!("{value:?}"))); + } + } + + #[inline] + fn record_error( + &mut self, + field: &tracing::field::Field, + value: &(dyn std::error::Error + 'static), + ) { + if self.accept_field(field) { + self.state = Some(self.serializer.serialize_value(&format_args!("{value}"))); + } + } +} + +/// Checks if there's any fields and field values present. If not, the JSON subobject +/// can be skipped. +// This is entirely optional and only cosmetic, though maybe helps a +// bit during log parsing in dashboards when there's no field with empty object. +struct FieldsPresent<'a>(pub bool, Option<&'a SkippedFieldIndices>); + +// Even though some methods have an overhead (error, bytes) it is assumed the +// compiler won't include this since we ignore the value entirely. 
+impl tracing::field::Visit for FieldsPresent<'_> { + #[inline] + fn record_debug(&mut self, field: &tracing::field::Field, _: &dyn std::fmt::Debug) { + if !self.1.is_some_and(|i| i.contains(field.index())) + && field.name() != MESSAGE_FIELD + && !field.name().starts_with("log.") + { + self.0 |= true; + } + } +} + +/// Serializes the fields directly supplied with a log event. +struct SerializableEventFields<'a, 'event>( + &'a tracing::Event<'event>, + Option<&'a SkippedFieldIndices>, +); + +impl serde::ser::Serialize for SerializableEventFields<'_, '_> { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + use serde::ser::SerializeMap; + let serializer = serializer.serialize_map(None)?; + let mut message_skipper = MessageFieldSkipper::new(serializer, self.1); + self.0.record(&mut message_skipper); + let serializer = message_skipper.into_serializer()?; + serializer.end() + } +} + +/// A tracing field visitor that skips the message field. +struct MessageFieldSkipper<'a, S: serde::ser::SerializeMap> { + serializer: S, + skipped_field_indices: Option<&'a SkippedFieldIndices>, + state: Result<(), S::Error>, +} + +impl<'a, S: serde::ser::SerializeMap> MessageFieldSkipper<'a, S> { + #[inline] + fn new(serializer: S, skipped_field_indices: Option<&'a SkippedFieldIndices>) -> Self { + Self { + serializer, + skipped_field_indices, + state: Ok(()), + } + } + + #[inline] + fn accept_field(&self, field: &tracing::field::Field) -> bool { + self.state.is_ok() + && field.name() != MESSAGE_FIELD + && !field.name().starts_with("log.") + && !self + .skipped_field_indices + .is_some_and(|i| i.contains(field.index())) + } + + #[inline] + fn into_serializer(self) -> Result { + self.state?; + Ok(self.serializer) + } +} + +impl tracing::field::Visit for MessageFieldSkipper<'_, S> { + #[inline] + fn record_f64(&mut self, field: &tracing::field::Field, value: f64) { + if self.accept_field(field) { + self.state = self.serializer.serialize_entry(field.name(), &value); + } + } + + #[inline] + fn record_i64(&mut self, field: &tracing::field::Field, value: i64) { + if self.accept_field(field) { + self.state = self.serializer.serialize_entry(field.name(), &value); + } + } + + #[inline] + fn record_u64(&mut self, field: &tracing::field::Field, value: u64) { + if self.accept_field(field) { + self.state = self.serializer.serialize_entry(field.name(), &value); + } + } + + #[inline] + fn record_i128(&mut self, field: &tracing::field::Field, value: i128) { + if self.accept_field(field) { + self.state = self.serializer.serialize_entry(field.name(), &value); + } + } + + #[inline] + fn record_u128(&mut self, field: &tracing::field::Field, value: u128) { + if self.accept_field(field) { + self.state = self.serializer.serialize_entry(field.name(), &value); + } + } + + #[inline] + fn record_bool(&mut self, field: &tracing::field::Field, value: bool) { + if self.accept_field(field) { + self.state = self.serializer.serialize_entry(field.name(), &value); + } + } + + #[inline] + fn record_bytes(&mut self, field: &tracing::field::Field, value: &[u8]) { + if self.accept_field(field) { + self.state = self + .serializer + .serialize_entry(field.name(), &format_args!("{value:x?}")); + } + } + + #[inline] + fn record_str(&mut self, field: &tracing::field::Field, value: &str) { + if self.accept_field(field) { + self.state = self.serializer.serialize_entry(field.name(), &value); + } + } + + #[inline] + fn record_debug(&mut self, field: &tracing::field::Field, value: &dyn std::fmt::Debug) { + if self.accept_field(field) { 
+ self.state = self + .serializer + .serialize_entry(field.name(), &format_args!("{value:?}")); + } + } + + #[inline] + fn record_error( + &mut self, + field: &tracing::field::Field, + value: &(dyn std::error::Error + 'static), + ) { + if self.accept_field(field) { + self.state = self.serializer.serialize_value(&format_args!("{value}")); + } + } +} + +/// Serializes the span stack from root to leaf (parent of event) enumerated +/// inside an object where the keys are just the number padded with zeroes +/// to retain sorting order. +// The object is necessary because Loki cannot flatten arrays. +struct SerializableSpanStack<'a, 'b, Span>(&'b Context<'a, Span>) +where + Span: Subscriber + for<'lookup> LookupSpan<'lookup>; + +impl serde::ser::Serialize for SerializableSpanStack<'_, '_, Span> +where + Span: Subscriber + for<'lookup> LookupSpan<'lookup>, +{ + fn serialize(&self, serializer: Ser) -> Result + where + Ser: serde::ser::Serializer, + { + let mut serializer = serializer.serialize_map(None)?; + + if let Some(leaf_span) = self.0.lookup_current() { + for (i, span) in leaf_span.scope().from_root().enumerate() { + serializer.serialize_entry(&format_args!("{i:02}"), &SerializableSpan(&span))?; + } + } + + serializer.end() + } +} + +/// Serializes a single span. Include the span ID, name and its fields as +/// recorded up to this point. +struct SerializableSpan<'a, 'b, Span>(&'b SpanRef<'a, Span>) +where + Span: for<'lookup> LookupSpan<'lookup>; + +impl serde::ser::Serialize for SerializableSpan<'_, '_, Span> +where + Span: for<'lookup> LookupSpan<'lookup>, +{ + fn serialize(&self, serializer: Ser) -> Result + where + Ser: serde::ser::Serializer, + { + let mut serializer = serializer.serialize_map(None)?; + // TODO: the span ID is probably only useful for debugging tracing. 
+ serializer.serialize_entry("span_id", &format_args!("{:016x}", self.0.id().into_u64()))?; + serializer.serialize_entry("span_name", self.0.metadata().name())?; + + let ext = self.0.extensions(); + if let Some(data) = ext.get::() { + for (key, value) in &data.fields.pin() { + serializer.serialize_entry(key, value)?; + } + } + + serializer.end() + } +} + +#[cfg(test)] +#[allow(clippy::unwrap_used)] +mod tests { + use std::sync::{Arc, Mutex, MutexGuard}; + + use assert_json_diff::assert_json_eq; + use tracing::info_span; + + use super::*; + + struct TestClock { + current_time: Mutex>, + } + + impl Clock for Arc { + fn now(&self) -> DateTime { + *self.current_time.lock().expect("poisoned") + } + } + + struct VecWriter<'a> { + buffer: MutexGuard<'a, Vec>, + } + + impl MakeWriter for Arc>> { + fn make_writer(&self) -> impl io::Write { + VecWriter { + buffer: self.lock().expect("poisoned"), + } + } + } + + impl io::Write for VecWriter<'_> { + fn write(&mut self, buf: &[u8]) -> io::Result { + self.buffer.write(buf) + } + + fn flush(&mut self) -> io::Result<()> { + Ok(()) + } + } + + #[test] + fn test_field_collection() { + let clock = Arc::new(TestClock { + current_time: Mutex::new(Utc::now()), + }); + let buffer = Arc::new(Mutex::new(Vec::new())); + let log_layer = JsonLoggingLayer { + clock: clock.clone(), + skipped_field_indices: papaya::HashMap::default(), + writer: buffer.clone(), + }; + + let registry = tracing_subscriber::Registry::default().with(log_layer); + + tracing::subscriber::with_default(registry, || { + info_span!("span1", x = 40, x = 41, x = 42).in_scope(|| { + info_span!("span2").in_scope(|| { + tracing::error!( + a = 1, + a = 2, + a = 3, + message = "explicit message field", + "implicit message field" + ); + }); + }); + }); + + let buffer = Arc::try_unwrap(buffer) + .expect("no other reference") + .into_inner() + .expect("poisoned"); + let actual: serde_json::Value = serde_json::from_slice(&buffer).expect("valid JSON"); + let expected: serde_json::Value = serde_json::json!( + { + "timestamp": clock.now().to_rfc3339_opts(chrono::SecondsFormat::Micros, true), + "level": "ERROR", + "message": "explicit message field", + "fields": { + "a": 3, + }, + "spans": { + "00":{ + "span_id": "0000000000000001", + "span_name": "span1", + "x": 42, + }, + "01": { + "span_id": "0000000000000002", + "span_name": "span2", + } + }, + "src": actual.as_object().unwrap().get("src").unwrap().as_str().unwrap(), + "target": "proxy::logging::tests", + "process_id": actual.as_object().unwrap().get("process_id").unwrap().as_number().unwrap(), + "thread_id": actual.as_object().unwrap().get("thread_id").unwrap().as_number().unwrap(), + "thread_name": "logging::tests::test_field_collection", + } + ); + + assert_json_eq!(actual, expected); + } +} diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index a3dffa8f19..2c65401154 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -92,6 +92,7 @@ tonic = { version = "0.12", default-features = false, features = ["codegen", "pr tower = { version = "0.4", default-features = false, features = ["balance", "buffer", "limit", "util"] } tracing = { version = "0.1", features = ["log"] } tracing-core = { version = "0.1" } +tracing-log = { version = "0.2" } url = { version = "2", features = ["serde"] } zerocopy = { version = "0.7", features = ["derive", "simd"] } zeroize = { version = "1", features = ["derive", "serde"] } From 06090bbccdf0d2b89f6a355afd42b352fadd40d6 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 4 Feb 2025 
15:55:11 +0100 Subject: [PATCH 365/583] pageserver: log critical error on `ClearVmBits` for unknown pages (#10634) ## Problem In #9895, we fixed some issues where `ClearVmBits` were broadcast to all shards, even those not owning the VM relation. As part of that, we found some ancient code from #1417, which discarded spurious incorrect `ClearVmBits` records for pages outside of the VM relation. We added observability in #9911 to see how often this actually happens in the wild. After two months, we have not seen this happen once in production or staging. However, out of caution, we don't want a hard error and break WAL ingestion. Resolves #10067. ## Summary of changes Log a critical error when ingesting `ClearVmBits` for unknown VM relations or pages. --- pageserver/src/metrics.rs | 7 --- pageserver/src/walingest.rs | 117 +++++++++++++++--------------------- 2 files changed, 49 insertions(+), 75 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index f9edf88553..48aed70826 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -2379,7 +2379,6 @@ pub(crate) struct WalIngestMetrics { pub(crate) records_committed: IntCounter, pub(crate) records_filtered: IntCounter, pub(crate) gap_blocks_zeroed_on_rel_extend: IntCounter, - pub(crate) clear_vm_bits_unknown: IntCounterVec, } pub(crate) static WAL_INGEST: Lazy = Lazy::new(|| { @@ -2414,12 +2413,6 @@ pub(crate) static WAL_INGEST: Lazy = Lazy::new(|| { "Total number of zero gap blocks written on relation extends" ) .expect("failed to define a metric"), - clear_vm_bits_unknown: register_int_counter_vec!( - "pageserver_wal_ingest_clear_vm_bits_unknown", - "Number of ignored ClearVmBits operations due to unknown pages/relations", - &["entity"], - ) - .expect("failed to define a metric"), } }); diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index e0283d99e0..04edb3e3f4 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -28,17 +28,9 @@ use std::time::Duration; use std::time::Instant; use std::time::SystemTime; -use pageserver_api::shard::ShardIdentity; -use postgres_ffi::fsm_logical_to_physical; -use postgres_ffi::walrecord::*; -use postgres_ffi::{dispatch_pgversion, enum_pgversion, enum_pgversion_dispatch, TimestampTz}; -use wal_decoder::models::*; - use anyhow::{bail, Result}; use bytes::{Buf, Bytes}; use tracing::*; -use utils::failpoint_support; -use utils::rate_limit::RateLimit; use crate::context::RequestContext; use crate::metrics::WAL_INGEST; @@ -50,11 +42,18 @@ use crate::ZERO_PAGE; use pageserver_api::key::rel_block_to_key; use pageserver_api::record::NeonWalRecord; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; +use pageserver_api::shard::ShardIdentity; +use postgres_ffi::fsm_logical_to_physical; use postgres_ffi::pg_constants; use postgres_ffi::relfile_utils::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM}; +use postgres_ffi::walrecord::*; use postgres_ffi::TransactionId; +use postgres_ffi::{dispatch_pgversion, enum_pgversion, enum_pgversion_dispatch, TimestampTz}; use utils::bin_ser::SerializeError; use utils::lsn::Lsn; +use utils::rate_limit::RateLimit; +use utils::{critical, failpoint_support}; +use wal_decoder::models::*; enum_pgversion! 
{CheckPoint, pgv::CheckPoint} @@ -327,93 +326,75 @@ impl WalIngest { let mut new_vm_blk = new_heap_blkno.map(pg_constants::HEAPBLK_TO_MAPBLOCK); let mut old_vm_blk = old_heap_blkno.map(pg_constants::HEAPBLK_TO_MAPBLOCK); - // Sometimes, Postgres seems to create heap WAL records with the - // ALL_VISIBLE_CLEARED flag set, even though the bit in the VM page is - // not set. In fact, it's possible that the VM page does not exist at all. - // In that case, we don't want to store a record to clear the VM bit; - // replaying it would fail to find the previous image of the page, because - // it doesn't exist. So check if the VM page(s) exist, and skip the WAL - // record if it doesn't. - // - // TODO: analyze the metrics and tighten this up accordingly. This logic - // implicitly assumes that VM pages see explicit WAL writes before - // implicit ClearVmBits, and will otherwise silently drop updates. + // VM bits can only be cleared on the shard(s) owning the VM relation, and must be within + // its view of the VM relation size. Out of caution, error instead of failing WAL ingestion, + // as there has historically been cases where PostgreSQL has cleared spurious VM pages. See: + // https://github.com/neondatabase/neon/pull/10634. let Some(vm_size) = get_relsize(modification, vm_rel, ctx).await? else { - WAL_INGEST - .clear_vm_bits_unknown - .with_label_values(&["relation"]) - .inc(); + critical!("clear_vm_bits for unknown VM relation {vm_rel}"); return Ok(()); }; if let Some(blknum) = new_vm_blk { if blknum >= vm_size { - WAL_INGEST - .clear_vm_bits_unknown - .with_label_values(&["new_page"]) - .inc(); + critical!("new_vm_blk {blknum} not in {vm_rel} of size {vm_size}"); new_vm_blk = None; } } if let Some(blknum) = old_vm_blk { if blknum >= vm_size { - WAL_INGEST - .clear_vm_bits_unknown - .with_label_values(&["old_page"]) - .inc(); + critical!("old_vm_blk {blknum} not in {vm_rel} of size {vm_size}"); old_vm_blk = None; } } - if new_vm_blk.is_some() || old_vm_blk.is_some() { - if new_vm_blk == old_vm_blk { - // An UPDATE record that needs to clear the bits for both old and the - // new page, both of which reside on the same VM page. + if new_vm_blk.is_none() && old_vm_blk.is_none() { + return Ok(()); + } else if new_vm_blk == old_vm_blk { + // An UPDATE record that needs to clear the bits for both old and the new page, both of + // which reside on the same VM page. + self.put_rel_wal_record( + modification, + vm_rel, + new_vm_blk.unwrap(), + NeonWalRecord::ClearVisibilityMapFlags { + new_heap_blkno, + old_heap_blkno, + flags, + }, + ctx, + ) + .await?; + } else { + // Clear VM bits for one heap page, or for two pages that reside on different VM pages. + if let Some(new_vm_blk) = new_vm_blk { self.put_rel_wal_record( modification, vm_rel, - new_vm_blk.unwrap(), + new_vm_blk, NeonWalRecord::ClearVisibilityMapFlags { new_heap_blkno, + old_heap_blkno: None, + flags, + }, + ctx, + ) + .await?; + } + if let Some(old_vm_blk) = old_vm_blk { + self.put_rel_wal_record( + modification, + vm_rel, + old_vm_blk, + NeonWalRecord::ClearVisibilityMapFlags { + new_heap_blkno: None, old_heap_blkno, flags, }, ctx, ) .await?; - } else { - // Clear VM bits for one heap page, or for two pages that reside on - // different VM pages. 
- if let Some(new_vm_blk) = new_vm_blk { - self.put_rel_wal_record( - modification, - vm_rel, - new_vm_blk, - NeonWalRecord::ClearVisibilityMapFlags { - new_heap_blkno, - old_heap_blkno: None, - flags, - }, - ctx, - ) - .await?; - } - if let Some(old_vm_blk) = old_vm_blk { - self.put_rel_wal_record( - modification, - vm_rel, - old_vm_blk, - NeonWalRecord::ClearVisibilityMapFlags { - new_heap_blkno: None, - old_heap_blkno, - flags, - }, - ctx, - ) - .await?; - } } } - Ok(()) } From cab60b6d9f97983ac8cbd904982cecd67edd2d92 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Tue, 4 Feb 2025 11:11:31 -0500 Subject: [PATCH 366/583] fix(pagesever): stablize gc-compaction tests (#10621) ## Problem Hopefully this can resolve https://github.com/neondatabase/neon/issues/10517. The reason why the test is flaky is that after restart the compute node might write some data so that the pageserver flush some layers, and in the end, causing L0 compaction to run, and we cannot get the test scenario as we want. ## Summary of changes Ensure all L0 layers are compacted before starting the test. Signed-off-by: Alex Chi Z --- test_runner/regress/test_compaction.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index 763a63c2e5..c031d66dfb 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -250,6 +250,9 @@ def test_pageserver_gc_compaction_idempotent( workload.churn_rows(row_count, env.pageserver.id) # compact 3 times if mode is before_restart n_compactions = 3 if compaction_mode == "before_restart" else 1 + ps_http.timeline_compact( + tenant_id, timeline_id, force_l0_compaction=True, wait_until_uploaded=True + ) for _ in range(n_compactions): # Force refresh gc info to have gc_cutoff generated ps_http.timeline_gc(tenant_id, timeline_id, None) From f9009d6b80f5351454eabdc3df5cbd137f95d5e2 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 4 Feb 2025 17:52:54 +0000 Subject: [PATCH 367/583] pageserver: write heatmap to disk after uploading it (#10650) ## Problem We wish to make heatmap generation additive in https://github.com/neondatabase/neon/pull/10597. However, if the pageserver restarts and has a heatmap on disk from when it was a secondary long ago, we can end up keeping extra layers on the secondary's disk. ## Summary of changes Persist the heatmap after a successful upload. 
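
(Editor's illustration, not part of the patch: persisting the heatmap uses the usual crash-safe overwrite pattern — write the new bytes to a temporary sibling file, flush them, then rename over the final path — so a restart never observes a half-written heatmap. A minimal standard-library sketch with made-up paths follows; the patch itself goes through `VirtualFile::crashsafe_overwrite` with a `path_with_suffix_extension` temp path, and only logs a warning on failure since the upload itself already succeeded.)

```rust
use std::fs;
use std::io::{self, Write};
use std::path::Path;

/// Crash-safe overwrite sketch: the final path either keeps its old content
/// or contains the complete new content, never a partial write.
fn crashsafe_overwrite(final_path: &Path, temp_path: &Path, content: &[u8]) -> io::Result<()> {
    let mut tmp = fs::File::create(temp_path)?;
    tmp.write_all(content)?;
    tmp.sync_all()?; // ensure the bytes are durable before the rename
    drop(tmp);
    fs::rename(temp_path, final_path) // atomic replacement on POSIX filesystems
}

fn main() -> io::Result<()> {
    // Hypothetical paths; the pageserver derives these from the tenant directory.
    crashsafe_overwrite(
        Path::new("/tmp/heatmap-v1.json"),
        Path::new("/tmp/heatmap-v1.json.___temp"),
        br#"{"timelines": []}"#,
    )
}
```
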
--- .../src/tenant/secondary/heatmap_uploader.rs | 22 ++++++++++++++++--- test_runner/fixtures/neon_fixtures.py | 5 +++++ .../regress/test_pageserver_secondary.py | 10 ++++++--- 3 files changed, 31 insertions(+), 6 deletions(-) diff --git a/pageserver/src/tenant/secondary/heatmap_uploader.rs b/pageserver/src/tenant/secondary/heatmap_uploader.rs index c5e5e04945..d72c337369 100644 --- a/pageserver/src/tenant/secondary/heatmap_uploader.rs +++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs @@ -9,13 +9,14 @@ use crate::{ metrics::SECONDARY_MODE, tenant::{ config::AttachmentMode, - mgr::GetTenantError, - mgr::TenantManager, + mgr::{GetTenantError, TenantManager}, remote_timeline_client::remote_heatmap_path, span::debug_assert_current_span_has_tenant_id, tasks::{warn_when_period_overrun, BackgroundLoopKind}, Tenant, }, + virtual_file::VirtualFile, + TEMP_FILE_SUFFIX, }; use futures::Future; @@ -32,7 +33,10 @@ use super::{ }; use tokio_util::sync::CancellationToken; use tracing::{info_span, instrument, Instrument}; -use utils::{backoff, completion::Barrier, yielding_loop::yielding_loop}; +use utils::{ + backoff, completion::Barrier, crashsafe::path_with_suffix_extension, + yielding_loop::yielding_loop, +}; pub(super) async fn heatmap_uploader_task( tenant_manager: Arc, @@ -461,6 +465,18 @@ async fn upload_tenant_heatmap( } } + // After a successful upload persist the fresh heatmap to disk. + // When restarting, the tenant will read the heatmap from disk + // and additively generate a new heatmap (see [`Timeline::generate_heatmap`]). + // If the heatmap is stale, the additive generation can lead to keeping previously + // evicted timelines on the secondarie's disk. + let tenant_shard_id = tenant.get_tenant_shard_id(); + let heatmap_path = tenant.conf.tenant_heatmap_path(tenant_shard_id); + let temp_path = path_with_suffix_extension(&heatmap_path, TEMP_FILE_SUFFIX); + if let Err(err) = VirtualFile::crashsafe_overwrite(heatmap_path, temp_path, bytes).await { + tracing::warn!("Non fatal IO error writing to disk after heatmap upload: {err}"); + } + tracing::info!("Successfully uploaded {size} byte heatmap to {path}"); Ok(UploadHeatmapOutcome::Uploaded(LastUploadState { diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 8909f7f249..7c4991ffab 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2766,6 +2766,11 @@ class NeonPageserver(PgProtocol, LogUtils): log.error(f"Failed to decode LocationConf, raw content ({len(bytes)} bytes): {bytes}") raise + def heatmap_content(self, tenant_shard_id: TenantId | TenantShardId) -> Any: + path = self.tenant_dir(tenant_shard_id) / "heatmap-v1.json" + with open(path) as f: + return json.load(f) + def tenant_create( self, tenant_id: TenantId, diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 1292682f9e..590093d23c 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -443,7 +443,7 @@ def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder): workload.write_rows(256, env.pageservers[0].id) env.pageserver.http_client().tenant_heatmap_upload(tenant_id) - def validate_heatmap(heatmap): + def validate_heatmap(heatmap, on_disk_heatmap): assert len(heatmap["timelines"]) == 1 assert heatmap["timelines"][0]["timeline_id"] == str(timeline_id) assert len(heatmap["timelines"][0]["layers"]) > 0 @@ -452,10 +452,13 @@ def 
test_heatmap_uploads(neon_env_builder: NeonEnvBuilder): # Each layer appears at most once assert len(set(layer["name"] for layer in layers)) == len(layers) + assert heatmap == on_disk_heatmap + # Download and inspect the heatmap that the pageserver uploaded heatmap_first = env.pageserver_remote_storage.heatmap_content(tenant_id) + heatmap_first_on_disk = env.pageserver.heatmap_content(tenant_id) log.info(f"Read back heatmap: {heatmap_first}") - validate_heatmap(heatmap_first) + validate_heatmap(heatmap_first, heatmap_first_on_disk) # Do some more I/O to generate more layers workload.churn_rows(64, env.pageservers[0].id) @@ -463,9 +466,10 @@ def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder): # Ensure that another heatmap upload includes the new layers heatmap_second = env.pageserver_remote_storage.heatmap_content(tenant_id) + heatmap_second_on_disk = env.pageserver.heatmap_content(tenant_id) log.info(f"Read back heatmap: {heatmap_second}") assert heatmap_second != heatmap_first - validate_heatmap(heatmap_second) + validate_heatmap(heatmap_second, heatmap_second_on_disk) def list_elegible_layers( From 472007dd7ce5c5b861b29870a852ad7c99e1ea01 Mon Sep 17 00:00:00 2001 From: Fedor Dikarev Date: Tue, 4 Feb 2025 19:58:02 +0100 Subject: [PATCH 368/583] ci: unify Dockerfiles, set bash as SHELL for debian layers, make cpan step as separate RUN (#10645) ## Problem Ref: https://github.com/neondatabase/cloud/issues/23461 and follow-up after: https://github.com/neondatabase/neon/pull/10553 we used `echo` to set-up `.wgetrc` and `.curlrc`, and there we used `\n` to make these multiline configs with one echo command. The problem is that Debian `/bin/sh`'s built-in echo command behaves differently from the `/bin/echo` executable and from the `echo` built-in in `bash`. Namely, it does not support the`-e` option, and while it does treat `\n` as a newline, passing `-e` here will add that `-e` to the output. At the same time, when we use different base images, for example `alpine/curl`, their `/bin/sh` supports and requires `-e` for treating escape sequences like `\n`. But having different `echo` and remembering difference in their behaviour isn't best experience for the developer and makes bad experience maintaining Dockerfiles. Work-arounds: - Explicitly use `/bin/bash` (like in this PR) - Use `/bin/echo` instead of the shell's built-in echo function - Use printf "foo\n" instead of echo -e "foo\n" ## Summary of changes 1. To fix that, we process with the option setting `/bin/bash` as a SHELL for the debian-baysed layers 2. With no changes for `alpine/curl` based layers. 3. And one more change here: in `extensions` layer split to the 2 steps: installing dependencies from `CPAN` and installing `lcov` from github, so upgrading `lcov` could reuse previous layer with installed cpan modules. --- build-tools.Dockerfile | 22 +++++++++++++++++----- compute/compute-node.Dockerfile | 10 +++++++++- 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile index f744b44808..3ade57b175 100644 --- a/build-tools.Dockerfile +++ b/build-tools.Dockerfile @@ -3,8 +3,13 @@ ARG DEBIAN_VERSION=bookworm FROM debian:bookworm-slim AS pgcopydb_builder ARG DEBIAN_VERSION +# Use strict mode for bash to catch errors early +SHELL ["/bin/bash", "-euo", "pipefail", "-c"] + +# By default, /bin/sh used in debian images will treat '\n' as eol, +# but as we use bash as SHELL, and built-in echo in bash requires '-e' flag for that. 
RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \ - echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc \ + echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc && \ echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \ @@ -55,7 +60,8 @@ ARG DEBIAN_VERSION # Add nonroot user RUN useradd -ms /bin/bash nonroot -b /home -SHELL ["/bin/bash", "-c"] +# Use strict mode for bash to catch errors early +SHELL ["/bin/bash", "-euo", "pipefail", "-c"] RUN mkdir -p /pgcopydb/bin && \ mkdir -p /pgcopydb/lib && \ @@ -66,7 +72,7 @@ COPY --from=pgcopydb_builder /usr/lib/postgresql/16/bin/pgcopydb /pgcopydb/bin/p COPY --from=pgcopydb_builder /pgcopydb/lib/libpq.so.5 /pgcopydb/lib/libpq.so.5 RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \ - echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc \ + echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc && \ echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc # System deps @@ -190,8 +196,14 @@ RUN set -e \ # It includes several bug fixes on top on v2.0 release (https://github.com/linux-test-project/lcov/compare/v2.0...master) # And patches from us: # - Generates json file with code coverage summary (https://github.com/neondatabase/lcov/commit/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz) -RUN for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JSON::XS Memory::Process Time::HiRes JSON; do yes | perl -MCPAN -e "CPAN::Shell->notest('install', '$package')"; done \ - && wget https://github.com/neondatabase/lcov/archive/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz -O lcov.tar.gz \ +RUN set +o pipefail && \ + for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JSON::XS Memory::Process Time::HiRes JSON; do \ + yes | perl -MCPAN -e "CPAN::Shell->notest('install', '$package')";\ + done && \ + set -o pipefail +# Split into separate step to debug flaky failures here +RUN wget https://github.com/neondatabase/lcov/archive/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz -O lcov.tar.gz \ + && ls -laht lcov.tar.gz && sha256sum lcov.tar.gz \ && echo "61a22a62e20908b8b9e27d890bd0ea31f567a7b9668065589266371dcbca0992 lcov.tar.gz" | sha256sum --check \ && mkdir -p lcov && tar -xzf lcov.tar.gz -C lcov --strip-components=1 \ && cd lcov \ diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index ea29630001..9379856eab 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -96,8 +96,10 @@ ARG DEBIAN_VERSION # Use strict mode for bash to catch errors early SHELL ["/bin/bash", "-euo", "pipefail", "-c"] +# By default, /bin/sh used in debian images will treat '\n' as eol, +# but as we use bash as SHELL, and built-in echo in bash requires '-e' flag for that. 
RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \ - echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc \ + echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc && \ echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc RUN case $DEBIAN_VERSION in \ @@ -1068,6 +1070,7 @@ ENV PATH="/home/nonroot/.cargo/bin:$PATH" USER nonroot WORKDIR /home/nonroot +# See comment on the top of the file regading `echo` and `\n` RUN echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /home/nonroot/.curlrc RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \ @@ -1584,6 +1587,7 @@ FROM alpine/curl:${ALPINE_CURL_VERSION} AS exporters ARG TARGETARCH # Keep sql_exporter version same as in build-tools.Dockerfile and # test_runner/regress/test_compute_metrics.py +# See comment on the top of the file regading `echo`, `-e` and `\n` RUN echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc; \ if [ "$TARGETARCH" = "amd64" ]; then\ postgres_exporter_sha256='027e75dda7af621237ff8f5ac66b78a40b0093595f06768612b92b1374bd3105';\ @@ -1700,6 +1704,10 @@ ENV PGDATABASE=postgres ######################################################################################### FROM debian:$DEBIAN_FLAVOR ARG DEBIAN_VERSION + +# Use strict mode for bash to catch errors early +SHELL ["/bin/bash", "-euo", "pipefail", "-c"] + # Add user postgres RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ echo "postgres:test_console_pass" | chpasswd && \ From 47975d06d98bde792b61cf1d1d397567f16b0b49 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 5 Feb 2025 12:41:09 +0000 Subject: [PATCH 369/583] storcon: silence cplane 404s on tenant creation (#10665) ## Problem We get WARN log noise on tenant creations. Cplane creates tenants via /location_config. That returns the attached locations in the response and spawns a reconciliation which will also attempt to notify cplane. If the notification is attempted before cplane persists the shards to its database, storcon gets back a 404. The situation is harmless, but annoying. ## Summary of Changes * Add a tenant creation hint to the reconciler config * If the hint is true and we get back a 404 on the notification from cplane, ignore the error, but still queue the reconcile up for a retry. 
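To make the intended handling concrete, here is a rough sketch (illustrative only; the enum and function below are simplified stand-ins for the real storcon types, and the actual change is in the `reconciler.rs` diff that follows):

```rust
// Simplified stand-ins for the real storcon types, for illustration only.
enum NotifyError {
    Unexpected(u16), // carries the HTTP status returned by the control plane
    ShuttingDown,
}

/// Decide whether a failed compute notification deserves a WARN log.
/// In all cases the caller still marks the shard so the reconcile is retried.
fn should_warn(err: &NotifyError, tenant_creation_hint: bool) -> bool {
    match err {
        // Expected during tenant creation: cplane has not persisted the shards yet.
        NotifyError::Unexpected(404) if tenant_creation_hint => false,
        NotifyError::ShuttingDown => false,
        _ => true,
    }
}
```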
Closes https://github.com/neondatabase/cloud/issues/20732 --- storage_controller/src/compute_hook.rs | 2 +- storage_controller/src/reconciler.rs | 50 ++++++++++++++++++++++---- storage_controller/src/service.rs | 7 +++- 3 files changed, 50 insertions(+), 9 deletions(-) diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs index 3884a6df46..5bc3c81f02 100644 --- a/storage_controller/src/compute_hook.rs +++ b/storage_controller/src/compute_hook.rs @@ -225,7 +225,7 @@ pub(crate) enum NotifyError { // We shutdown while sending #[error("Shutting down")] ShuttingDown, - // A response indicates we will never succeed, such as 400 or 404 + // A response indicates we will never succeed, such as 400 or 403 #[error("Non-retryable error {0}")] Fatal(StatusCode), diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 03db947263..58bc0ba1cd 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -115,6 +115,15 @@ impl ReconcilerConfigBuilder { } } + pub(crate) fn tenant_creation_hint(self, hint: bool) -> Self { + Self { + config: ReconcilerConfig { + tenant_creation_hint: hint, + ..self.config + }, + } + } + pub(crate) fn build(self) -> ReconcilerConfig { self.config } @@ -129,6 +138,10 @@ pub(crate) struct ReconcilerConfig { // During live migrations this is the amount of time that // the pagserver will hold our poll. secondary_download_request_timeout: Option, + + // A hint indicating whether this reconciliation is done on the + // creation of a new tenant. This only informs logging behaviour. + tenant_creation_hint: bool, } impl ReconcilerConfig { @@ -143,6 +156,10 @@ impl ReconcilerConfig { self.secondary_download_request_timeout .unwrap_or(SECONDARY_DOWNLOAD_REQUEST_TIMEOUT_DEFAULT) } + + pub(crate) fn tenant_creation_hint(&self) -> bool { + self.tenant_creation_hint + } } /// RAII resource units granted to a Reconciler, which it should keep alive until it finishes doing I/O @@ -934,16 +951,35 @@ impl Reconciler { ) .await; if let Err(e) = &result { - // It is up to the caller whether they want to drop out on this error, but they don't have to: - // in general we should avoid letting unavailability of the cloud control plane stop us from - // making progress. - if !matches!(e, NotifyError::ShuttingDown) { - tracing::warn!("Failed to notify compute of attached pageserver {node}: {e}"); - } - // Set this flag so that in our ReconcileResult we will set the flag on the shard that it // needs to retry at some point. self.compute_notify_failure = true; + + // It is up to the caller whether they want to drop out on this error, but they don't have to: + // in general we should avoid letting unavailability of the cloud control plane stop us from + // making progress. + match e { + // 404s from cplane during tenant creation are expected. + // Cplane only persists the shards to the database after + // creating the tenant and the timeline. If we notify before + // that, we'll get a 404. + // + // This is fine because tenant creations happen via /location_config + // and that returns the list of locations in the response. Hence, we + // silence the error and return Ok(()) here. Reconciliation will still + // be retried because we set [`Reconciler::compute_notify_failure`] above. 
+ NotifyError::Unexpected(hyper::StatusCode::NOT_FOUND) + if self.reconciler_config.tenant_creation_hint() => + { + return Ok(()); + } + NotifyError::ShuttingDown => {} + _ => { + tracing::warn!( + "Failed to notify compute of attached pageserver {node}: {e}" + ); + } + } } result } else { diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 9ac9ee17ca..4028cd7023 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -2238,9 +2238,14 @@ impl Service { let waiters = { let mut locked = self.inner.write().unwrap(); let (nodes, tenants, _scheduler) = locked.parts_mut(); + let config = ReconcilerConfigBuilder::new() + .tenant_creation_hint(true) + .build(); tenants .range_mut(TenantShardId::tenant_range(tenant_id)) - .filter_map(|(_shard_id, shard)| self.maybe_reconcile_shard(shard, nodes)) + .filter_map(|(_shard_id, shard)| { + self.maybe_configured_reconcile_shard(shard, nodes, config) + }) .collect::>() }; From f07119cca798e24745b4eda21bde8c2376a22952 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Wed, 5 Feb 2025 15:33:04 +0100 Subject: [PATCH 370/583] pageserver: add `pageserver_wal_ingest_values_committed` metric (#10653) ## Problem We don't have visibility into the ratio of image vs. delta pages ingested in Pageservers. This might be useful to determine whether we should compress WAL records before storing them, which in turn might make compaction more efficient. ## Summary of changes Add `pageserver_wal_ingest_values_committed` metric with dimensions `class=metadata|data` and `kind=image|delta`. --- pageserver/src/metrics.rs | 35 +++++++++++++++++++ pageserver/src/pgdatadir_mapping.rs | 31 +++++++++++++++- .../walreceiver/walreceiver_connection.rs | 25 +++++++++---- 3 files changed, 84 insertions(+), 7 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 48aed70826..6ab1178a7b 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -32,6 +32,7 @@ use utils::id::TimelineId; use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext}; +use crate::pgdatadir_mapping::DatadirModificationStats; use crate::task_mgr::TaskKind; use crate::tenant::layer_map::LayerMap; use crate::tenant::mgr::TenantSlot; @@ -2378,10 +2379,40 @@ pub(crate) struct WalIngestMetrics { pub(crate) records_observed: IntCounter, pub(crate) records_committed: IntCounter, pub(crate) records_filtered: IntCounter, + pub(crate) values_committed_metadata_images: IntCounter, + pub(crate) values_committed_metadata_deltas: IntCounter, + pub(crate) values_committed_data_images: IntCounter, + pub(crate) values_committed_data_deltas: IntCounter, pub(crate) gap_blocks_zeroed_on_rel_extend: IntCounter, } +impl WalIngestMetrics { + pub(crate) fn inc_values_committed(&self, stats: &DatadirModificationStats) { + if stats.metadata_images > 0 { + self.values_committed_metadata_images + .inc_by(stats.metadata_images); + } + if stats.metadata_deltas > 0 { + self.values_committed_metadata_deltas + .inc_by(stats.metadata_deltas); + } + if stats.data_images > 0 { + self.values_committed_data_images.inc_by(stats.data_images); + } + if stats.data_deltas > 0 { + self.values_committed_data_deltas.inc_by(stats.data_deltas); + } + } +} + pub(crate) static WAL_INGEST: Lazy = Lazy::new(|| { + let values_committed = register_int_counter_vec!( + "pageserver_wal_ingest_values_committed", + "Number of values committed to pageserver storage from WAL records", + &["class", "kind"], + ) + .expect("failed to 
define a metric"); + WalIngestMetrics { bytes_received: register_int_counter!( "pageserver_wal_ingest_bytes_received", @@ -2408,6 +2439,10 @@ pub(crate) static WAL_INGEST: Lazy = Lazy::new(|| { "Number of WAL records filtered out due to sharding" ) .expect("failed to define a metric"), + values_committed_metadata_images: values_committed.with_label_values(&["metadata", "image"]), + values_committed_metadata_deltas: values_committed.with_label_values(&["metadata", "delta"]), + values_committed_data_images: values_committed.with_label_values(&["data", "image"]), + values_committed_data_deltas: values_committed.with_label_values(&["data", "delta"]), gap_blocks_zeroed_on_rel_extend: register_int_counter!( "pageserver_gap_blocks_zeroed_on_rel_extend", "Total number of zero gap blocks written on relation extends" diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 40c657524d..dcbf62b56c 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -48,7 +48,7 @@ use tracing::{debug, trace, warn}; use utils::bin_ser::DeserializeError; use utils::pausable_failpoint; use utils::{bin_ser::BeSer, lsn::Lsn}; -use wal_decoder::serialized_batch::SerializedValueBatch; +use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta}; /// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached. pub const MAX_AUX_FILE_DELTAS: usize = 1024; @@ -1297,6 +1297,26 @@ impl DatadirModification<'_> { .is_some_and(|b| b.has_data()) } + /// Returns statistics about the currently pending modifications. + pub(crate) fn stats(&self) -> DatadirModificationStats { + let mut stats = DatadirModificationStats::default(); + for (_, _, value) in self.pending_metadata_pages.values().flatten() { + match value { + Value::Image(_) => stats.metadata_images += 1, + Value::WalRecord(r) if r.will_init() => stats.metadata_images += 1, + Value::WalRecord(_) => stats.metadata_deltas += 1, + } + } + for valuemeta in self.pending_data_batch.iter().flat_map(|b| &b.metadata) { + match valuemeta { + ValueMeta::Serialized(s) if s.will_init => stats.data_images += 1, + ValueMeta::Serialized(_) => stats.data_deltas += 1, + ValueMeta::Observed(_) => {} + } + } + stats + } + /// Set the current lsn pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> { ensure!( @@ -2317,6 +2337,15 @@ impl DatadirModification<'_> { } } +/// Statistics for a DatadirModification. +#[derive(Default)] +pub struct DatadirModificationStats { + pub metadata_images: u64, + pub metadata_deltas: u64, + pub data_images: u64, + pub data_deltas: u64, +} + /// This struct facilitates accessing either a committed key from the timeline at a /// specific LSN, or the latest uncommitted key from a pending modification. /// diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index d69e7dbd32..de917377cb 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -355,6 +355,19 @@ pub(super) async fn handle_walreceiver_connection( // advances it to its end LSN. 0 is just an initialization placeholder. 
let mut modification = timeline.begin_modification(Lsn(0)); + async fn commit( + modification: &mut DatadirModification<'_>, + ctx: &RequestContext, + uncommitted: &mut u64, + ) -> anyhow::Result<()> { + let stats = modification.stats(); + modification.commit(ctx).await?; + WAL_INGEST.records_committed.inc_by(*uncommitted); + WAL_INGEST.inc_values_committed(&stats); + *uncommitted = 0; + Ok(()) + } + if !records.is_empty() { timeline .metrics @@ -366,8 +379,7 @@ pub(super) async fn handle_walreceiver_connection( if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes) && uncommitted_records > 0 { - modification.commit(&ctx).await?; - uncommitted_records = 0; + commit(&mut modification, &ctx, &mut uncommitted_records).await?; } let local_next_record_lsn = interpreted.next_record_lsn; @@ -396,8 +408,7 @@ pub(super) async fn handle_walreceiver_connection( || modification.approx_pending_bytes() > DatadirModification::MAX_PENDING_BYTES { - modification.commit(&ctx).await?; - uncommitted_records = 0; + commit(&mut modification, &ctx, &mut uncommitted_records).await?; } } @@ -415,7 +426,7 @@ pub(super) async fn handle_walreceiver_connection( if uncommitted_records > 0 || needs_last_record_lsn_advance { // Commit any uncommitted records - modification.commit(&ctx).await?; + commit(&mut modification, &ctx, &mut uncommitted_records).await?; } if !caught_up && streaming_lsn >= end_of_wal { @@ -442,10 +453,12 @@ pub(super) async fn handle_walreceiver_connection( filtered: &mut u64, ctx: &RequestContext, ) -> anyhow::Result<()> { + let stats = modification.stats(); + modification.commit(ctx).await?; WAL_INGEST .records_committed .inc_by(*uncommitted - *filtered); - modification.commit(ctx).await?; + WAL_INGEST.inc_values_committed(&stats); *uncommitted = 0; *filtered = 0; Ok(()) From ebc55e6ae87723a95303e62e9e7b16dae218676c Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Wed, 5 Feb 2025 08:58:33 -0600 Subject: [PATCH 371/583] Fix logic for checking if a compute can install a remote extension (#10656) Given a remote extensions manifest of the following: ```json { "public_extensions": [], "custom_extensions": null, "library_index": { "pg_search": "pg_search" }, "extension_data": { "pg_search": { "control_data": { "pg_search.control": "comment = 'pg_search: Full text search for PostgreSQL using BM25'\ndefault_version = '0.14.1'\nmodule_pathname = '$libdir/pg_search'\nrelocatable = false\nsuperuser = true\nschema = paradedb\ntrusted = true\n" }, "archive_path": "13117844657/v14/extensions/pg_search.tar.zst" } } } ``` We were allowing a compute to install a remote extension that wasn't listed in either public_extensions or custom_extensions. Signed-off-by: Tristan Partin --- libs/compute_api/src/spec.rs | 108 ++++++++++++++++-- .../regress/test_download_extensions.py | 2 + 2 files changed, 102 insertions(+), 8 deletions(-) diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index b3f18dc6da..2fc95c47c6 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -204,14 +204,16 @@ impl RemoteExtSpec { // Check if extension is present in public or custom. // If not, then it is not allowed to be used by this compute. 
- if let Some(public_extensions) = &self.public_extensions { - if !public_extensions.contains(&real_ext_name.to_string()) { - if let Some(custom_extensions) = &self.custom_extensions { - if !custom_extensions.contains(&real_ext_name.to_string()) { - return Err(anyhow::anyhow!("extension {} is not found", real_ext_name)); - } - } - } + if !self + .public_extensions + .as_ref() + .is_some_and(|exts| exts.iter().any(|e| e == ext_name)) + && !self + .custom_extensions + .as_ref() + .is_some_and(|exts| exts.iter().any(|e| e == ext_name)) + { + return Err(anyhow::anyhow!("extension {} is not found", real_ext_name)); } match self.extension_data.get(real_ext_name) { @@ -340,6 +342,96 @@ mod tests { use super::*; use std::fs::File; + #[test] + fn allow_installing_remote_extensions() { + let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({ + "public_extensions": null, + "custom_extensions": null, + "library_index": {}, + "extension_data": {}, + })) + .unwrap(); + + rspec + .get_ext("ext", false, "latest", "v17") + .expect_err("Extension should not be found"); + + let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({ + "public_extensions": [], + "custom_extensions": null, + "library_index": {}, + "extension_data": {}, + })) + .unwrap(); + + rspec + .get_ext("ext", false, "latest", "v17") + .expect_err("Extension should not be found"); + + let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({ + "public_extensions": [], + "custom_extensions": [], + "library_index": { + "ext": "ext" + }, + "extension_data": { + "ext": { + "control_data": { + "ext.control": "" + }, + "archive_path": "" + } + }, + })) + .unwrap(); + + rspec + .get_ext("ext", false, "latest", "v17") + .expect_err("Extension should not be found"); + + let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({ + "public_extensions": [], + "custom_extensions": ["ext"], + "library_index": { + "ext": "ext" + }, + "extension_data": { + "ext": { + "control_data": { + "ext.control": "" + }, + "archive_path": "" + } + }, + })) + .unwrap(); + + rspec + .get_ext("ext", false, "latest", "v17") + .expect("Extension should be found"); + + let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({ + "public_extensions": ["ext"], + "custom_extensions": [], + "library_index": { + "ext": "ext" + }, + "extension_data": { + "ext": { + "control_data": { + "ext.control": "" + }, + "archive_path": "" + } + }, + })) + .unwrap(); + + rspec + .get_ext("ext", false, "latest", "v17") + .expect("Extension should be found"); + } + #[test] fn parse_spec_file() { let file = File::open("tests/cluster_spec.json").unwrap(); diff --git a/test_runner/regress/test_download_extensions.py b/test_runner/regress/test_download_extensions.py index d7e6e9de56..7f12c14073 100644 --- a/test_runner/regress/test_download_extensions.py +++ b/test_runner/regress/test_download_extensions.py @@ -95,6 +95,8 @@ def test_remote_extensions( # mock remote_extensions spec spec: dict[str, Any] = { + "public_extensions": ["anon"], + "custom_extensions": None, "library_index": { "anon": "anon", }, From 14e05276a3e0882777c4ee38252bed25324c93e8 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 5 Feb 2025 16:05:12 +0000 Subject: [PATCH 372/583] storcon: fix a case where optimise could get stuck on unschedulable node (#10648) ## Problem When a shard has two secondary locations, but one of them is on a node with MaySchedule::No, the optimiser would get stuck, because it couldn't decide which secondary to remove. 
This is generally okay if a node is offline, but if a node is in Pause mode for a long period of time, it's a problem. Closes: https://github.com/neondatabase/neon/issues/10646 ## Summary of changes - Instead of insisting on finding a node in the wrong AZ to remove, find an available node in the _right_ AZ, and remove all the others. This ensures that if there is one live suitable node, then other offline/paused nodes cannot hold things up. --- storage_controller/src/tenant_shard.rs | 150 +++++++++++++++++++++++-- 1 file changed, 141 insertions(+), 9 deletions(-) diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index 302104dc97..219c0dffe7 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -707,6 +707,7 @@ impl TenantShard { if let Some(node_id) = self.intent.get_attached() { // Populate secondary by demoting the attached node self.intent.demote_attached(scheduler, *node_id); + modified = true; } else if self.intent.secondary.is_empty() { // Populate secondary by scheduling a fresh node @@ -979,24 +980,51 @@ impl TenantShard { ), ) }) - .collect::>(); + .collect::>(); if secondary_scores.iter().any(|score| score.1.is_none()) { - // Don't have full list of scores, so can't make a good decision about which to drop unless - // there is an obvious one in the wrong AZ - for secondary in self.intent.get_secondary() { - if scheduler.get_node_az(secondary) == self.intent.preferred_az_id { + // Trivial case: if we only have one secondary, drop that one + if self.intent.get_secondary().len() == 1 { + return Some(ScheduleOptimization { + sequence: self.sequence, + action: ScheduleOptimizationAction::RemoveSecondary( + *self.intent.get_secondary().first().unwrap(), + ), + }); + } + + // Try to find a "good" secondary to keep, without relying on scores (one or more nodes is in a state + // where its score can't be calculated), and drop the others. This enables us to make progress in + // most cases, even if some nodes are offline or have scheduling=pause set. + + debug_assert!(self.intent.attached.is_some()); // We should not make it here unless attached -- this + // logic presumes we are in a mode where we want secondaries to be in non-home AZ + if let Some(retain_secondary) = self.intent.get_secondary().iter().find(|n| { + let in_home_az = scheduler.get_node_az(n) == self.intent.preferred_az_id; + let is_available = secondary_scores + .get(n) + .expect("Built from same list of nodes") + .is_some(); + is_available && !in_home_az + }) { + // Great, we found one to retain. Pick some other to drop. + if let Some(victim) = self + .intent + .get_secondary() + .iter() + .find(|n| n != &retain_secondary) + { return Some(ScheduleOptimization { sequence: self.sequence, - action: ScheduleOptimizationAction::RemoveSecondary(*secondary), + action: ScheduleOptimizationAction::RemoveSecondary(*victim), }); } } // Fall through: we didn't identify one to remove. This ought to be rare. 
tracing::warn!("Keeping extra secondaries: can't determine which of {:?} to remove (some nodes offline?)", - self.intent.get_secondary() - ); + self.intent.get_secondary() + ); } else { let victim = secondary_scores .iter() @@ -1005,7 +1033,7 @@ impl TenantShard { .0; return Some(ScheduleOptimization { sequence: self.sequence, - action: ScheduleOptimizationAction::RemoveSecondary(victim), + action: ScheduleOptimizationAction::RemoveSecondary(*victim), }); } } @@ -2379,6 +2407,110 @@ pub(crate) mod tests { Ok(()) } + /// Test how the optimisation code behaves with an extra secondary + #[test] + fn optimize_removes_secondary() -> anyhow::Result<()> { + let az_a_tag = AvailabilityZone("az-a".to_string()); + let az_b_tag = AvailabilityZone("az-b".to_string()); + let mut nodes = make_test_nodes( + 4, + &[ + az_a_tag.clone(), + az_b_tag.clone(), + az_a_tag.clone(), + az_b_tag.clone(), + ], + ); + let mut scheduler = Scheduler::new(nodes.values()); + + let mut schedule_context = ScheduleContext::default(); + + let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1)); + shard_a.intent.preferred_az_id = Some(az_a_tag.clone()); + shard_a + .schedule(&mut scheduler, &mut schedule_context) + .unwrap(); + + // Attached on node 1, secondary on node 2 + assert_eq!(shard_a.intent.get_attached(), &Some(NodeId(1))); + assert_eq!(shard_a.intent.get_secondary(), &vec![NodeId(2)]); + + // Initially optimiser is idle + assert_eq!( + shard_a.optimize_attachment(&mut scheduler, &schedule_context), + None + ); + assert_eq!( + shard_a.optimize_secondary(&mut scheduler, &schedule_context), + None + ); + + // A spare secondary in the home AZ: it should be removed -- this is the situation when we're midway through a graceful migration, after cutting over + // to our new location + shard_a.intent.push_secondary(&mut scheduler, NodeId(3)); + let optimization = shard_a.optimize_attachment(&mut scheduler, &schedule_context); + assert_eq!( + optimization, + Some(ScheduleOptimization { + sequence: shard_a.sequence, + action: ScheduleOptimizationAction::RemoveSecondary(NodeId(3)) + }) + ); + shard_a.apply_optimization(&mut scheduler, optimization.unwrap()); + + // A spare secondary in the non-home AZ, and one of them is offline + shard_a.intent.push_secondary(&mut scheduler, NodeId(4)); + nodes + .get_mut(&NodeId(4)) + .unwrap() + .set_availability(NodeAvailability::Offline); + scheduler.node_upsert(nodes.get(&NodeId(4)).unwrap()); + let optimization = shard_a.optimize_attachment(&mut scheduler, &schedule_context); + assert_eq!( + optimization, + Some(ScheduleOptimization { + sequence: shard_a.sequence, + action: ScheduleOptimizationAction::RemoveSecondary(NodeId(4)) + }) + ); + shard_a.apply_optimization(&mut scheduler, optimization.unwrap()); + + // A spare secondary when should have none + shard_a.policy = PlacementPolicy::Attached(0); + let optimization = shard_a.optimize_attachment(&mut scheduler, &schedule_context); + assert_eq!( + optimization, + Some(ScheduleOptimization { + sequence: shard_a.sequence, + action: ScheduleOptimizationAction::RemoveSecondary(NodeId(2)) + }) + ); + shard_a.apply_optimization(&mut scheduler, optimization.unwrap()); + assert_eq!(shard_a.intent.get_attached(), &Some(NodeId(1))); + assert_eq!(shard_a.intent.get_secondary(), &vec![]); + + // Check that in secondary mode, we preserve the secondary in the preferred AZ + let mut schedule_context = ScheduleContext::default(); // Fresh context, we're about to call schedule() + shard_a.policy = PlacementPolicy::Secondary; + shard_a + 
.schedule(&mut scheduler, &mut schedule_context) + .unwrap(); + assert_eq!(shard_a.intent.get_attached(), &None); + assert_eq!(shard_a.intent.get_secondary(), &vec![NodeId(1)]); + assert_eq!( + shard_a.optimize_attachment(&mut scheduler, &schedule_context), + None + ); + assert_eq!( + shard_a.optimize_secondary(&mut scheduler, &schedule_context), + None + ); + + shard_a.intent.clear(&mut scheduler); + + Ok(()) + } + // Optimize til quiescent: this emulates what Service::optimize_all does, when // called repeatedly in the background. // Returns the applied optimizations From fba22a7123a444970dd053c09e942a9a2e237a10 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Wed, 5 Feb 2025 20:00:26 +0300 Subject: [PATCH 373/583] Record more timings in test_layer_map (#10670) ## Problem It is not very clear how much time different operations take. ## Summary of changes Record more timings. ref https://github.com/neondatabase/neon/issues/10409 --- test_runner/performance/test_layer_map.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/test_runner/performance/test_layer_map.py b/test_runner/performance/test_layer_map.py index efc7fa59db..6c00944005 100644 --- a/test_runner/performance/test_layer_map.py +++ b/test_runner/performance/test_layer_map.py @@ -34,16 +34,20 @@ def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark): cur.execute("set log_statement = 'all'") cur.execute("create table t(x integer)") for _ in range(n_iters): - cur.execute(f"insert into t values (generate_series(1,{n_records}))") + with zenbenchmark.record_duration(f"insert into t values (generate_series(1,{n_records}))"): + cur.execute(f"insert into t values (generate_series(1,{n_records}))") time.sleep(1) - cur.execute("vacuum t") + with zenbenchmark.record_duration("vacuum t"): + cur.execute("vacuum t") - with zenbenchmark.record_duration("test_query"): + with zenbenchmark.record_duration("SELECT count(*) from t"): cur.execute("SELECT count(*) from t") assert cur.fetchone() == (n_iters * n_records,) - flush_ep_to_pageserver(env, endpoint, tenant, timeline) - env.pageserver.http_client().timeline_checkpoint( - tenant, timeline, compact=False, wait_until_uploaded=True - ) + with zenbenchmark.record_duration("flush_ep_to_pageserver"): + flush_ep_to_pageserver(env, endpoint, tenant, timeline) + with zenbenchmark.record_duration("timeline_checkpoint"): + env.pageserver.http_client().timeline_checkpoint( + tenant, timeline, compact=False, wait_until_uploaded=True + ) From 133b89a83db9d49a802efc5a66a7fde65b7c8d5e Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Wed, 5 Feb 2025 12:35:39 -0500 Subject: [PATCH 374/583] feat(pageserver): continue from last incomplete image layer creation (#10660) ## Problem close https://github.com/neondatabase/neon/issues/10651 ## Summary of changes * Image layer creation starts from the next partition of the last processed partition if the previous attempt was not complete (see the sketch below). * Add tests.
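The resume logic can be sketched roughly as follows (a simplified illustration with integer keys standing in for the real `Key`/keyspace partition types; the actual implementation is in the `timeline.rs` diff below):

```rust
/// Given the keyspace partitions (as [start, end] key ranges) and the last key
/// processed by an incomplete previous attempt, return the partitions that the
/// next image-layer creation pass should handle.
fn resume_from(partitions: &[(u64, u64)], last_key: Option<u64>) -> &[(u64, u64)] {
    let Some(last_key) = last_key else {
        // Previous attempt completed (or this is the first attempt): start from scratch.
        return partitions;
    };
    for (i, (_start, end)) in partitions.iter().enumerate() {
        if last_key <= *end {
            // Skip the partition containing `last_key` as well, so we always make
            // progress even if that partition has grown since the last attempt.
            return if i + 1 < partitions.len() {
                &partitions[i + 1..]
            } else {
                &[] // last_key was in the final partition: nothing left to do
            };
        }
    }
    &[] // last_key is past every partition: nothing left to do
}
```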
--------- Signed-off-by: Alex Chi Z --- pageserver/src/tenant/timeline.rs | 92 ++++++++++++++++---- pageserver/src/tenant/timeline/compaction.rs | 2 +- test_runner/regress/test_compaction.py | 56 +++++++++++- 3 files changed, 129 insertions(+), 21 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 11c0bbdfe5..b6a349a209 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -192,7 +192,12 @@ pub enum ImageLayerCreationMode { #[derive(Clone, Debug, Default)] pub enum LastImageLayerCreationStatus { - Incomplete, // TODO: record the last key being processed + Incomplete { + /// The last key of the partition (exclusive) that was processed in the last + /// image layer creation attempt. We will continue from this key in the next + /// attempt. + last_key: Key, + }, Complete, #[default] Initial, @@ -4346,7 +4351,7 @@ impl Timeline { Ok(result) } - // Is it time to create a new image layer for the given partition? + // Is it time to create a new image layer for the given partition? True if we want to generate. async fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> bool { let threshold = self.get_image_creation_threshold(); @@ -4658,6 +4663,11 @@ impl Timeline { ) -> Result<(Vec, LastImageLayerCreationStatus), CreateImageLayersError> { let timer = self.metrics.create_images_time_histo.start_timer(); + if partitioning.parts.is_empty() { + warn!("no partitions to create image layers for"); + return Ok((vec![], LastImageLayerCreationStatus::Complete)); + } + // We need to avoid holes between generated image layers. // Otherwise LayerMap::image_layer_exists will return false if key range of some layer is covered by more than one // image layer with hole between them. In this case such layer can not be utilized by GC. @@ -4669,28 +4679,65 @@ impl Timeline { // image layers <100000000..100000099> and <200000000..200000199> are not completely covering it. let mut start = Key::MIN; - let check_for_image_layers = if let LastImageLayerCreationStatus::Incomplete = last_status { - info!( - "resuming image layer creation: last_status={:?}", - last_status - ); - true - } else { - self.should_check_if_image_layers_required(lsn) - }; + let check_for_image_layers = + if let LastImageLayerCreationStatus::Incomplete { last_key } = last_status { + info!( + "resuming image layer creation: last_status=incomplete, continue from {}", + last_key + ); + true + } else { + self.should_check_if_image_layers_required(lsn) + }; let mut batch_image_writer = BatchLayerWriter::new(self.conf).await?; let mut all_generated = true; let mut partition_processed = 0; - let total_partitions = partitioning.parts.len(); + let mut total_partitions = partitioning.parts.len(); + let mut last_partition_processed = None; + let mut partition_parts = partitioning.parts.clone(); - for partition in partitioning.parts.iter() { + if let LastImageLayerCreationStatus::Incomplete { last_key } = last_status { + // We need to skip the partitions that have already been processed. + let mut found = false; + for (i, partition) in partition_parts.iter().enumerate() { + if last_key <= partition.end().unwrap() { + // ```plain + // |------|--------|----------|------| + // ^last_key + // ^start from this partition + // ``` + // Why `i+1` instead of `i`? + // It is possible that the user did some writes after the previous image layer creation attempt so that + // a relation grows in size, and the last_key is now in the middle of the partition. 
In this case, we + // still want to skip this partition, so that we can make progress and avoid generating image layers over + // the same partition. Doing a mod to ensure we don't end up with an empty vec. + if i + 1 >= total_partitions { + // In general, this case should not happen -- if last_key is on the last partition, the previous + // iteration of image layer creation should return a complete status. + break; // with found=false + } + partition_parts = partition_parts.split_off(i + 1); // Remove the first i + 1 elements + total_partitions = partition_parts.len(); + // Update the start key to the partition start. + start = partition_parts[0].start().unwrap(); + found = true; + break; + } + } + if !found { + // Last key is within the last partition, or larger than all partitions. + return Ok((vec![], LastImageLayerCreationStatus::Complete)); + } + } + + for partition in partition_parts.iter() { if self.cancel.is_cancelled() { return Err(CreateImageLayersError::Cancelled); } - + partition_processed += 1; let img_range = start..partition.ranges.last().unwrap().end; let compact_metadata = partition.overlaps(&Key::metadata_key_range()); if compact_metadata { @@ -4725,6 +4772,8 @@ impl Timeline { lsn_range: PersistentLayerDesc::image_layer_lsn_range(lsn), is_delta: false, }) { + // TODO: this can be processed with the BatchLayerWriter::finish_with_discard + // in the future. tracing::info!( "Skipping image layer at {lsn} {}..{}, already exists", img_range.start, @@ -4805,8 +4854,6 @@ impl Timeline { } } - partition_processed += 1; - if let ImageLayerCreationMode::Try = mode { // We have at least made some progress if batch_image_writer.pending_layer_num() >= 1 { @@ -4822,8 +4869,10 @@ impl Timeline { * self.get_compaction_threshold(); if image_preempt_threshold != 0 && num_of_l0_layers >= image_preempt_threshold { tracing::info!( - "preempt image layer generation at {start} at {lsn}: too many L0 layers {num_of_l0_layers}", + "preempt image layer generation at {lsn} when processing partition {}..{}: too many L0 layers {}", + partition.start().unwrap(), partition.end().unwrap(), num_of_l0_layers ); + last_partition_processed = Some(partition.clone()); all_generated = false; break; } @@ -4868,7 +4917,14 @@ impl Timeline { if all_generated { LastImageLayerCreationStatus::Complete } else { - LastImageLayerCreationStatus::Incomplete + LastImageLayerCreationStatus::Incomplete { + last_key: if let Some(last_partition_processed) = last_partition_processed { + last_partition_processed.end().unwrap_or(Key::MIN) + } else { + // This branch should be unreachable, but in case it happens, we can just return the start key. + Key::MIN + }, + } }, )) } diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 7dd37d7232..466ebea783 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -748,7 +748,7 @@ impl Timeline { .store(Arc::new(outcome.clone())); self.upload_new_image_layers(image_layers)?; - if let LastImageLayerCreationStatus::Incomplete = outcome { + if let LastImageLayerCreationStatus::Incomplete { .. } = outcome { // Yield and do not do any other kind of compaction. 
info!("skipping shard ancestor compaction due to pending image layer generation tasks (preempted by L0 compaction)."); return Ok(CompactionOutcome::Pending); diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index c031d66dfb..f3347b594e 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -29,6 +29,21 @@ AGGRESSIVE_COMPACTION_TENANT_CONF = { # "lsn_lease_length": "0s", -- TODO: would cause branch creation errors, should fix later } +PREEMPT_COMPACTION_TENANT_CONF = { + "gc_period": "5s", + "compaction_period": "5s", + # Small checkpoint distance to create many layers + "checkpoint_distance": 1024**2, + # Compact small layers + "compaction_target_size": 1024**2, + "image_creation_threshold": 1, + "image_creation_preempt_threshold": 1, + # compact more frequently + "compaction_threshold": 3, + "compaction_upper_limit": 6, + "lsn_lease_length": "0s", +} + @skip_in_debug_build("only run with release build") @pytest.mark.parametrize( @@ -36,7 +51,8 @@ AGGRESSIVE_COMPACTION_TENANT_CONF = { [PageserverWalReceiverProtocol.VANILLA, PageserverWalReceiverProtocol.INTERPRETED], ) def test_pageserver_compaction_smoke( - neon_env_builder: NeonEnvBuilder, wal_receiver_protocol: PageserverWalReceiverProtocol + neon_env_builder: NeonEnvBuilder, + wal_receiver_protocol: PageserverWalReceiverProtocol, ): """ This is a smoke test that compaction kicks in. The workload repeatedly churns @@ -54,7 +70,8 @@ def test_pageserver_compaction_smoke( page_cache_size=10 """ - env = neon_env_builder.init_start(initial_tenant_conf=AGGRESSIVE_COMPACTION_TENANT_CONF) + conf = AGGRESSIVE_COMPACTION_TENANT_CONF.copy() + env = neon_env_builder.init_start(initial_tenant_conf=conf) tenant_id = env.initial_tenant timeline_id = env.initial_timeline @@ -113,6 +130,41 @@ page_cache_size=10 assert vectored_average < 8 +@skip_in_debug_build("only run with release build") +def test_pageserver_compaction_preempt( + neon_env_builder: NeonEnvBuilder, +): + # Ideally we should be able to do unit tests for this, but we need real Postgres + # WALs in order to do unit testing... + + conf = PREEMPT_COMPACTION_TENANT_CONF.copy() + env = neon_env_builder.init_start(initial_tenant_conf=conf) + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + row_count = 200000 + churn_rounds = 10 + + ps_http = env.pageserver.http_client() + + workload = Workload(env, tenant_id, timeline_id) + workload.init(env.pageserver.id) + + log.info("Writing initial data ...") + workload.write_rows(row_count, env.pageserver.id) + + for i in range(1, churn_rounds + 1): + log.info(f"Running churn round {i}/{churn_rounds} ...") + workload.churn_rows(row_count, env.pageserver.id, upload=False) + workload.validate(env.pageserver.id) + ps_http.timeline_compact(tenant_id, timeline_id, wait_until_uploaded=True) + log.info("Validating at workload end ...") + workload.validate(env.pageserver.id) + # ensure image layer creation gets preempted and then resumed + env.pageserver.assert_log_contains("resuming image layer creation") + + @skip_in_debug_build("only run with release build") @pytest.mark.parametrize( "with_branches", From 6699a30a4948309703a90ed167980ef9d467bfad Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 5 Feb 2025 20:07:51 +0200 Subject: [PATCH 375/583] Make it easy to build only a subset of extensions into compute image (#10655) The full build of all extensions takes a long time. 
When working locally on parts that don't need extensions, you can iterate more quickly by skipping the unnecessary extensions. This adds a build argument to the dockerfile to specify extensions to build. There are three options: - EXTENSIONS=all (default) - EXTENSIONS=minimal: Build only a few extensions that are listed in shared_preload_libraries in the default neon config. - EXTENSIONS=none: Build no extensions (except for the mandatory 'neon' extension). --- compute/compute-node.Dockerfile | 44 +++++++++++++++++++++++++++++---- 1 file changed, 39 insertions(+), 5 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 9379856eab..43910f2622 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -85,6 +85,10 @@ ARG DEBIAN_VERSION=bookworm ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim ARG ALPINE_CURL_VERSION=8.11.1 +# By default, build all PostgreSQL extensions. For quick local testing when you don't +# care about the extensions, pass EXTENSIONS=none or EXTENSIONS=minimal +ARG EXTENSIONS=all + ######################################################################################### # # Layer "build-deps" @@ -1484,12 +1488,35 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \ ######################################################################################### # -# Layer "all-extensions" +# Layer "extensions-none" +# +######################################################################################### +FROM build-deps AS extensions-none + +RUN mkdir /usr/local/pgsql + +######################################################################################### +# +# Layer "extensions-minimal" +# +# This subset of extensions includes the extensions that we have in +# shared_preload_libraries by default. +# +######################################################################################### +FROM build-deps AS extensions-minimal + +COPY --from=pgrag-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=timescaledb-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_cron-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_partman-build /usr/local/pgsql/ /usr/local/pgsql/ + +######################################################################################### +# +# Layer "extensions-all" # Bundle together all the extensions # ######################################################################################### -FROM build-deps AS all-extensions -ARG PG_VERSION +FROM build-deps AS extensions-all # Public extensions COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ @@ -1531,7 +1558,13 @@ COPY --from=pg_partman-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_mooncake-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_repack-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=neon-ext-build /usr/local/pgsql/ /usr/local/pgsql/ +######################################################################################### +# +# Layer "neon-pg-ext-build" +# Includes Postgres and all the extensions chosen by EXTENSIONS arg. 
+# +######################################################################################### +FROM extensions-${EXTENSIONS} AS neon-pg-ext-build ######################################################################################### # @@ -1614,7 +1647,8 @@ RUN echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 30 # ######################################################################################### FROM neon-ext-build AS postgres-cleanup-layer -COPY --from=all-extensions /usr/local/pgsql /usr/local/pgsql + +COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql # Remove binaries from /bin/ that we won't use (or would manually copy & install otherwise) RUN cd /usr/local/pgsql/bin && rm -f ecpg raster2pgsql shp2pgsql pgtopo_export pgtopo_import pgsql2shp From 733a57247bb04893b62855d68ca54fbc85a58aa7 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Wed, 5 Feb 2025 15:44:28 -0500 Subject: [PATCH 376/583] fix(pageserver): disallow gc-compaction produce l0 layer (#10679) ## Problem Any compaction should never produce l0 layers. This never happened in my experiments, but would be good to guard it early. ## Summary of changes Disallow gc-compaction to produce l0 layers. Signed-off-by: Alex Chi Z --- pageserver/src/tenant/timeline/compaction.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 466ebea783..cfde070442 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -33,6 +33,7 @@ use crate::page_cache; use crate::statvfs::Statvfs; use crate::tenant::checks::check_valid_layermap; use crate::tenant::gc_block::GcBlock; +use crate::tenant::layer_map::LayerMap; use crate::tenant::remote_timeline_client::WaitCompletionError; use crate::tenant::storage_layer::batch_split_writer::{ BatchWriterResult, SplitDeltaLayerWriter, SplitImageLayerWriter, @@ -438,6 +439,11 @@ impl KeyHistoryRetention { if dry_run { return true; } + if LayerMap::is_l0(&key.key_range, key.is_delta) { + // gc-compaction should not produce L0 deltas, otherwise it will break the layer order. + // We should ignore such layers. + return true; + } let layer_generation; { let guard = tline.layers.read().await; From 0ceeec9be3acd07cda24d849d7cd71dc4c0ac3cc Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Wed, 5 Feb 2025 17:11:50 -0500 Subject: [PATCH 377/583] fix(pageserver): schedule compaction immediately if pending (#10684) ## Problem The code is intended to reschedule compaction immediately if there are pending tasks. We set the duration to 0 before if there are pending tasks, but this will go through the `if period == Duration::ZERO {` branch and sleep for another 10 seconds. ## Summary of changes Set duration to 1 so that it doesn't sleep for too long. 
Signed-off-by: Alex Chi Z --- pageserver/src/tenant/tasks.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index b6b64d02dd..d65f099182 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -211,7 +211,7 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { error_run_count = 0; // schedule the next compaction immediately in case there is a pending compaction task sleep_duration = if let CompactionOutcome::Pending = outcome { - Duration::ZERO + Duration::from_secs(1) } else { period }; From 77f9e74d86d62bca524571aa50416680afb1f5ec Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 6 Feb 2025 02:14:29 +0100 Subject: [PATCH 378/583] pgxn: include socket send & recv queue size in slow response logs (#10673) # Problem When we see an apparent slow request, one possible cause is that the client is failing to consume responses, but we don't have a clear way to see that. # Solution - Log the socket queue depths on slow/stuck connections, so that we have an indication of whether the compute is keeping up with processing the connection's responses. refs - slack https://neondb.slack.com/archives/C036U0GRMRB/p1738652644396329 - refs https://github.com/neondatabase/cloud/issues/23515 - refs https://github.com/neondatabase/cloud/issues/23486 --- pgxn/neon/libpagestore.c | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 4460e3b40c..22aeb2e2d6 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -36,6 +36,11 @@ #include "pagestore_client.h" #include "walproposer.h" +#ifdef __linux__ +#include +#include +#endif + #define PageStoreTrace DEBUG5 #define MIN_RECONNECT_INTERVAL_USEC 1000 @@ -728,11 +733,36 @@ retry: INSTR_TIME_SUBTRACT(since_last_log, last_log_ts); if (INSTR_TIME_GET_MILLISEC(since_last_log) >= LOG_INTERVAL_MS) { + int sndbuf = -1; + int recvbuf = -1; +#ifdef __linux__ + int socketfd; +#endif + since_start = now; INSTR_TIME_SUBTRACT(since_start, start_ts); - neon_shard_log(shard_no, LOG, "no response received from pageserver for %0.3f s, still waiting (sent " UINT64_FORMAT " requests, received " UINT64_FORMAT " responses)", + +#ifdef __linux__ + /* + * get kernel's send and recv queue size via ioctl + * https://elixir.bootlin.com/linux/v6.1.128/source/include/uapi/linux/sockios.h#L25-L27 + */ + socketfd = PQsocket(pageserver_conn); + if (socketfd != -1) { + int ioctl_err; + ioctl_err = ioctl(socketfd, SIOCOUTQ, &sndbuf); + if (ioctl_err!= 0) { + sndbuf = -errno; + } + ioctl_err = ioctl(socketfd, FIONREAD, &recvbuf); + if (ioctl_err != 0) { + recvbuf = -errno; + } + } +#endif + neon_shard_log(shard_no, LOG, "no response received from pageserver for %0.3f s, still waiting (sent " UINT64_FORMAT " requests, received " UINT64_FORMAT " responses) (socket sndbuf=%d recvbuf=%d)", INSTR_TIME_GET_DOUBLE(since_start), - shard->nrequests_sent, shard->nresponses_received); + shard->nrequests_sent, shard->nresponses_received, sndbuf, recvbuf); last_log_ts = now; logged = true; } From 7fc6953da41d536f8d785ea3a41640c80bb7c6bb Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 6 Feb 2025 07:42:14 +0200 Subject: [PATCH 379/583] Is neon superuser (#10625) ## Problem is_neon_superuser() fiunction is public in pg14/pg15 but statically defined in publicationcmd.c in pg16/pg17 ## Summary of changes Make this function public for all Postgres 
versions. It is intended to be used not only in publicationcmd.c. See https://github.com/neondatabase/postgres/pull/573 https://github.com/neondatabase/postgres/pull/576 --------- Co-authored-by: Konstantin Knizhnik --- vendor/postgres-v16 | 2 +- vendor/postgres-v17 | 2 +- vendor/revisions.json | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 3cf7ce1afa..86d9ea96eb 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 3cf7ce1afab75027716d14223f95ddb300754162 +Subproject commit 86d9ea96ebb9088eac62f57f1f5ace68e70e0d1c diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index f0ffc8279d..8dfd5a7030 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit f0ffc8279dbcbbc439981a4fd001a9687e5d665d +Subproject commit 8dfd5a7030d3e8a98b60265ebe045788892ac7f3 diff --git a/vendor/revisions.json b/vendor/revisions.json index c3eaeac927..efddaef46a 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,11 +1,11 @@ { "v17": [ "17.2", - "f0ffc8279dbcbbc439981a4fd001a9687e5d665d" + "8dfd5a7030d3e8a98b60265ebe045788892ac7f3" ], "v16": [ "16.6", - "3cf7ce1afab75027716d14223f95ddb300754162" + "86d9ea96ebb9088eac62f57f1f5ace68e70e0d1c" ], "v15": [ "15.10", From 81cd30e4d6e2f9e43a92e59c6ffce4f0890980ca Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 6 Feb 2025 07:47:56 +0200 Subject: [PATCH 380/583] Use #ifdef instead of #if USE_ASSERT_CHECKING (#10683) ## Problem USE_ASSERT_CHECKING is defined as an empty entity, but it is checked using #if. ## Summary of changes Replace `#if USE_ASSERT_CHECKING` with `#ifdef USE_ASSERT_CHECKING`, as done in other places in Postgres. Co-authored-by: Konstantin Knizhnik --- pgxn/neon/file_cache.c | 6 +++--- pgxn/neon/pagestore_smgr.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 08b7652175..01da61f84b 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -563,8 +563,8 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, LWLockRelease(lfc_lock); -#if USE_ASSERT_CHECKING - do { +#ifdef USE_ASSERT_CHECKING + { int count = 0; for (int j = 0; j < nblocks; j++) @@ -574,7 +574,7 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, } Assert(count == found); - } while (false); + } #endif return found; diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 54cacea984..012bd479bc 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -916,7 +916,7 @@ prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns, { uint64 min_ring_index; PrefetchRequest hashkey; -#if USE_ASSERT_CHECKING +#ifdef USE_ASSERT_CHECKING bool any_hits = false; #endif /* We will never read further ahead than our buffer can store.
*/ @@ -955,7 +955,7 @@ Retry: else lsns = NULL; -#if USE_ASSERT_CHECKING +#ifdef USE_ASSERT_CHECKING any_hits = true; #endif From 01f0be03b55e97fd3d1679e25415e1474513419e Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 6 Feb 2025 09:00:00 +0200 Subject: [PATCH 381/583] Fix bugs in lfc_cache_containsv (#10682) ## Problem Incorrect manipulations with iteration index in `lfc_cache_containsv` ## Summary of changes ``` - int this_chunk = Min(nblocks, BLOCKS_PER_CHUNK - chunk_offs); + int this_chunk = Min(nblocks - i, BLOCKS_PER_CHUNK - chunk_offs); int this_chunk = ``` - if (i + 1 >= nblocks) + if (i >= nblocks) ``` Co-authored-by: Konstantin Knizhnik --- pgxn/neon/file_cache.c | 39 ++++++++++++++++++--------------------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 01da61f84b..a61dc9f4c6 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -509,47 +509,44 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); - tag.blockNum = (blkno + i) & ~(BLOCKS_PER_CHUNK - 1); + tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1); hash = get_hash_value(lfc_hash, &tag); - chunk_offs = (blkno + i) & (BLOCKS_PER_CHUNK - 1); + chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1); LWLockAcquire(lfc_lock, LW_SHARED); + if (!LFC_ENABLED()) + { + LWLockRelease(lfc_lock); + return 0; + } while (true) { - int this_chunk = Min(nblocks, BLOCKS_PER_CHUNK - chunk_offs); - if (LFC_ENABLED()) - { - entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL); + int this_chunk = Min(nblocks - i, BLOCKS_PER_CHUNK - chunk_offs); + entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL); - if (entry != NULL) + if (entry != NULL) + { + for (; chunk_offs < BLOCKS_PER_CHUNK && i < nblocks; chunk_offs++, i++) { - for (; chunk_offs < BLOCKS_PER_CHUNK && i < nblocks; chunk_offs++, i++) + if ((entry->bitmap[chunk_offs >> 5] & + ((uint32)1 << (chunk_offs & 31))) != 0) { - if ((entry->bitmap[chunk_offs >> 5] & - ((uint32)1 << (chunk_offs & 31))) != 0) - { - BITMAP_SET(bitmap, i); - found++; - } + BITMAP_SET(bitmap, i); + found++; } } - else - { - i += this_chunk; - } } else { - LWLockRelease(lfc_lock); - return found; + i += this_chunk; } /* * Break out of the iteration before doing expensive stuff for * a next iteration */ - if (i + 1 >= nblocks) + if (i >= nblocks) break; /* From abcd00181c07a4e3427eb2645bab64500aa82d49 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 6 Feb 2025 08:24:36 +0100 Subject: [PATCH 382/583] pageserver: set a concurrency limit for LocalFS (#10676) ## Problem The local filesystem backend for remote storage doesn't set a concurrency limit. While it can't/won't enforce a concurrency limit itself, this also bounds the upload queue concurrency. Some tests create thousands of uploads, which slows down the quadratic scheduling of the upload queue, and there is no point spawning that many Tokio tasks. Resolves #10409. ## Summary of changes Set a concurrency limit of 100 for the LocalFS backend. 
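For intuition, bounding the backend concurrency bounds the upload queue in much the same way a semaphore bounds a task pool (an illustrative sketch only, not the pageserver's actual upload-queue code):

```rust
use std::sync::Arc;
use tokio::sync::Semaphore;

/// Illustrative only: run `uploads` with at most `limit` of them in flight.
async fn run_bounded(uploads: Vec<Vec<u8>>, limit: usize) {
    let sem = Arc::new(Semaphore::new(limit));
    let mut tasks = Vec::new();
    for payload in uploads {
        // Each task holds a permit for its whole lifetime, so at most `limit`
        // spawned tasks are ever in flight at the same time.
        let permit = sem.clone().acquire_owned().await.expect("semaphore closed");
        tasks.push(tokio::spawn(async move {
            let _permit = permit;
            // ... perform the upload of `payload` here ...
            drop(payload);
        }));
    }
    for t in tasks {
        let _ = t.await;
    }
}
```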
Before: `test_layer_map[release-pg17].test_query: 68.338 s` After: `test_layer_map[release-pg17].test_query: 5.209 s` --- libs/remote_storage/src/config.rs | 10 +++++----- libs/remote_storage/src/lib.rs | 6 ++++++ pageserver/src/tenant/remote_timeline_client.rs | 9 +++------ 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/libs/remote_storage/src/config.rs b/libs/remote_storage/src/config.rs index dae141bf77..ff34158c9c 100644 --- a/libs/remote_storage/src/config.rs +++ b/libs/remote_storage/src/config.rs @@ -7,7 +7,7 @@ use serde::{Deserialize, Serialize}; use crate::{ DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT, - DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT, + DEFAULT_REMOTE_STORAGE_LOCALFS_CONCURRENCY_LIMIT, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT, }; /// External backup storage configuration, enough for creating a client for that storage. @@ -45,11 +45,11 @@ impl RemoteStorageKind { impl RemoteStorageConfig { /// Helper to fetch the configured concurrency limit. - pub fn concurrency_limit(&self) -> Option { + pub fn concurrency_limit(&self) -> usize { match &self.storage { - RemoteStorageKind::LocalFs { .. } => None, - RemoteStorageKind::AwsS3(c) => Some(c.concurrency_limit.into()), - RemoteStorageKind::AzureContainer(c) => Some(c.concurrency_limit.into()), + RemoteStorageKind::LocalFs { .. } => DEFAULT_REMOTE_STORAGE_LOCALFS_CONCURRENCY_LIMIT, + RemoteStorageKind::AwsS3(c) => c.concurrency_limit.into(), + RemoteStorageKind::AzureContainer(c) => c.concurrency_limit.into(), } } } diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 7a864151ec..69b522d63e 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -65,6 +65,12 @@ pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100; /// Here, a limit of max 20k concurrent connections was noted. /// pub const DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT: usize = 100; +/// Set this limit analogously to the S3 limit. +/// +/// The local filesystem backend doesn't enforce a concurrency limit itself, but this also bounds +/// the upload queue concurrency. Some tests create thousands of uploads, which slows down the +/// quadratic scheduling of the upload queue, and there is no point spawning so many Tokio tasks. +pub const DEFAULT_REMOTE_STORAGE_LOCALFS_CONCURRENCY_LIMIT: usize = 100; /// No limits on the client side, which currenltly means 1000 for AWS S3. 
/// pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option = None; diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index bcba6d1f62..ad6d8dfae8 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -437,8 +437,7 @@ impl RemoteTimelineClient { .conf .remote_storage_config .as_ref() - .and_then(|r| r.concurrency_limit()) - .unwrap_or(0); + .map_or(0, |r| r.concurrency_limit()); let mut upload_queue = self.upload_queue.lock().unwrap(); upload_queue.initialize_with_current_remote_index_part(index_part, inprogress_limit)?; self.update_remote_physical_size_gauge(Some(index_part)); @@ -461,8 +460,7 @@ impl RemoteTimelineClient { .conf .remote_storage_config .as_ref() - .and_then(|r| r.concurrency_limit()) - .unwrap_or(0); + .map_or(0, |r| r.concurrency_limit()); let mut upload_queue = self.upload_queue.lock().unwrap(); upload_queue.initialize_empty_remote(local_metadata, inprogress_limit)?; self.update_remote_physical_size_gauge(None); @@ -484,8 +482,7 @@ impl RemoteTimelineClient { .conf .remote_storage_config .as_ref() - .and_then(|r| r.concurrency_limit()) - .unwrap_or(0); + .map_or(0, |r| r.concurrency_limit()); let mut upload_queue = self.upload_queue.lock().unwrap(); upload_queue.initialize_with_current_remote_index_part(index_part, inprogress_limit)?; From 1686d9e7332653ef44d8690d4764c582437eb6bf Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 6 Feb 2025 09:33:37 +0100 Subject: [PATCH 383/583] perf(page_service): dont `.instrument(span.clone())` the response flush (#10686) On my AX102 Hetzner box, removing this line removes about 20us from the `latency_mean` result in `test_pageserver_characterize_latencies_with_1_client_and_throughput_with_many_clients_one_tenant`. If the same 20us can be removed in the nightly benchmark run, this will be a ~10% improvement because there, mean latencies are about ~220us. This span was added during batching refactors, we didn't have it before, and I don't think it's terribly useful. refs - https://github.com/neondatabase/cloud/issues/21759 --- pageserver/src/page_service.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index e103338c7c..679cf3b2d5 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1280,8 +1280,6 @@ impl PageServerHandler { } Ok(()) } - // and log the info! line inside the request span - .instrument(span.clone()) .await?; } Ok(()) From 95588dab9874f33bd4ef637171b765a7b71455ca Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 6 Feb 2025 09:24:28 +0000 Subject: [PATCH 384/583] safekeeper: fix wal fan-out shard subscription data race (#10677) ## Problem [This select arm](https://github.com/neondatabase/neon/blob/main/safekeeper/src/send_interpreted_wal.rs#L414) runs when we want to attach a new reader to the current cursor. It checks the current position of the cursor and resets it if required. The current position of the cursor is updated in the [other select arm](https://github.com/neondatabase/neon/blob/main/safekeeper/src/send_interpreted_wal.rs#L336-L345). That runs when we get some WAL to send. Now, what happens if we want to attach two shards consecutively to the cursor? Let's say [this select arm](https://github.com/neondatabase/neon/blob/main/safekeeper/src/send_interpreted_wal.rs#L397) runs twice in a row. Let's assume cursor is currently at LSN X. 
First shard wants to attach at position V and the other one at W. Assume X > W > V. First shard resets the stream to position V. Second shard comes in, sees stale cursor position X and resets it to W. This means that the first shard doesn't get wal in the [V, W) range. ## Summary of changes Ultimately, this boils down to the current position not being kept in sync with the reset of the WAL stream. This patch fixes the race by updating it when resetting the WAL stream and adds a unit test repro. Closes https://github.com/neondatabase/cloud/issues/23750 --- safekeeper/src/send_interpreted_wal.rs | 156 ++++++++++++++++++++----- safekeeper/src/test_utils.rs | 6 +- safekeeper/src/wal_reader_stream.rs | 2 +- 3 files changed, 130 insertions(+), 34 deletions(-) diff --git a/safekeeper/src/send_interpreted_wal.rs b/safekeeper/src/send_interpreted_wal.rs index ea09ce364d..b57cc8001d 100644 --- a/safekeeper/src/send_interpreted_wal.rs +++ b/safekeeper/src/send_interpreted_wal.rs @@ -120,6 +120,20 @@ pub enum InterpretedWalReaderError { WalStreamClosed, } +enum CurrentPositionUpdate { + Reset(Lsn), + NotReset(Lsn), +} + +impl CurrentPositionUpdate { + fn current_position(&self) -> Lsn { + match self { + CurrentPositionUpdate::Reset(lsn) => *lsn, + CurrentPositionUpdate::NotReset(lsn) => *lsn, + } + } +} + impl InterpretedWalReaderState { fn current_position(&self) -> Option { match self { @@ -129,6 +143,26 @@ impl InterpretedWalReaderState { InterpretedWalReaderState::Done => None, } } + + // Reset the current position of the WAL reader if the requested starting position + // of the new shard is smaller than the current value. + fn maybe_reset(&mut self, new_shard_start_pos: Lsn) -> CurrentPositionUpdate { + match self { + InterpretedWalReaderState::Running { + current_position, .. + } => { + if new_shard_start_pos < *current_position { + *current_position = new_shard_start_pos; + CurrentPositionUpdate::Reset(*current_position) + } else { + CurrentPositionUpdate::NotReset(*current_position) + } + } + InterpretedWalReaderState::Done => { + panic!("maybe_reset called on finished reader") + } + } + } } pub(crate) struct AttachShardNotification { @@ -410,15 +444,24 @@ impl InterpretedWalReader { }; senders.push(ShardSenderState { sender_id: new_sender_id, tx: sender, next_record_lsn: start_pos}); - let current_pos = self.state.read().unwrap().current_position().unwrap(); - if start_pos < current_pos { - self.wal_stream.reset(start_pos).await; - wal_decoder = WalStreamDecoder::new(start_pos, self.pg_version); - } + + // If the shard is subscribing below the current position the we need + // to update the cursor that tracks where we are at in the WAL + // ([`Self::state`]) and reset the WAL stream itself + // (`[Self::wal_stream`]). This must be done atomically from the POV of + // anything outside the select statement. 
+ let position_reset = self.state.write().unwrap().maybe_reset(start_pos); + match position_reset { + CurrentPositionUpdate::Reset(to) => { + self.wal_stream.reset(to).await; + wal_decoder = WalStreamDecoder::new(to, self.pg_version); + }, + CurrentPositionUpdate::NotReset(_) => {} + }; tracing::info!( "Added shard sender {} with start_pos={} current_pos={}", - ShardSenderId::new(shard_id, new_sender_id), start_pos, current_pos + ShardSenderId::new(shard_id, new_sender_id), start_pos, position_reset.current_position() ); } } @@ -584,7 +627,7 @@ mod tests { .unwrap(); let resident_tli = tli.wal_residence_guard().await.unwrap(); - let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT) + let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT, None) .await .unwrap(); let end_pos = end_watch.get(); @@ -715,7 +758,6 @@ mod tests { const MSG_COUNT: usize = 200; const PG_VERSION: u32 = 17; const SHARD_COUNT: u8 = 2; - const ATTACHED_SHARDS: u8 = 4; let start_lsn = Lsn::from_str("0/149FD18").unwrap(); let env = Env::new(true).unwrap(); @@ -725,9 +767,11 @@ mod tests { .unwrap(); let resident_tli = tli.wal_residence_guard().await.unwrap(); - let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT) - .await - .unwrap(); + let mut next_record_lsns = Vec::default(); + let end_watch = + Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT, Some(&mut next_record_lsns)) + .await + .unwrap(); let end_pos = end_watch.get(); let streaming_wal_reader = StreamingWalReader::new( @@ -746,38 +790,71 @@ mod tests { ) .unwrap(); - let (tx, rx) = tokio::sync::mpsc::channel::(MSG_COUNT * 2); - let mut batch_receivers = vec![rx]; + struct Sender { + tx: Option>, + rx: tokio::sync::mpsc::Receiver, + shard: ShardIdentity, + start_lsn: Lsn, + received_next_record_lsns: Vec, + } + impl Sender { + fn new(start_lsn: Lsn, shard: ShardIdentity) -> Self { + let (tx, rx) = tokio::sync::mpsc::channel::(MSG_COUNT * 2); + Self { + tx: Some(tx), + rx, + shard, + start_lsn, + received_next_record_lsns: Vec::default(), + } + } + } + + assert!(next_record_lsns.len() > 7); + let start_lsns = vec![ + next_record_lsns[5], + next_record_lsns[1], + next_record_lsns[3], + ]; + let mut senders = start_lsns + .into_iter() + .map(|lsn| Sender::new(lsn, shard_0)) + .collect::>(); + + let first_sender = senders.first_mut().unwrap(); let handle = InterpretedWalReader::spawn( streaming_wal_reader, - start_lsn, - tx, - shard_0, + first_sender.start_lsn, + first_sender.tx.take().unwrap(), + first_sender.shard, PG_VERSION, &Some("pageserver".to_string()), ); - for _ in 0..(ATTACHED_SHARDS - 1) { - let (tx, rx) = tokio::sync::mpsc::channel::(MSG_COUNT * 2); - handle.fanout(shard_0, tx, start_lsn).unwrap(); - batch_receivers.push(rx); + for sender in senders.iter_mut().skip(1) { + handle + .fanout(sender.shard, sender.tx.take().unwrap(), sender.start_lsn) + .unwrap(); } - loop { - let batch = batch_receivers.first_mut().unwrap().recv().await.unwrap(); - for rx in batch_receivers.iter_mut().skip(1) { - let other_batch = rx.recv().await.unwrap(); - - assert_eq!(batch.wal_end_lsn, other_batch.wal_end_lsn); - assert_eq!( - batch.available_wal_end_lsn, - other_batch.available_wal_end_lsn + for sender in senders.iter_mut() { + loop { + let batch = sender.rx.recv().await.unwrap(); + tracing::info!( + "Sender with start_lsn={} received batch ending at {} with {} records", + sender.start_lsn, + batch.wal_end_lsn, + batch.records.records.len() ); - } - if batch.wal_end_lsn == batch.available_wal_end_lsn { - break; + for rec in 
batch.records.records { + sender.received_next_record_lsns.push(rec.next_record_lsn); + } + + if batch.wal_end_lsn == batch.available_wal_end_lsn { + break; + } } } @@ -792,5 +869,20 @@ mod tests { } assert!(done); + + for sender in senders { + tracing::info!( + "Validating records received by sender with start_lsn={}", + sender.start_lsn + ); + + assert!(sender.received_next_record_lsns.is_sorted()); + let expected = next_record_lsns + .iter() + .filter(|lsn| **lsn > sender.start_lsn) + .copied() + .collect::>(); + assert_eq!(sender.received_next_record_lsns, expected); + } } } diff --git a/safekeeper/src/test_utils.rs b/safekeeper/src/test_utils.rs index 4e851c5b3d..79ceddd366 100644 --- a/safekeeper/src/test_utils.rs +++ b/safekeeper/src/test_utils.rs @@ -122,6 +122,7 @@ impl Env { start_lsn: Lsn, msg_size: usize, msg_count: usize, + mut next_record_lsns: Option<&mut Vec>, ) -> anyhow::Result { let (msg_tx, msg_rx) = tokio::sync::mpsc::channel(receive_wal::MSG_QUEUE_SIZE); let (reply_tx, mut reply_rx) = tokio::sync::mpsc::channel(receive_wal::REPLY_QUEUE_SIZE); @@ -130,7 +131,7 @@ impl Env { WalAcceptor::spawn(tli.wal_residence_guard().await?, msg_rx, reply_tx, Some(0)); - let prefix = c"p"; + let prefix = c"neon-file:"; let prefixlen = prefix.to_bytes_with_nul().len(); assert!(msg_size >= prefixlen); let message = vec![0; msg_size - prefixlen]; @@ -139,6 +140,9 @@ impl Env { &mut WalGenerator::new(LogicalMessageGenerator::new(prefix, &message), start_lsn); for _ in 0..msg_count { let (lsn, record) = walgen.next().unwrap(); + if let Some(ref mut lsns) = next_record_lsns { + lsns.push(lsn); + } let req = AppendRequest { h: AppendRequestHeader { diff --git a/safekeeper/src/wal_reader_stream.rs b/safekeeper/src/wal_reader_stream.rs index adac6067da..a0dd571a34 100644 --- a/safekeeper/src/wal_reader_stream.rs +++ b/safekeeper/src/wal_reader_stream.rs @@ -246,7 +246,7 @@ mod tests { .unwrap(); let resident_tli = tli.wal_residence_guard().await.unwrap(); - let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT) + let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT, None) .await .unwrap(); let end_pos = end_watch.get(); From b66fbd6176152f0d67e8b230a78639fd3481adcc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 6 Feb 2025 11:09:20 +0100 Subject: [PATCH 385/583] Warn on basebackups for archived timelines (#10688) We don't want any external requests for an archived timeline. This includes basebackup requests, i.e. when a compute is being started up. Therefore, we'd like to forbid such basebackup requests: any attempt to get a basebackup on an archived timeline (or any getpage request really) is a cplane bug. Make this a warning for now so that, if there is potentially a bug, we can detect cases in the wild before they cause stuck operations, but the intention is to return an error eventually. 
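As a hedged, standalone sketch of the warn-now/error-later pattern described above (not the actual pageserver handler; the `enforce` flag is hypothetical):

```rust
// Sketch of a grace-period check: warn while `enforce` is off, reject once it
// is turned on. Mirrors the intent described above, not the real code.
fn check_basebackup_allowed(is_archived: bool, enforce: bool) -> Result<(), String> {
    if is_archived {
        if enforce {
            return Err("timeline is archived".to_string());
        }
        eprintln!("WARNING: got basebackup request for archived timeline");
    }
    Ok(())
}

fn main() {
    assert!(check_basebackup_allowed(true, false).is_ok()); // grace period: warn only
    assert!(check_basebackup_allowed(true, true).is_err()); // later: hard error
    assert!(check_basebackup_allowed(false, true).is_ok()); // not archived: always fine
}
```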
Related: #9548 --- pageserver/src/page_service.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 679cf3b2d5..d4898532d6 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -2035,6 +2035,12 @@ impl PageServerHandler { .get(tenant_id, timeline_id, ShardSelector::Zero) .await?; + if timeline.is_archived() == Some(true) { + // TODO after a grace period, turn this log line into a hard error + tracing::warn!("timeline {tenant_id}/{timeline_id} is archived, but got basebackup request for it."); + //return Err(QueryError::NotFound("timeline is archived".into())) + } + let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); if let Some(lsn) = lsn { // Backup was requested at a particular LSN. Wait for it to arrive. From 05326cc247b0149ff27ada881249f4945dc6b541 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 6 Feb 2025 11:10:11 +0100 Subject: [PATCH 386/583] Skip gc cutoff lsn check at timeline creation if lease exists (#10685) Right now, branch creation doesn't care if a lsn lease exists or not, it just fails if the passed lsn is older than either the last or the planned gc cutoff. However, if an lsn lease exists for a given lsn, we can actually create a branch at that point: nothing has been gc'd away. This prevents race conditions that #10678 still leaves around. Related: #10639 https://github.com/neondatabase/cloud/issues/23667 --- pageserver/src/page_service.rs | 2 +- pageserver/src/tenant.rs | 32 +++++++++++++++++-------------- pageserver/src/tenant/timeline.rs | 3 +++ 3 files changed, 22 insertions(+), 15 deletions(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index d4898532d6..24a350399d 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1690,7 +1690,7 @@ impl PageServerHandler { // to distinguish a misbehaving client (asking for old LSN) from a storage issue (data missing at a legitimate LSN). if request_lsn < **latest_gc_cutoff_lsn && !timeline.is_gc_blocked_by_lsn_lease_deadline() { let gc_info = &timeline.gc_info.read().unwrap(); - if !gc_info.leases.contains_key(&request_lsn) { + if !gc_info.lsn_covered_by_lease(request_lsn) { return Err( PageStreamError::BadRequest(format!( "tried to request a page version that was garbage collected. 
requested at {} gc cutoff {}", diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index c1b408ed72..3c6996dd51 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -4642,22 +4642,26 @@ impl Tenant { // check against last actual 'latest_gc_cutoff' first let latest_gc_cutoff_lsn = src_timeline.get_latest_gc_cutoff_lsn(); - src_timeline - .check_lsn_is_in_scope(start_lsn, &latest_gc_cutoff_lsn) - .context(format!( - "invalid branch start lsn: less than latest GC cutoff {}", - *latest_gc_cutoff_lsn, - )) - .map_err(CreateTimelineError::AncestorLsn)?; - - // and then the planned GC cutoff { let gc_info = src_timeline.gc_info.read().unwrap(); - let cutoff = gc_info.min_cutoff(); - if start_lsn < cutoff { - return Err(CreateTimelineError::AncestorLsn(anyhow::anyhow!( - "invalid branch start lsn: less than planned GC cutoff {cutoff}" - ))); + let planned_cutoff = gc_info.min_cutoff(); + if gc_info.lsn_covered_by_lease(start_lsn) { + tracing::info!("skipping comparison of {start_lsn} with gc cutoff {} and planned gc cutoff {planned_cutoff} due to lsn lease", *latest_gc_cutoff_lsn); + } else { + src_timeline + .check_lsn_is_in_scope(start_lsn, &latest_gc_cutoff_lsn) + .context(format!( + "invalid branch start lsn: less than latest GC cutoff {}", + *latest_gc_cutoff_lsn, + )) + .map_err(CreateTimelineError::AncestorLsn)?; + + // and then the planned GC cutoff + if start_lsn < planned_cutoff { + return Err(CreateTimelineError::AncestorLsn(anyhow::anyhow!( + "invalid branch start lsn: less than planned GC cutoff {planned_cutoff}" + ))); + } } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index b6a349a209..45ddd38f67 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -531,6 +531,9 @@ impl GcInfo { pub(super) fn remove_child_offloaded(&mut self, child_id: TimelineId) -> bool { self.remove_child_maybe_offloaded(child_id, MaybeOffloaded::Yes) } + pub(crate) fn lsn_covered_by_lease(&self, lsn: Lsn) -> bool { + self.leases.contains_key(&lsn) + } } /// The `GcInfo` component describing which Lsns need to be retained. Functionally, this From f4cfa725b8ab67fafba2aef8761549c0cc010147 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 6 Feb 2025 11:30:27 +0100 Subject: [PATCH 387/583] pageserver: add a few critical errors (#10657) ## Problem Following #10641, let's add a few critical errors. Resolves #10094. ## Summary of changes Adds the following critical errors: * WAL sender read/decode failure. * WAL record ingestion failure. * WAL redo failure. * Missing key during compaction. We don't add an error for missing keys during GetPage requests, since we've seen a handful of these in production recently, and the cause is still unclear (most likely a benign race). --- libs/utils/src/logging.rs | 9 ++++++--- pageserver/src/tenant/timeline.rs | 8 +++++--- pageserver/src/tenant/timeline/compaction.rs | 15 ++++++++++++--- .../walreceiver/walreceiver_connection.rs | 16 +++++++++++++++- safekeeper/src/send_interpreted_wal.rs | 19 +++++++++---------- 5 files changed, 47 insertions(+), 20 deletions(-) diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs index 753f05b6fd..4a6069294d 100644 --- a/libs/utils/src/logging.rs +++ b/libs/utils/src/logging.rs @@ -8,19 +8,22 @@ use strum_macros::{EnumString, VariantNames}; /// Logs a critical error, similarly to `tracing::error!`. This will: /// /// * Emit an ERROR log message with prefix "CRITICAL:" and a backtrace. 
+/// * Trigger a pageable alert (via the metric below). /// * Increment libmetrics_tracing_event_count{level="critical"}, and indirectly level="error". -/// * Trigger a pageable alert (via the metric above). /// * In debug builds, panic the process. +/// +/// When including errors in the message, please use {err:?} to include the error cause and original +/// backtrace. #[macro_export] macro_rules! critical { - ($($arg:tt)*) => { + ($($arg:tt)*) => {{ if cfg!(debug_assertions) { panic!($($arg)*); } $crate::logging::TRACING_EVENT_COUNT_METRIC.inc_critical(); let backtrace = std::backtrace::Backtrace::capture(); tracing::error!("CRITICAL: {}\n{backtrace}", format!($($arg)*)); - }; + }}; } #[derive(EnumString, strum_macros::Display, VariantNames, Eq, PartialEq, Debug, Clone, Copy)] diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 45ddd38f67..908356c459 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -52,6 +52,7 @@ use tokio::{ }; use tokio_util::sync::CancellationToken; use tracing::*; +use utils::critical; use utils::rate_limit::RateLimit; use utils::{ fs_ext, @@ -5807,10 +5808,11 @@ impl Timeline { let img = match res { Ok(img) => img, Err(walredo::Error::Cancelled) => return Err(PageReconstructError::Cancelled), - Err(walredo::Error::Other(e)) => { + Err(walredo::Error::Other(err)) => { + critical!("walredo failure during page reconstruction: {err:?}"); return Err(PageReconstructError::WalRedo( - e.context("reconstruct a page image"), - )) + err.context("reconstruct a page image"), + )); } }; Ok(img) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index cfde070442..b9f4954453 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -10,8 +10,8 @@ use std::sync::Arc; use super::layer_manager::LayerManager; use super::{ - CompactFlags, CompactOptions, CreateImageLayersError, DurationRecorder, ImageLayerCreationMode, - LastImageLayerCreationStatus, RecordedDuration, Timeline, + CompactFlags, CompactOptions, CreateImageLayersError, DurationRecorder, GetVectoredError, + ImageLayerCreationMode, LastImageLayerCreationStatus, RecordedDuration, Timeline, }; use anyhow::{anyhow, bail, Context}; @@ -26,6 +26,7 @@ use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId}; use serde::Serialize; use tokio_util::sync::CancellationToken; use tracing::{debug, info, info_span, trace, warn, Instrument}; +use utils::critical; use utils::id::TimelineId; use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}; @@ -748,7 +749,15 @@ impl Timeline { .as_ref() .clone(), ) - .await?; + .await + .inspect_err(|err| { + if let CreateImageLayersError::GetVectoredError( + GetVectoredError::MissingKey(_), + ) = err + { + critical!("missing key during compaction: {err:?}"); + } + })?; self.last_image_layer_creation_status .store(Arc::new(outcome.clone())); diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index de917377cb..23db4f88d2 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -39,7 +39,7 @@ use crate::{ use postgres_backend::is_expected_io_error; use postgres_connection::PgConnectionConfig; use postgres_ffi::waldecoder::WalStreamDecoder; -use utils::{id::NodeId, lsn::Lsn, 
postgres_client::PostgresClientProtocol}; +use utils::{critical, id::NodeId, lsn::Lsn, postgres_client::PostgresClientProtocol}; use utils::{pageserver_feedback::PageserverFeedback, sync::gate::GateError}; /// Status of the connection. @@ -393,6 +393,13 @@ pub(super) async fn handle_walreceiver_connection( .await .with_context(|| { format!("could not ingest record at {local_next_record_lsn}") + }) + .inspect_err(|err| { + // TODO: we can't differentiate cancellation errors with + // anyhow::Error, so just ignore it if we're cancelled. + if !cancellation.is_cancelled() { + critical!("{err:?}") + } })?; uncommitted_records += 1; @@ -520,6 +527,13 @@ pub(super) async fn handle_walreceiver_connection( .await .with_context(|| { format!("could not ingest record at {next_record_lsn}") + }) + .inspect_err(|err| { + // TODO: we can't differentiate cancellation errors with + // anyhow::Error, so just ignore it if we're cancelled. + if !cancellation.is_cancelled() { + critical!("{err:?}") + } })?; if !ingested { tracing::debug!("ingest: filtered out record @ LSN {next_record_lsn}"); diff --git a/safekeeper/src/send_interpreted_wal.rs b/safekeeper/src/send_interpreted_wal.rs index b57cc8001d..5916675c3f 100644 --- a/safekeeper/src/send_interpreted_wal.rs +++ b/safekeeper/src/send_interpreted_wal.rs @@ -15,7 +15,8 @@ use tokio::io::{AsyncRead, AsyncWrite}; use tokio::sync::mpsc::error::SendError; use tokio::task::JoinHandle; use tokio::time::MissedTickBehavior; -use tracing::{info_span, Instrument}; +use tracing::{error, info, info_span, Instrument}; +use utils::critical; use utils::lsn::Lsn; use utils::postgres_client::Compression; use utils::postgres_client::InterpretedFormat; @@ -213,11 +214,10 @@ impl InterpretedWalReader { metric.dec(); } - let res = reader.run_impl(start_pos).await; - if let Err(ref err) = res { - tracing::error!("Task finished with error: {err}"); - } - res + reader + .run_impl(start_pos) + .await + .inspect_err(|err| critical!("failed to read WAL record: {err:?}")) } .instrument(info_span!("interpreted wal reader")), ); @@ -273,11 +273,10 @@ impl InterpretedWalReader { metric.dec(); } - let res = self.run_impl(start_pos).await; - if let Err(err) = res { - tracing::error!("Interpreted wal reader encountered error: {err}"); + if let Err(err) = self.run_impl(start_pos).await { + critical!("failed to read WAL record: {err:?}"); } else { - tracing::info!("Interpreted wal reader exiting"); + info!("interpreted wal reader exiting"); } Err(CopyStreamHandlerEnd::Other(anyhow!( From 67b71538d0a7775043614122916b5ec4e5f61f8d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 6 Feb 2025 12:17:08 +0100 Subject: [PATCH 388/583] Limit returned lsn for timestamp by the planned gc cutoff (#10678) Often the output of the timestamp->lsn API is used as input for branch creation, and branch creation takes the planned lsn into account, i.e. rejects lsn's as branch lsns that are before the planned lsn. This patch doesn't fix all race conditions, it's still racy. But at least it is a step into the right direction. 
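As a sketch of the clamping this patch performs (standalone, with `Lsn` simplified to a `u64` newtype rather than the real type): the lower bound for the timestamp search becomes the maximum of the planned GC cutoff, the last applied GC cutoff, and the ancestor LSN.

```rust
// Simplified stand-in for the real Lsn type.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
struct Lsn(u64);

/// Lower bound for the timestamp->lsn search: never return an LSN below the
/// planned GC cutoff, the last applied GC cutoff, or the branch point.
fn min_searchable_lsn(planned_gc_cutoff: Lsn, applied_gc_cutoff: Lsn, ancestor_lsn: Lsn) -> Lsn {
    // Usually the planned cutoff is newer than the last applied one, but be defensive.
    let gc_cutoff = planned_gc_cutoff.max(applied_gc_cutoff);
    gc_cutoff.max(ancestor_lsn)
}

fn main() {
    assert_eq!(
        min_searchable_lsn(Lsn(0x50), Lsn(0x40), Lsn(0x10)),
        Lsn(0x50) // planned cutoff wins over an older applied cutoff
    );
    assert_eq!(
        min_searchable_lsn(Lsn(0x20), Lsn(0x30), Lsn(0x70)),
        Lsn(0x70) // the branch point still dominates both cutoffs
    );
}
```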
For #10639 --- pageserver/src/pgdatadir_mapping.rs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index dcbf62b56c..00f332d797 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -612,11 +612,18 @@ impl Timeline { pausable_failpoint!("find-lsn-for-timestamp-pausable"); let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn(); + let gc_cutoff_planned = { + let gc_info = self.gc_info.read().unwrap(); + gc_info.min_cutoff() + }; + // Usually the planned cutoff is newer than the cutoff of the last gc run, + // but let's be defensive. + let gc_cutoff = gc_cutoff_planned.max(*gc_cutoff_lsn_guard); // We use this method to figure out the branching LSN for the new branch, but the // GC cutoff could be before the branching point and we cannot create a new branch // with LSN < `ancestor_lsn`. Thus, pick the maximum of these two to be // on the safe side. - let min_lsn = std::cmp::max(*gc_cutoff_lsn_guard, self.get_ancestor_lsn()); + let min_lsn = std::cmp::max(gc_cutoff, self.get_ancestor_lsn()); let max_lsn = self.get_last_record_lsn(); // LSNs are always 8-byte aligned. low/mid/high represent the From 977781e423ad614c4fe8c55735fb35707170a402 Mon Sep 17 00:00:00 2001 From: Alexander Lakhin Date: Thu, 6 Feb 2025 14:53:43 +0200 Subject: [PATCH 389/583] Enable sanitizers for postgres v17 (#10401) Add a build with sanitizers (asan, ubsan) to the CI pipeline and run tests on it. See https://github.com/neondatabase/neon/issues/6053 --------- Co-authored-by: Alexander Bayandin --- .../actions/run-python-test-set/action.yml | 2 + .github/workflows/_build-and-test-locally.yml | 35 +++-- .../build_and_test_with_sanitizers.yml | 133 ++++++++++++++++++ Makefile | 17 ++- compute_tools/src/bin/fast_import.rs | 8 ++ control_plane/src/background_process.rs | 8 +- control_plane/src/storage_controller.rs | 18 ++- libs/postgres_ffi/wal_craft/src/lib.rs | 10 +- libs/postgres_initdb/src/lib.rs | 8 ++ libs/utils/scripts/restore_from_wal.sh | 2 +- pageserver/src/walredo/process.rs | 8 ++ .../ingest_regress_test_result-new-format.py | 6 +- test_runner/fixtures/parametrize.py | 3 + test_runner/regress/test_compatibility.py | 7 +- test_runner/regress/test_pg_regress.py | 8 +- .../regress/test_subscriber_restart.py | 2 + 16 files changed, 253 insertions(+), 22 deletions(-) create mode 100644 .github/workflows/build_and_test_with_sanitizers.yml diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 9a0261d430..0eddfe5da6 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -121,6 +121,8 @@ runs: export DEFAULT_PG_VERSION=${PG_VERSION#v} export LD_LIBRARY_PATH=${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/lib export BENCHMARK_CONNSTR=${BENCHMARK_CONNSTR:-} + export ASAN_OPTIONS=detect_leaks=0:detect_stack_use_after_return=0:abort_on_error=1:strict_string_checks=1:check_initialization_order=1:strict_init_order=1 + export UBSAN_OPTIONS=abort_on_error=1:print_stacktrace=1 if [ "${BUILD_TYPE}" = "remote" ]; then export REMOTE_ENV=1 diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 1dec8106b4..3a6fbf4234 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -20,7 +20,7 @@ on: required: true type: string test-cfg: - description: 'a json object of 
postgres versions and lfc states to run regression tests on' + description: 'a json object of postgres versions and lfc/sanitizers states to build and run regression tests on' required: true type: string @@ -48,6 +48,8 @@ jobs: # io_uring will account the memory of the CQ and SQ as locked. # More details: https://github.com/neondatabase/neon/issues/6373#issuecomment-1905814391 options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 + strategy: + matrix: ${{ fromJSON(format('{{"include":{0}}}', inputs.test-cfg)) }} env: BUILD_TYPE: ${{ inputs.build-type }} GIT_VERSION: ${{ github.event.pull_request.head.sha || github.sha }} @@ -87,6 +89,7 @@ jobs: - name: Set env variables env: ARCH: ${{ inputs.arch }} + SANITIZERS: ${{ matrix.sanitizers }} run: | CARGO_FEATURES="--features testing" if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' ]]; then @@ -99,8 +102,14 @@ jobs: cov_prefix="" CARGO_FLAGS="--locked --release" fi + if [[ $SANITIZERS == 'enabled' ]]; then + make_vars="WITH_SANITIZERS=yes" + else + make_vars="" + fi { echo "cov_prefix=${cov_prefix}" + echo "make_vars=${make_vars}" echo "CARGO_FEATURES=${CARGO_FEATURES}" echo "CARGO_FLAGS=${CARGO_FLAGS}" echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo" @@ -136,35 +145,39 @@ jobs: - name: Build postgres v14 if: steps.cache_pg_14.outputs.cache-hit != 'true' - run: mold -run make postgres-v14 -j$(nproc) + run: mold -run make ${make_vars} postgres-v14 -j$(nproc) - name: Build postgres v15 if: steps.cache_pg_15.outputs.cache-hit != 'true' - run: mold -run make postgres-v15 -j$(nproc) + run: mold -run make ${make_vars} postgres-v15 -j$(nproc) - name: Build postgres v16 if: steps.cache_pg_16.outputs.cache-hit != 'true' - run: mold -run make postgres-v16 -j$(nproc) + run: mold -run make ${make_vars} postgres-v16 -j$(nproc) - name: Build postgres v17 if: steps.cache_pg_17.outputs.cache-hit != 'true' - run: mold -run make postgres-v17 -j$(nproc) + run: mold -run make ${make_vars} postgres-v17 -j$(nproc) - name: Build neon extensions - run: mold -run make neon-pg-ext -j$(nproc) + run: mold -run make ${make_vars} neon-pg-ext -j$(nproc) - name: Build walproposer-lib - run: mold -run make walproposer-lib -j$(nproc) + run: mold -run make ${make_vars} walproposer-lib -j$(nproc) - name: Run cargo build + env: + WITH_TESTS: ${{ matrix.sanitizers != 'enabled' && '--tests' || '' }} run: | - ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests + export ASAN_OPTIONS=detect_leaks=0 + ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins ${WITH_TESTS} # Do install *before* running rust tests because they might recompile the # binaries with different features/flags. 
- name: Install rust binaries env: ARCH: ${{ inputs.arch }} + SANITIZERS: ${{ matrix.sanitizers }} run: | # Install target binaries mkdir -p /tmp/neon/bin/ @@ -179,7 +192,7 @@ jobs: done # Install test executables and write list of all binaries (for code coverage) - if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' ]]; then + if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' && $SANITIZERS != 'enabled' ]]; then # Keep bloated coverage data files away from the rest of the artifact mkdir -p /tmp/coverage/ @@ -212,6 +225,7 @@ jobs: role-duration-seconds: 18000 # 5 hours - name: Run rust tests + if: ${{ matrix.sanitizers != 'enabled' }} env: NEXTEST_RETRIES: 3 run: | @@ -319,7 +333,7 @@ jobs: - name: Pytest regression tests continue-on-error: ${{ matrix.lfc_state == 'with-lfc' && inputs.build-type == 'debug' }} uses: ./.github/actions/run-python-test-set - timeout-minutes: 60 + timeout-minutes: ${{ matrix.sanitizers != 'enabled' && 60 || 180 }} with: build_type: ${{ inputs.build-type }} test_selection: regress @@ -337,6 +351,7 @@ jobs: PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task USE_LFC: ${{ matrix.lfc_state == 'with-lfc' && 'true' || 'false' }} + SANITIZERS: ${{ matrix.sanitizers }} # Temporary disable this step until we figure out why it's so flaky # Ref https://github.com/neondatabase/neon/issues/4540 diff --git a/.github/workflows/build_and_test_with_sanitizers.yml b/.github/workflows/build_and_test_with_sanitizers.yml new file mode 100644 index 0000000000..cf0de3f8dc --- /dev/null +++ b/.github/workflows/build_and_test_with_sanitizers.yml @@ -0,0 +1,133 @@ +name: Build and Test with Sanitizers + +on: + schedule: + # * is a special character in YAML so you have to quote this string + # ┌───────────── minute (0 - 59) + # │ ┌───────────── hour (0 - 23) + # │ │ ┌───────────── day of the month (1 - 31) + # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) + # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) + - cron: '0 1 * * *' # run once a day, timezone is utc + workflow_dispatch: + +defaults: + run: + shell: bash -euxo pipefail {0} + +concurrency: + # Allow only one workflow per any non-`main` branch. 
+ group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }} + cancel-in-progress: true + +env: + RUST_BACKTRACE: 1 + COPT: '-Werror' + +jobs: + tag: + runs-on: [ self-hosted, small ] + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned + outputs: + build-tag: ${{steps.build-tag.outputs.tag}} + + steps: + # Need `fetch-depth: 0` to count the number of commits in the branch + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Get build tag + run: | + echo run:$GITHUB_RUN_ID + echo ref:$GITHUB_REF_NAME + echo rev:$(git rev-list --count HEAD) + if [[ "$GITHUB_REF_NAME" == "main" ]]; then + echo "tag=$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT + elif [[ "$GITHUB_REF_NAME" == "release" ]]; then + echo "tag=release-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT + elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then + echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT + elif [[ "$GITHUB_REF_NAME" == "release-compute" ]]; then + echo "tag=release-compute-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT + else + echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release', 'release-proxy', 'release-compute'" + echo "tag=$GITHUB_RUN_ID" >> $GITHUB_OUTPUT + fi + shell: bash + id: build-tag + + build-build-tools-image: + uses: ./.github/workflows/build-build-tools-image.yml + secrets: inherit + + build-and-test-locally: + needs: [ tag, build-build-tools-image ] + strategy: + fail-fast: false + matrix: + arch: [ x64, arm64 ] + build-type: [ release ] + uses: ./.github/workflows/_build-and-test-locally.yml + with: + arch: ${{ matrix.arch }} + build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm + build-tag: ${{ needs.tag.outputs.build-tag }} + build-type: ${{ matrix.build-type }} + test-cfg: '[{"pg_version":"v17", "sanitizers": "enabled"}]' + secrets: inherit + + + create-test-report: + needs: [ build-and-test-locally, build-build-tools-image ] + if: ${{ !cancelled() }} + permissions: + id-token: write # aws-actions/configure-aws-credentials + statuses: write + contents: write + pull-requests: write + outputs: + report-url: ${{ steps.create-allure-report.outputs.report-url }} + + runs-on: [ self-hosted, small ] + container: + image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + options: --init + + steps: + - uses: actions/checkout@v4 + + - name: Create Allure report + if: ${{ !cancelled() }} + id: create-allure-report + uses: ./.github/actions/allure-report-generate + with: + store-test-results-into-db: true + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + env: + REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} + + - uses: actions/github-script@v7 + if: ${{ !cancelled() }} + with: + # Retry script for 5XX server errors: https://github.com/actions/github-script#retries + retries: 5 + script: | + const report = { + reportUrl: "${{ steps.create-allure-report.outputs.report-url }}", + reportJsonUrl: "${{ steps.create-allure-report.outputs.report-json-url }}", + } + + const coverage = {} + + const script = require("./scripts/comment-test-report.js") + await script({ + github, + context, + fetch, + report, + coverage, + }) diff --git a/Makefile b/Makefile index d1238caebf..42ee643bb5 100644 --- a/Makefile +++ b/Makefile @@ -10,18 +10,29 @@ ICU_PREFIX_DIR := /usr/local/icu # 
environment variable. # BUILD_TYPE ?= debug +WITH_SANITIZERS ?= no ifeq ($(BUILD_TYPE),release) PG_CONFIGURE_OPTS = --enable-debug --with-openssl PG_CFLAGS = -O2 -g3 $(CFLAGS) + PG_LDFLAGS = $(LDFLAGS) # Unfortunately, `--profile=...` is a nightly feature CARGO_BUILD_FLAGS += --release else ifeq ($(BUILD_TYPE),debug) PG_CONFIGURE_OPTS = --enable-debug --with-openssl --enable-cassert --enable-depend PG_CFLAGS = -O0 -g3 $(CFLAGS) + PG_LDFLAGS = $(LDFLAGS) else $(error Bad build type '$(BUILD_TYPE)', see Makefile for options) endif +ifeq ($(WITH_SANITIZERS),yes) + PG_CFLAGS += -fsanitize=address -fsanitize=undefined -fno-sanitize-recover + COPT += -Wno-error # to avoid failing on warnings induced by sanitizers + PG_LDFLAGS = -fsanitize=address -fsanitize=undefined -static-libasan -static-libubsan $(LDFLAGS) + export CC := gcc + export ASAN_OPTIONS := detect_leaks=0 +endif + ifeq ($(shell test -e /home/nonroot/.docker_build && echo -n yes),yes) # Exclude static build openssl, icu for local build (MacOS, Linux) # Only keep for build type release and debug @@ -33,7 +44,9 @@ endif UNAME_S := $(shell uname -s) ifeq ($(UNAME_S),Linux) # Seccomp BPF is only available for Linux - PG_CONFIGURE_OPTS += --with-libseccomp + ifneq ($(WITH_SANITIZERS),yes) + PG_CONFIGURE_OPTS += --with-libseccomp + endif else ifeq ($(UNAME_S),Darwin) PG_CFLAGS += -DUSE_PREFETCH ifndef DISABLE_HOMEBREW @@ -106,7 +119,7 @@ $(POSTGRES_INSTALL_DIR)/build/%/config.status: EXTRA_VERSION=$$(cd $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION && git rev-parse HEAD); \ (cd $(POSTGRES_INSTALL_DIR)/build/$$VERSION && \ env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION/configure \ - CFLAGS='$(PG_CFLAGS)' \ + CFLAGS='$(PG_CFLAGS)' LDFLAGS='$(PG_LDFLAGS)' \ $(PG_CONFIGURE_OPTS) --with-extra-version=" ($$EXTRA_VERSION)" \ --prefix=$(abspath $(POSTGRES_INSTALL_DIR))/$$VERSION > configure.log) diff --git a/compute_tools/src/bin/fast_import.rs b/compute_tools/src/bin/fast_import.rs index c8440afb64..1398f443dd 100644 --- a/compute_tools/src/bin/fast_import.rs +++ b/compute_tools/src/bin/fast_import.rs @@ -231,6 +231,14 @@ pub(crate) async fn main() -> anyhow::Result<()> { ]) .env_clear() .env("LD_LIBRARY_PATH", &pg_lib_dir) + .env( + "ASAN_OPTIONS", + std::env::var("ASAN_OPTIONS").unwrap_or_default(), + ) + .env( + "UBSAN_OPTIONS", + std::env::var("UBSAN_OPTIONS").unwrap_or_default(), + ) .stdout(std::process::Stdio::piped()) .stderr(std::process::Stdio::piped()) .spawn() diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs index af312d73a7..c668e68402 100644 --- a/control_plane/src/background_process.rs +++ b/control_plane/src/background_process.rs @@ -261,7 +261,13 @@ fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command { let mut filled_cmd = cmd.env_clear().env("RUST_BACKTRACE", backtrace_setting); // Pass through these environment variables to the command - for var in ["LLVM_PROFILE_FILE", "FAILPOINTS", "RUST_LOG"] { + for var in [ + "LLVM_PROFILE_FILE", + "FAILPOINTS", + "RUST_LOG", + "ASAN_OPTIONS", + "UBSAN_OPTIONS", + ] { if let Some(val) = std::env::var_os(var) { filled_cmd = filled_cmd.env(var, val); } diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index c41ff22d15..9a2d30c861 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -221,7 +221,17 @@ impl StorageController { "-p", &format!("{}", postgres_port), ]; - let exitcode = 
Command::new(bin_path).args(args).spawn()?.wait().await?; + let pg_lib_dir = self.get_pg_lib_dir().await.unwrap(); + let envs = [ + ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ]; + let exitcode = Command::new(bin_path) + .args(args) + .envs(envs) + .spawn()? + .wait() + .await?; Ok(exitcode.success()) } @@ -242,6 +252,11 @@ impl StorageController { let pg_bin_dir = self.get_pg_bin_dir().await?; let createdb_path = pg_bin_dir.join("createdb"); + let pg_lib_dir = self.get_pg_lib_dir().await.unwrap(); + let envs = [ + ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ]; let output = Command::new(&createdb_path) .args([ "-h", @@ -254,6 +269,7 @@ impl StorageController { &username(), DB_NAME, ]) + .envs(envs) .output() .await .expect("Failed to spawn createdb"); diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs index 9524a5149b..77dff4ac99 100644 --- a/libs/postgres_ffi/wal_craft/src/lib.rs +++ b/libs/postgres_ffi/wal_craft/src/lib.rs @@ -76,7 +76,15 @@ impl Conf { let mut cmd = Command::new(path); cmd.env_clear() .env("LD_LIBRARY_PATH", self.pg_lib_dir()?) - .env("DYLD_LIBRARY_PATH", self.pg_lib_dir()?); + .env("DYLD_LIBRARY_PATH", self.pg_lib_dir()?) + .env( + "ASAN_OPTIONS", + std::env::var("ASAN_OPTIONS").unwrap_or_default(), + ) + .env( + "UBSAN_OPTIONS", + std::env::var("UBSAN_OPTIONS").unwrap_or_default(), + ); Ok(cmd) } diff --git a/libs/postgres_initdb/src/lib.rs b/libs/postgres_initdb/src/lib.rs index 2f072354fb..ed54696861 100644 --- a/libs/postgres_initdb/src/lib.rs +++ b/libs/postgres_initdb/src/lib.rs @@ -64,6 +64,14 @@ pub async fn do_run_initdb(args: RunInitdbArgs<'_>) -> Result<(), Error> { .env_clear() .env("LD_LIBRARY_PATH", library_search_path) .env("DYLD_LIBRARY_PATH", library_search_path) + .env( + "ASAN_OPTIONS", + std::env::var("ASAN_OPTIONS").unwrap_or_default(), + ) + .env( + "UBSAN_OPTIONS", + std::env::var("UBSAN_OPTIONS").unwrap_or_default(), + ) .stdin(std::process::Stdio::null()) // stdout invocation produces the same output every time, we don't need it .stdout(std::process::Stdio::null()) diff --git a/libs/utils/scripts/restore_from_wal.sh b/libs/utils/scripts/restore_from_wal.sh index a8615c2337..f394d4c58d 100755 --- a/libs/utils/scripts/restore_from_wal.sh +++ b/libs/utils/scripts/restore_from_wal.sh @@ -39,7 +39,7 @@ function initdb_with_args { ;; esac - eval env -i LD_LIBRARY_PATH="$PG_BIN"/../lib "${cmd[*]}" + eval env -i LD_LIBRARY_PATH="$PG_BIN"/../lib ASAN_OPTIONS="${ASAN_OPTIONS-}" UBSAN_OPTIONS="${UBSAN_OPTIONS-}" "${cmd[*]}" } rm -fr "$DATA_DIR" diff --git a/pageserver/src/walredo/process.rs b/pageserver/src/walredo/process.rs index 7e9477cfbc..bf30b92ea5 100644 --- a/pageserver/src/walredo/process.rs +++ b/pageserver/src/walredo/process.rs @@ -79,6 +79,14 @@ impl WalRedoProcess { .env_clear() .env("LD_LIBRARY_PATH", &pg_lib_dir_path) .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path) + .env( + "ASAN_OPTIONS", + std::env::var("ASAN_OPTIONS").unwrap_or_default(), + ) + .env( + "UBSAN_OPTIONS", + std::env::var("UBSAN_OPTIONS").unwrap_or_default(), + ) // NB: The redo process is not trusted after we sent it the first // walredo work. Before that, it is trusted. 
Specifically, we trust // it to diff --git a/scripts/ingest_regress_test_result-new-format.py b/scripts/ingest_regress_test_result-new-format.py index ad2baf56bb..3a5cdf013a 100644 --- a/scripts/ingest_regress_test_result-new-format.py +++ b/scripts/ingest_regress_test_result-new-format.py @@ -32,6 +32,7 @@ CREATE TABLE IF NOT EXISTS results ( flaky BOOLEAN NOT NULL, arch arch DEFAULT 'X64', lfc BOOLEAN DEFAULT false NOT NULL, + sanitizers BOOLEAN DEFAULT false NOT NULL, build_type TEXT NOT NULL, pg_version INT NOT NULL, run_id BIGINT NOT NULL, @@ -39,7 +40,7 @@ CREATE TABLE IF NOT EXISTS results ( reference TEXT NOT NULL, revision CHAR(40) NOT NULL, raw JSONB COMPRESSION lz4 NOT NULL, - UNIQUE (parent_suite, suite, name, arch, build_type, pg_version, started_at, stopped_at, run_id) + UNIQUE (parent_suite, suite, name, arch, lfc, sanitizers, build_type, pg_version, started_at, stopped_at, run_id) ); """ @@ -56,6 +57,7 @@ class Row: flaky: bool arch: str lfc: bool + sanitizers: bool build_type: str pg_version: int run_id: int @@ -135,6 +137,7 @@ def ingest_test_result( } arch = parameters.get("arch", "UNKNOWN").strip("'") lfc = parameters.get("lfc", "without-lfc").strip("'") == "with-lfc" + sanitizers = parameters.get("sanitizers", "disabled").strip("'") == "enabled" build_type, pg_version, unparametrized_name = parse_test_name(test["name"]) labels = {label["name"]: label["value"] for label in test["labels"]} @@ -149,6 +152,7 @@ def ingest_test_result( flaky=test["flaky"] or test["retriesStatusChange"], arch=arch, lfc=lfc, + sanitizers=sanitizers, build_type=build_type, pg_version=pg_version, run_id=run_id, diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py index 1acb1af23b..c33342c89e 100644 --- a/test_runner/fixtures/parametrize.py +++ b/test_runner/fixtures/parametrize.py @@ -124,5 +124,8 @@ def pytest_runtest_makereport(*args, **kwargs): allure.dynamic.parameter( "__lfc", "with-lfc" if os.getenv("USE_LFC") != "false" else "without-lfc" ) + allure.dynamic.parameter( + "__sanitizers", "enabled" if os.getenv("SANITIZERS") == "enabled" else "disabled" + ) yield diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index cdc6c0053d..ba3078d493 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -314,7 +314,10 @@ def test_forward_compatibility( def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, repo_dir: Path): - ep = env.endpoints.create_start("main") + ep = env.endpoints.create("main") + ep_env = {"LD_LIBRARY_PATH": str(env.pg_distrib_dir / f"v{env.pg_version}/lib")} + ep.start(env=ep_env) + connstr = ep.connstr() pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version) @@ -363,7 +366,7 @@ def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, r ) # Timeline exists again: restart the endpoint - ep.start() + ep.start(env=ep_env) pg_bin.run_capture( ["pg_dumpall", f"--dbname={connstr}", f"--file={test_output_dir / 'dump-from-wal.sql'}"] diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index 2877f14e0e..c5ae669dce 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -120,7 +120,7 @@ def post_checks(env: NeonEnv, test_output_dir: Path, db_name: str, endpoint: End # Run the main PostgreSQL regression tests, in src/test/regress. 
# -@pytest.mark.timeout(900) # Contains many sub-tests, is slow in debug builds +@pytest.mark.timeout(3000) # Contains many sub-tests, is slow in debug builds @pytest.mark.parametrize("shard_count", [None, 4]) def test_pg_regress( neon_env_builder: NeonEnvBuilder, @@ -194,7 +194,7 @@ def test_pg_regress( # Run the PostgreSQL "isolation" tests, in src/test/isolation. # -@pytest.mark.timeout(600) # Contains many sub-tests, is slow in debug builds +@pytest.mark.timeout(1500) # Contains many sub-tests, is slow in debug builds @pytest.mark.parametrize("shard_count", [None, 4]) def test_isolation( neon_env_builder: NeonEnvBuilder, @@ -222,6 +222,8 @@ def test_isolation( "max_prepared_transactions=100", # Enable the test mode, so that we don't need to patch the test cases. "neon.regress_test_mode = true", + # Stack size should be increased for tests to pass with asan. + "max_stack_depth = 4MB", ], ) endpoint.safe_psql(f"CREATE DATABASE {DBNAME}") @@ -417,7 +419,7 @@ def test_tx_abort_with_many_relations( try: # Rollback phase should be fast: this is one WAL record that we should process efficiently fut = exec.submit(rollback_and_wait) - fut.result(timeout=5) + fut.result(timeout=15) except: exec.shutdown(wait=False, cancel_futures=True) raise diff --git a/test_runner/regress/test_subscriber_restart.py b/test_runner/regress/test_subscriber_restart.py index 7d4f66d044..8ad7282ea2 100644 --- a/test_runner/regress/test_subscriber_restart.py +++ b/test_runner/regress/test_subscriber_restart.py @@ -3,12 +3,14 @@ from __future__ import annotations import threading import time +import pytest from fixtures.neon_fixtures import NeonEnv from fixtures.utils import wait_until # This test checks of logical replication subscriber is able to correctly restart replication without receiving duplicates. # It requires tracking information about replication origins at page server side +@pytest.mark.timeout(900) # This test is slow with sanitizers enabled, especially on ARM def test_subscriber_restart(neon_simple_env: NeonEnv): env = neon_simple_env env.create_branch("publisher") From f22d41eaec1b6d96c71940f2d4e27bcc04d7617a Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Thu, 6 Feb 2025 09:39:37 -0500 Subject: [PATCH 390/583] feat(pageserver): num of background job metrics (#10690) ## Problem We need a metrics to know what's going on in pageserver's background jobs. ## Summary of changes * Waiting tasks: task still waiting for the semaphore. * Running tasks: tasks doing their actual jobs. 
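As a rough standalone sketch of the gauge lifecycle (hypothetical `WAITING`/`RUNNING` counters, not the prometheus metrics added by this patch): a task increments the waiting gauge when it starts queueing, moves to the running gauge once the semaphore is acquired, and decrements whichever gauge it currently occupies when it is dropped.

```rust
use std::sync::atomic::{AtomicI64, Ordering};

// Hypothetical gauges; the real patch uses prometheus IntGauge vectors.
static WAITING: AtomicI64 = AtomicI64::new(0);
static RUNNING: AtomicI64 = AtomicI64::new(0);

struct Recorder {
    acquired: bool,
}

impl Recorder {
    fn start() -> Self {
        WAITING.fetch_add(1, Ordering::Relaxed); // task is now queued on the semaphore
        Recorder { acquired: false }
    }

    fn acquired(&mut self) {
        // Permit obtained: stop counting as waiting, start counting as running.
        WAITING.fetch_sub(1, Ordering::Relaxed);
        RUNNING.fetch_add(1, Ordering::Relaxed);
        self.acquired = true;
    }
}

impl Drop for Recorder {
    fn drop(&mut self) {
        // Either cancelled while still waiting, or finished while running.
        if self.acquired {
            RUNNING.fetch_sub(1, Ordering::Relaxed);
        } else {
            WAITING.fetch_sub(1, Ordering::Relaxed);
        }
    }
}

fn main() {
    let mut rec = Recorder::start();
    assert_eq!(WAITING.load(Ordering::Relaxed), 1);
    rec.acquired();
    assert_eq!((WAITING.load(Ordering::Relaxed), RUNNING.load(Ordering::Relaxed)), (0, 1));
    drop(rec);
    assert_eq!(RUNNING.load(Ordering::Relaxed), 0);
}
```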
--------- Signed-off-by: Alex Chi Z Co-authored-by: Erik Grinaker --- pageserver/src/metrics.rs | 88 +++++++++++++++---- pageserver/src/tenant/tasks.rs | 23 +++-- .../src/tenant/timeline/eviction_task.rs | 11 ++- 3 files changed, 96 insertions(+), 26 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 6ab1178a7b..1cc18d83ce 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -2214,6 +2214,8 @@ pub(crate) static TENANT_TASK_EVENTS: Lazy = Lazy::new(|| { pub struct BackgroundLoopSemaphoreMetrics { counters: EnumMap, durations: EnumMap, + waiting_tasks: EnumMap, + running_tasks: EnumMap, } pub(crate) static BACKGROUND_LOOP_SEMAPHORE: Lazy = Lazy::new( @@ -2234,6 +2236,20 @@ pub(crate) static BACKGROUND_LOOP_SEMAPHORE: Lazy::from_usize(i); @@ -2243,29 +2259,69 @@ pub(crate) static BACKGROUND_LOOP_SEMAPHORE: Lazy::from_usize(i); durations.with_label_values(&[kind.into()]) })), + waiting_tasks: enum_map::EnumMap::from_array(std::array::from_fn(|i| { + let kind = ::from_usize(i); + waiting_tasks.with_label_values(&[kind.into()]) + })), + running_tasks: enum_map::EnumMap::from_array(std::array::from_fn(|i| { + let kind = ::from_usize(i); + running_tasks.with_label_values(&[kind.into()]) + })), } }, ); impl BackgroundLoopSemaphoreMetrics { - pub(crate) fn measure_acquisition(&self, task: BackgroundLoopKind) -> impl Drop + '_ { - struct Record<'a> { - metrics: &'a BackgroundLoopSemaphoreMetrics, - task: BackgroundLoopKind, - _counter_guard: metrics::IntCounterPairGuard, - start: Instant, - } - impl Drop for Record<'_> { - fn drop(&mut self) { - let elapsed = self.start.elapsed().as_secs_f64(); - self.metrics.durations[self.task].inc_by(elapsed); - } - } - Record { - metrics: self, + /// Starts recording semaphore metrics. Call `acquired()` on the returned recorder when the + /// semaphore is acquired, and drop it when the task completes or is cancelled. + pub(crate) fn record( + &self, + task: BackgroundLoopKind, + ) -> BackgroundLoopSemaphoreMetricsRecorder { + BackgroundLoopSemaphoreMetricsRecorder::start(self, task) + } +} + +/// Records metrics for a background task. +pub struct BackgroundLoopSemaphoreMetricsRecorder<'a> { + metrics: &'a BackgroundLoopSemaphoreMetrics, + task: BackgroundLoopKind, + start: Instant, + wait_counter_guard: Option, +} + +impl<'a> BackgroundLoopSemaphoreMetricsRecorder<'a> { + /// Starts recording semaphore metrics, by recording wait time and incrementing + /// `wait_start_count` and `waiting_tasks`. + fn start(metrics: &'a BackgroundLoopSemaphoreMetrics, task: BackgroundLoopKind) -> Self { + metrics.waiting_tasks[task].inc(); + Self { + metrics, task, - _counter_guard: self.counters[task].guard(), start: Instant::now(), + wait_counter_guard: Some(metrics.counters[task].guard()), + } + } + + /// Signals that the semaphore has been acquired, and updates relevant metrics. + pub fn acquired(&mut self) { + self.wait_counter_guard.take().expect("already acquired"); + self.metrics.durations[self.task].inc_by(self.start.elapsed().as_secs_f64()); + self.metrics.waiting_tasks[self.task].dec(); + self.metrics.running_tasks[self.task].inc(); + } +} + +impl Drop for BackgroundLoopSemaphoreMetricsRecorder<'_> { + /// The task either completed or was cancelled. + fn drop(&mut self) { + if self.wait_counter_guard.take().is_some() { + // Waiting. + self.metrics.durations[self.task].inc_by(self.start.elapsed().as_secs_f64()); + self.metrics.waiting_tasks[self.task].dec(); + } else { + // Running. 
+ self.metrics.running_tasks[self.task].dec(); } } } diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index d65f099182..1c3237d0bd 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -7,7 +7,7 @@ use std::sync::Arc; use std::time::{Duration, Instant}; use crate::context::{DownloadBehavior, RequestContext}; -use crate::metrics::TENANT_TASK_EVENTS; +use crate::metrics::{BackgroundLoopSemaphoreMetricsRecorder, TENANT_TASK_EVENTS}; use crate::task_mgr; use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME}; use crate::tenant::throttle::Stats; @@ -61,21 +61,32 @@ impl BackgroundLoopKind { } } +pub struct BackgroundLoopSemaphorePermit<'a> { + _permit: tokio::sync::SemaphorePermit<'static>, + _recorder: BackgroundLoopSemaphoreMetricsRecorder<'a>, +} + /// Cancellation safe. pub(crate) async fn concurrent_background_tasks_rate_limit_permit( loop_kind: BackgroundLoopKind, _ctx: &RequestContext, -) -> tokio::sync::SemaphorePermit<'static> { - let _guard = crate::metrics::BACKGROUND_LOOP_SEMAPHORE.measure_acquisition(loop_kind); +) -> BackgroundLoopSemaphorePermit<'static> { + let mut recorder = crate::metrics::BACKGROUND_LOOP_SEMAPHORE.record(loop_kind); if loop_kind == BackgroundLoopKind::InitialLogicalSizeCalculation { pausable_failpoint!("initial-size-calculation-permit-pause"); } // TODO: assert that we run on BACKGROUND_RUNTIME; requires tokio_unstable Handle::id(); - match CONCURRENT_BACKGROUND_TASKS.acquire().await { - Ok(permit) => permit, - Err(_closed) => unreachable!("we never close the semaphore"), + let permit = CONCURRENT_BACKGROUND_TASKS + .acquire() + .await + .expect("should never close"); + recorder.acquired(); + + BackgroundLoopSemaphorePermit { + _permit: permit, + _recorder: recorder, } } diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 26c2861b93..9836aafecb 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -30,8 +30,11 @@ use crate::{ pgdatadir_mapping::CollectKeySpaceError, task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, tenant::{ - size::CalculateSyntheticSizeError, storage_layer::LayerVisibilityHint, - tasks::BackgroundLoopKind, timeline::EvictionError, LogicalSizeCalculationCause, Tenant, + size::CalculateSyntheticSizeError, + storage_layer::LayerVisibilityHint, + tasks::{BackgroundLoopKind, BackgroundLoopSemaphorePermit}, + timeline::EvictionError, + LogicalSizeCalculationCause, Tenant, }, }; @@ -330,7 +333,7 @@ impl Timeline { &self, cancel: &CancellationToken, ctx: &RequestContext, - ) -> ControlFlow<(), tokio::sync::SemaphorePermit<'static>> { + ) -> ControlFlow<(), BackgroundLoopSemaphorePermit<'static>> { let acquire_permit = crate::tenant::tasks::concurrent_background_tasks_rate_limit_permit( BackgroundLoopKind::Eviction, ctx, @@ -374,7 +377,7 @@ impl Timeline { p: &EvictionPolicyLayerAccessThreshold, cancel: &CancellationToken, gate: &GateGuard, - permit: tokio::sync::SemaphorePermit<'static>, + permit: BackgroundLoopSemaphorePermit<'static>, ctx: &RequestContext, ) -> ControlFlow<()> { if !self.tenant_shard_id.is_shard_zero() { From 839f41f5bb5ef072972eefded5e5ccc32429b6e3 Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Thu, 6 Feb 2025 15:39:45 +0100 Subject: [PATCH 391/583] fix pgcopydb seg fault and -c idle_in_transaction_session_timeout=0 (#10692) ## Problem During ingest_benchmark which uses `pgcopydb` ([see](https://github.com/dimitri/pgcopydb))we sometimes had 
outages. - when PostgreSQL COPY step failed we got a segfault (reported [here](https://github.com/dimitri/pgcopydb/issues/899)) - the root cause was Neon idle_in_transaction_session_timeout is set to 5 minutes which is suboptimal for long-running tasks like project import (reported [here](https://github.com/dimitri/pgcopydb/issues/900)) ## Summary of changes Patch pgcopydb to avoid segfault. override idle_in_transaction_session_timeout and set it to "unlimited" --- .dockerignore | 1 + build-tools.Dockerfile | 3 ++ build_tools/patches/pgcopydbv017.patch | 37 +++++++++++++++++++ .../test_perf_ingest_using_pgcopydb.py | 2 +- 4 files changed, 42 insertions(+), 1 deletion(-) create mode 100644 build_tools/patches/pgcopydbv017.patch diff --git a/.dockerignore b/.dockerignore index 9e2d2e7108..7ead48db7c 100644 --- a/.dockerignore +++ b/.dockerignore @@ -24,3 +24,4 @@ !storage_controller/ !vendor/postgres-*/ !workspace_hack/ +!build_tools/patches diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile index 3ade57b175..52874d2ef6 100644 --- a/build-tools.Dockerfile +++ b/build-tools.Dockerfile @@ -12,6 +12,8 @@ RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \ echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc && \ echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc +COPY build_tools/patches/pgcopydbv017.patch /pgcopydbv017.patch + RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \ set -e && \ apt update && \ @@ -44,6 +46,7 @@ RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \ mkdir /tmp/pgcopydb && \ tar -xzf /tmp/pgcopydb.tar.gz -C /tmp/pgcopydb --strip-components=1 && \ cd /tmp/pgcopydb && \ + patch -p1 < /pgcopydbv017.patch && \ make -s clean && \ make -s -j12 install && \ libpq_path=$(find /lib /usr/lib -name "libpq.so.5" | head -n 1) && \ diff --git a/build_tools/patches/pgcopydbv017.patch b/build_tools/patches/pgcopydbv017.patch new file mode 100644 index 0000000000..c309d8fe59 --- /dev/null +++ b/build_tools/patches/pgcopydbv017.patch @@ -0,0 +1,37 @@ +diff --git a/src/bin/pgcopydb/copydb.c b/src/bin/pgcopydb/copydb.c +index d730b03..69a9be9 100644 +--- a/src/bin/pgcopydb/copydb.c ++++ b/src/bin/pgcopydb/copydb.c +@@ -44,6 +44,7 @@ GUC dstSettings[] = { + { "synchronous_commit", "'off'" }, + { "statement_timeout", "0" }, + { "lock_timeout", "0" }, ++ { "idle_in_transaction_session_timeout", "0" }, + { NULL, NULL }, + }; + +diff --git a/src/bin/pgcopydb/pgsql.c b/src/bin/pgcopydb/pgsql.c +index 94f2f46..86b9448 100644 +--- a/src/bin/pgcopydb/pgsql.c ++++ b/src/bin/pgcopydb/pgsql.c +@@ -3174,11 +3174,18 @@ pgcopy_log_error(PGSQL *pgsql, PGresult *res, const char *context) + /* errors have already been logged */ + return; + } +- + if (res != NULL) + { + char *sqlstate = PQresultErrorField(res, PG_DIAG_SQLSTATE); +- strlcpy(pgsql->sqlstate, sqlstate, sizeof(pgsql->sqlstate)); ++ if (sqlstate == NULL) ++ { ++ // PQresultErrorField returned NULL! 
++ pgsql->sqlstate[0] = '\0'; // Set to an empty string to avoid segfault ++ } ++ else ++ { ++ strlcpy(pgsql->sqlstate, sqlstate, sizeof(pgsql->sqlstate)); ++ } + } + + char *endpoint = diff --git a/test_runner/performance/test_perf_ingest_using_pgcopydb.py b/test_runner/performance/test_perf_ingest_using_pgcopydb.py index f0a0c1f5a2..da62422fca 100644 --- a/test_runner/performance/test_perf_ingest_using_pgcopydb.py +++ b/test_runner/performance/test_perf_ingest_using_pgcopydb.py @@ -136,7 +136,7 @@ def run_command_and_log_output(command, log_file_path: Path): "LD_LIBRARY_PATH": f"{os.getenv('PGCOPYDB_LIB_PATH')}:{os.getenv('PG_16_LIB_PATH')}", "PGCOPYDB_SOURCE_PGURI": cast(str, os.getenv("BENCHMARK_INGEST_SOURCE_CONNSTR")), "PGCOPYDB_TARGET_PGURI": cast(str, os.getenv("BENCHMARK_INGEST_TARGET_CONNSTR")), - "PGOPTIONS": "-c maintenance_work_mem=8388608 -c max_parallel_maintenance_workers=7", + "PGOPTIONS": "-c idle_in_transaction_session_timeout=0 -c maintenance_work_mem=8388608 -c max_parallel_maintenance_workers=7", } # Combine the current environment with custom variables env = os.environ.copy() From ddd7c363430de8126c69edb02903a6e2bf7a1919 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 6 Feb 2025 14:40:22 +0000 Subject: [PATCH 392/583] CI(approved-for-ci-run): Use internal CI_ACCESS_TOKEN for cloning repo (#10693) ## Problem The default `GITHUB_TOKEN` is used to push changes created with `approved-for-ci-run`, which doesn't work: ``` Run git push --force origin "${BRANCH}" remote: Permission to neondatabase/neon.git denied to github-actions[bot]. fatal: unable to access 'https://github.com/neondatabase/neon/': The requested URL returned error: 403 ``` Ref: https://github.com/neondatabase/neon/actions/runs/13166108303/job/36746518291?pr=10687 ## Summary of changes - Use `CI_ACCESS_TOKEN` to clone an external repo - Remove unneeded `actions/checkout` --- .github/workflows/approved-for-ci-run.yml | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/.github/workflows/approved-for-ci-run.yml b/.github/workflows/approved-for-ci-run.yml index fc2f36c74b..f4e1e2e96c 100644 --- a/.github/workflows/approved-for-ci-run.yml +++ b/.github/workflows/approved-for-ci-run.yml @@ -67,9 +67,9 @@ jobs: - uses: actions/checkout@v4 with: - ref: main + ref: ${{ github.event.pull_request.head.sha }} token: ${{ secrets.CI_ACCESS_TOKEN }} - + - name: Look for existing PR id: get-pr env: @@ -77,7 +77,7 @@ jobs: run: | ALREADY_CREATED="$(gh pr --repo ${GITHUB_REPOSITORY} list --head ${BRANCH} --base main --json number --jq '.[].number')" echo "ALREADY_CREATED=${ALREADY_CREATED}" >> ${GITHUB_OUTPUT} - + - name: Get changed labels id: get-labels if: steps.get-pr.outputs.ALREADY_CREATED != '' @@ -94,10 +94,6 @@ jobs: echo "LABELS_TO_ADD=${LABELS_TO_ADD}" >> ${GITHUB_OUTPUT} echo "LABELS_TO_REMOVE=${LABELS_TO_REMOVE}" >> ${GITHUB_OUTPUT} - - uses: actions/checkout@v4 - with: - ref: ${{ github.event.pull_request.head.sha }} - - run: git checkout -b "${BRANCH}" - run: git push --force origin "${BRANCH}" @@ -105,7 +101,7 @@ jobs: - name: Create a Pull Request for CI run (if required) if: steps.get-pr.outputs.ALREADY_CREATED == '' - env: + env: GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} run: | cat << EOF > body.md @@ -142,7 +138,7 @@ jobs: - run: git push --force origin "${BRANCH}" if: steps.get-pr.outputs.ALREADY_CREATED != '' - + cleanup: # Close PRs and delete branchs if the original PR is closed. 
From df06c410852e51e01528ba63942f2361b8f6f68b Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 6 Feb 2025 15:18:50 +0000 Subject: [PATCH 393/583] tests: don't detach from controller in test_issue_5878 (#10675) ## Problem This test called NeonPageserver.tenant_detach, which as well as detaching locally on the pageserver, also updates the storage controller to put the tenant into Detached mode. When the test runs slowly in debug mode, it sometimes takes long enough that the background_reconcile loop wakes up and drops the tenant from memory in response, such that the pageserver can't validate its deletions and the test does not behave as expected. Closes: https://github.com/neondatabase/neon/issues/10513 ## Summary of changes - Call the pageserver HTTP client directly rather than going via NeonPageserver.tenant_detach --- test_runner/regress/test_layers_from_future.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py index 5e06a1d47f..872d3dc4cf 100644 --- a/test_runner/regress/test_layers_from_future.py +++ b/test_runner/regress/test_layers_from_future.py @@ -172,7 +172,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder, attach_mode: str): # force removal of layers from the future tenant_conf = ps_http.tenant_config(tenant_id) generation_before_detach = get_generation_number() - env.pageserver.tenant_detach(tenant_id) + env.pageserver.http_client().tenant_detach(tenant_id) failpoint_deletion_queue = "deletion-queue-before-execute-pause" ps_http.configure_failpoints((failpoint_deletion_queue, "pause")) From 2943590694c57bde91eed71aa92bc96c18abb152 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 6 Feb 2025 18:17:47 +0100 Subject: [PATCH 394/583] pageserver: use histogram for background job semaphore waits (#10697) ## Problem We don't have visibility into how long an individual background job is waiting for a semaphore permit. ## Summary of changes * Make `pageserver_background_loop_semaphore_wait_seconds` a histogram rather than a sum. * Add a paced warning when a task takes more than 10 minutes to get a permit (for now). * Drive-by cleanup of some `EnumMap` usage. --- pageserver/src/metrics.rs | 58 ++++++++++++++++++---------------- pageserver/src/tenant/tasks.rs | 38 ++++++++++++++++------ 2 files changed, 58 insertions(+), 38 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 1cc18d83ce..3b8612a3fa 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -6,7 +6,7 @@ use std::sync::{Arc, Mutex}; use std::task::{Context, Poll}; use std::time::{Duration, Instant}; -use enum_map::EnumMap; +use enum_map::{Enum as _, EnumMap}; use futures::Future; use metrics::{ register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec, @@ -104,7 +104,7 @@ pub(crate) static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy = Lazy::n .expect("failed to define a metric") }); -// Buckets for background operations like compaction, GC, size calculation +// Buckets for background operation duration in seconds, like compaction, GC, size calculation. 
const STORAGE_OP_BUCKETS: &[f64] = &[0.010, 0.100, 1.0, 10.0, 100.0, 1000.0]; pub(crate) static STORAGE_TIME_GLOBAL: Lazy = Lazy::new(|| { @@ -236,7 +236,7 @@ pub(crate) static GET_VECTORED_LATENCY: Lazy = Lazy::new(|| GetVectoredLatency { map: EnumMap::from_array(std::array::from_fn(|task_kind_idx| { - let task_kind = ::from_usize(task_kind_idx); + let task_kind = TaskKind::from_usize(task_kind_idx); if GetVectoredLatency::TRACKED_TASK_KINDS.contains(&task_kind) { let task_kind = task_kind.into(); @@ -259,7 +259,7 @@ pub(crate) static SCAN_LATENCY: Lazy = Lazy::new(|| { ScanLatency { map: EnumMap::from_array(std::array::from_fn(|task_kind_idx| { - let task_kind = ::from_usize(task_kind_idx); + let task_kind = TaskKind::from_usize(task_kind_idx); if ScanLatency::TRACKED_TASK_KINDS.contains(&task_kind) { let task_kind = task_kind.into(); @@ -300,10 +300,10 @@ static PAGE_CACHE_READ_ACCESSES: Lazy = Lazy::new(|| { pub(crate) static PAGE_CACHE: Lazy = Lazy::new(|| PageCacheMetrics { map: EnumMap::from_array(std::array::from_fn(|task_kind| { - let task_kind = ::from_usize(task_kind); + let task_kind = TaskKind::from_usize(task_kind); let task_kind: &'static str = task_kind.into(); EnumMap::from_array(std::array::from_fn(|content_kind| { - let content_kind = ::from_usize(content_kind); + let content_kind = PageContentKind::from_usize(content_kind); let content_kind: &'static str = content_kind.into(); PageCacheMetricsForTaskKind { read_accesses_immutable: { @@ -1913,7 +1913,7 @@ pub(crate) static COMPUTE_COMMANDS_COUNTERS: Lazy = Lazy ComputeCommandCounters { map: EnumMap::from_array(std::array::from_fn(|i| { - let command = ::from_usize(i); + let command = ComputeCommandKind::from_usize(i); let command_str: &'static str = command.into(); inner.with_label_values(&[command_str]) })), @@ -2213,13 +2213,13 @@ pub(crate) static TENANT_TASK_EVENTS: Lazy = Lazy::new(|| { pub struct BackgroundLoopSemaphoreMetrics { counters: EnumMap, - durations: EnumMap, + durations: EnumMap, waiting_tasks: EnumMap, running_tasks: EnumMap, } -pub(crate) static BACKGROUND_LOOP_SEMAPHORE: Lazy = Lazy::new( - || { +pub(crate) static BACKGROUND_LOOP_SEMAPHORE: Lazy = + Lazy::new(|| { let counters = register_int_counter_pair_vec!( "pageserver_background_loop_semaphore_wait_start_count", "Counter for background loop concurrency-limiting semaphore acquire calls started", @@ -2229,10 +2229,11 @@ pub(crate) static BACKGROUND_LOOP_SEMAPHORE: Lazy::from_usize(i); + counters: EnumMap::from_array(std::array::from_fn(|i| { + let kind = BackgroundLoopKind::from_usize(i); counters.with_label_values(&[kind.into()]) })), - durations: enum_map::EnumMap::from_array(std::array::from_fn(|i| { - let kind = ::from_usize(i); + durations: EnumMap::from_array(std::array::from_fn(|i| { + let kind = BackgroundLoopKind::from_usize(i); durations.with_label_values(&[kind.into()]) })), - waiting_tasks: enum_map::EnumMap::from_array(std::array::from_fn(|i| { - let kind = ::from_usize(i); + waiting_tasks: EnumMap::from_array(std::array::from_fn(|i| { + let kind = BackgroundLoopKind::from_usize(i); waiting_tasks.with_label_values(&[kind.into()]) })), - running_tasks: enum_map::EnumMap::from_array(std::array::from_fn(|i| { - let kind = ::from_usize(i); + running_tasks: EnumMap::from_array(std::array::from_fn(|i| { + let kind = BackgroundLoopKind::from_usize(i); running_tasks.with_label_values(&[kind.into()]) })), } - }, -); + }); impl BackgroundLoopSemaphoreMetrics { /// Starts recording semaphore metrics. 
Call `acquired()` on the returned recorder when the @@ -2304,11 +2304,13 @@ impl<'a> BackgroundLoopSemaphoreMetricsRecorder<'a> { } /// Signals that the semaphore has been acquired, and updates relevant metrics. - pub fn acquired(&mut self) { + pub fn acquired(&mut self) -> Duration { + let waited = self.start.elapsed(); self.wait_counter_guard.take().expect("already acquired"); - self.metrics.durations[self.task].inc_by(self.start.elapsed().as_secs_f64()); + self.metrics.durations[self.task].observe(waited.as_secs_f64()); self.metrics.waiting_tasks[self.task].dec(); self.metrics.running_tasks[self.task].inc(); + waited } } @@ -2317,7 +2319,7 @@ impl Drop for BackgroundLoopSemaphoreMetricsRecorder<'_> { fn drop(&mut self) { if self.wait_counter_guard.take().is_some() { // Waiting. - self.metrics.durations[self.task].inc_by(self.start.elapsed().as_secs_f64()); + self.metrics.durations[self.task].observe(self.start.elapsed().as_secs_f64()); self.metrics.waiting_tasks[self.task].dec(); } else { // Running. @@ -2570,7 +2572,7 @@ pub(crate) static WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM: Lazy = pub(crate) struct WalRedoProcessCounters { pub(crate) started: IntCounter, - pub(crate) killed_by_cause: enum_map::EnumMap, + pub(crate) killed_by_cause: EnumMap, pub(crate) active_stderr_logger_tasks_started: IntCounter, pub(crate) active_stderr_logger_tasks_finished: IntCounter, } @@ -2612,7 +2614,7 @@ impl Default for WalRedoProcessCounters { Self { started, killed_by_cause: EnumMap::from_array(std::array::from_fn(|i| { - let cause = ::from_usize(i); + let cause = WalRedoKillCause::from_usize(i); let cause_str: &'static str = cause.into(); killed.with_label_values(&[cause_str]) })), diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 1c3237d0bd..0f10dd7e10 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -3,7 +3,7 @@ use std::ops::ControlFlow; use std::str::FromStr; -use std::sync::Arc; +use std::sync::{Arc, Mutex}; use std::time::{Duration, Instant}; use crate::context::{DownloadBehavior, RequestContext}; @@ -14,9 +14,11 @@ use crate::tenant::throttle::Stats; use crate::tenant::timeline::compaction::CompactionOutcome; use crate::tenant::timeline::CompactionError; use crate::tenant::{Tenant, TenantState}; +use once_cell::sync::Lazy; use rand::Rng; use tokio_util::sync::CancellationToken; use tracing::*; +use utils::rate_limit::RateLimit; use utils::{backoff, completion, pausable_failpoint}; static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy = @@ -41,7 +43,16 @@ static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy &'static str { - self.into() - } -} - pub struct BackgroundLoopSemaphorePermit<'a> { _permit: tokio::sync::SemaphorePermit<'static>, _recorder: BackgroundLoopSemaphoreMetricsRecorder<'a>, @@ -71,6 +76,11 @@ pub(crate) async fn concurrent_background_tasks_rate_limit_permit( loop_kind: BackgroundLoopKind, _ctx: &RequestContext, ) -> BackgroundLoopSemaphorePermit<'static> { + // TODO: use a lower threshold and remove the pacer once we resolve some blockage. 
+ const WARN_THRESHOLD: Duration = Duration::from_secs(600); + static WARN_PACER: Lazy> = + Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); + let mut recorder = crate::metrics::BACKGROUND_LOOP_SEMAPHORE.record(loop_kind); if loop_kind == BackgroundLoopKind::InitialLogicalSizeCalculation { @@ -82,7 +92,15 @@ pub(crate) async fn concurrent_background_tasks_rate_limit_permit( .acquire() .await .expect("should never close"); - recorder.acquired(); + + let waited = recorder.acquired(); + if waited >= WARN_THRESHOLD { + let waited = waited.as_secs_f64(); + WARN_PACER + .lock() + .unwrap() + .call(|| warn!("{loop_kind} task waited {waited:.3}s for semaphore permit")); + } BackgroundLoopSemaphorePermit { _permit: permit, @@ -628,7 +646,7 @@ pub(crate) fn warn_when_period_overrun( "task iteration took longer than the configured period" ); crate::metrics::BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT - .with_label_values(&[task.as_static_str(), &format!("{}", period.as_secs())]) + .with_label_values(&[task.into(), &format!("{}", period.as_secs())]) .inc(); } } From 82cbab75123d9e413c432ff8adec2f633056af77 Mon Sep 17 00:00:00 2001 From: OBBO67 <35974943+OBBO67@users.noreply.github.com> Date: Thu, 6 Feb 2025 17:26:26 +0000 Subject: [PATCH 395/583] Switch reqlsns[0].request_lsn to arrow operator in neon_read_at_lsnv() (#10620) (#10687) ## Problem Currently the following line below uses array subscript notation which is confusing since `reqlsns` is not an array but just a pointer to a struct. ``` XLogWaitForReplayOf(reqlsns[0].request_lsn); ``` ## Summary of changes Switch from array subscript notation to arrow operator to improve readability of code. Close #10620. --- pgxn/neon/pagestore_smgr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 012bd479bc..8051970176 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -3011,7 +3011,7 @@ neon_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber base_block start_ts = GetCurrentTimestamp(); if (RecoveryInProgress() && MyBackendType != B_STARTUP) - XLogWaitForReplayOf(reqlsns[0].request_lsn); + XLogWaitForReplayOf(reqlsns->request_lsn); /* * Try to find prefetched page in the list of received pages. From 186199f406fbc0d6b46b9edf0ae79ca2f1a53cb2 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 6 Feb 2025 20:28:27 +0300 Subject: [PATCH 396/583] Update aws sdk (#10699) ## Problem We have unclear issue with stuck s3 client, probably after partial aws sdk update without updating sdk-s3. https://github.com/neondatabase/neon/pull/10588 Let's try to update s3 as well. 
## Summary of changes Result of running cargo update -p aws-types -p aws-sigv4 -p aws-credential-types -p aws-smithy-types -p aws-smithy-async -p aws-sdk-kms -p aws-sdk-iam -p aws-sdk-s3 -p aws-config ref https://github.com/neondatabase/neon/issues/10695 --- Cargo.lock | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index de1b1218ca..2c5b0a113f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -368,9 +368,9 @@ dependencies = [ [[package]] name = "aws-sdk-iam" -version = "1.53.0" +version = "1.60.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb8a6fea8d335cde419176b1f2c6d2d6e97997719e7df4b51e59064310f48e4a" +checksum = "a43daa438f8e7e4ebbbcb5c712b3b85db50d62e637a7da4ba9da51095d327460" dependencies = [ "aws-credential-types", "aws-runtime", @@ -391,9 +391,9 @@ dependencies = [ [[package]] name = "aws-sdk-kms" -version = "1.51.0" +version = "1.58.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c30f6fd5646b99d9b45ec3a0c22e67112c175b2383100c960d7ee39d96c8d96" +checksum = "40b7a24700ac548025a47a5c579886f5198895bb1eccd8964dfd71cd66c16912" dependencies = [ "aws-credential-types", "aws-runtime", @@ -413,9 +413,9 @@ dependencies = [ [[package]] name = "aws-sdk-s3" -version = "1.65.0" +version = "1.68.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d3ba2c5c0f2618937ce3d4a5ad574b86775576fa24006bcb3128c6e2cbf3c34e" +checksum = "bc5ddf1dc70287dc9a2f953766a1fe15e3e74aef02fd1335f2afa475c9b4f4fc" dependencies = [ "aws-credential-types", "aws-runtime", @@ -514,9 +514,9 @@ dependencies = [ [[package]] name = "aws-sigv4" -version = "1.2.7" +version = "1.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "690118821e46967b3c4501d67d7d52dd75106a9c54cf36cefa1985cedbe94e05" +checksum = "0bc5bbd1e4a2648fd8c5982af03935972c24a2f9846b396de661d351ee3ce837" dependencies = [ "aws-credential-types", "aws-smithy-eventstream", @@ -670,9 +670,9 @@ dependencies = [ [[package]] name = "aws-smithy-types" -version = "1.2.12" +version = "1.2.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a28f6feb647fb5e0d5b50f0472c19a7db9462b74e2fec01bb0b44eedcc834e97" +checksum = "c7b8a53819e42f10d0821f56da995e1470b199686a1809168db6ca485665f042" dependencies = [ "base64-simd", "bytes", @@ -705,9 +705,9 @@ dependencies = [ [[package]] name = "aws-types" -version = "1.3.4" +version = "1.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0df5a18c4f951c645300d365fec53a61418bcf4650f604f85fe2a665bfaa0c2" +checksum = "dfbd0a668309ec1f66c0f6bda4840dd6d4796ae26d699ebc266d7cc95c6d040f" dependencies = [ "aws-credential-types", "aws-smithy-async", From 44b905d14b03d141fa94e0dfb0e1eae06c437ae6 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 6 Feb 2025 19:21:38 +0000 Subject: [PATCH 397/583] Fix remote extension lookup (#10708) when library name doesn't match extension name. 
The bug was introduced by recent commit ebc55e6a --- libs/compute_api/src/spec.rs | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 2fc95c47c6..767a34bcbc 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -207,11 +207,11 @@ impl RemoteExtSpec { if !self .public_extensions .as_ref() - .is_some_and(|exts| exts.iter().any(|e| e == ext_name)) + .is_some_and(|exts| exts.iter().any(|e| e == real_ext_name)) && !self .custom_extensions .as_ref() - .is_some_and(|exts| exts.iter().any(|e| e == ext_name)) + .is_some_and(|exts| exts.iter().any(|e| e == real_ext_name)) { return Err(anyhow::anyhow!("extension {} is not found", real_ext_name)); } @@ -414,7 +414,7 @@ mod tests { "public_extensions": ["ext"], "custom_extensions": [], "library_index": { - "ext": "ext" + "extlib": "ext", }, "extension_data": { "ext": { @@ -430,6 +430,12 @@ mod tests { rspec .get_ext("ext", false, "latest", "v17") .expect("Extension should be found"); + + // test library index for the case when library name + // doesn't match the extension name + rspec + .get_ext("extlib", true, "latest", "v17") + .expect("Library should be found"); } #[test] From e73d681a0e05d055de93e8d72762996bb746f440 Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Thu, 6 Feb 2025 21:21:18 +0100 Subject: [PATCH 398/583] Patch pgcopydb and fix another segfault (#10706) ## Problem Found another pgcopydb segfault in error handling ```bash 2025-02-06 15:30:40.112 51299 ERROR pgsql.c:2330 [TARGET -738302813] FATAL: terminating connection due to administrator command 2025-02-06 15:30:40.112 51298 ERROR pgsql.c:2330 [TARGET -1407749748] FATAL: terminating connection due to administrator command 2025-02-06 15:30:40.112 51297 ERROR pgsql.c:2330 [TARGET -2073308066] FATAL: terminating connection due to administrator command 2025-02-06 15:30:40.112 51300 ERROR pgsql.c:2330 [TARGET 1220908650] FATAL: terminating connection due to administrator command 2025-02-06 15:30:40.432 51300 ERROR pgsql.c:2536 [Postgres] FATAL: terminating connection due to administrator command 2025-02-06 15:30:40.513 51290 ERROR copydb.c:773 Sub-process 51300 exited with code 0 and signal Segmentation fault 2025-02-06 15:30:40.578 51299 ERROR pgsql.c:2536 [Postgres] FATAL: terminating connection due to administrator command 2025-02-06 15:30:40.613 51290 ERROR copydb.c:773 Sub-process 51299 exited with code 0 and signal Segmentation fault 2025-02-06 15:30:41.253 51298 ERROR pgsql.c:2536 [Postgres] FATAL: terminating connection due to administrator command 2025-02-06 15:30:41.314 51290 ERROR copydb.c:773 Sub-process 51298 exited with code 0 and signal Segmentation fault 2025-02-06 15:30:43.133 51297 ERROR pgsql.c:2536 [Postgres] FATAL: terminating connection due to administrator command 2025-02-06 15:30:43.215 51290 ERROR copydb.c:773 Sub-process 51297 exited with code 0 and signal Segmentation fault 2025-02-06 15:30:43.215 51290 ERROR indexes.c:123 Some INDEX worker process(es) have exited with error, see above for details 2025-02-06 15:30:43.215 51290 ERROR indexes.c:59 Failed to create indexes, see above for details 2025-02-06 15:30:43.232 51271 ERROR copydb.c:768 Sub-process 51290 exited with code 12 ``` ```bashadmin@ip-172-31-38-164:~/pgcopydb$ gdb /usr/local/pgsql/bin/pgcopydb core GNU gdb (Debian 13.1-3) 13.1 Copyright (C) 2023 Free Software Foundation, Inc. 
License GPLv3+: GNU GPL version 3 or later This is free software: you are free to change and redistribute it. There is NO WARRANTY, to the extent permitted by law. Type "show copying" and "show warranty" for details. This GDB was configured as "aarch64-linux-gnu". Type "show configuration" for configuration details. For bug reporting instructions, please see: . Find the GDB manual and other documentation resources online at: . For help, type "help". Type "apropos word" to search for commands related to "word"... Reading symbols from /usr/local/pgsql/bin/pgcopydb... [New LWP 51297] [Thread debugging using libthread_db enabled] Using host libthread_db library "/lib/aarch64-linux-gnu/libthread_db.so.1". Core was generated by `pgcopydb: create index ocr.ocr_pipeline_step_results_version_pkey '. Program terminated with signal SIGSEGV, Segmentation fault. #0 0x0000aaaac3a4b030 in splitLines (lbuf=lbuf@entry=0xffffd8b86930, buffer=) at string_utils.c:630 630 *newLinePtr = '\0'; (gdb) bt #0 0x0000aaaac3a4b030 in splitLines (lbuf=lbuf@entry=0xffffd8b86930, buffer=) at string_utils.c:630 #1 0x0000aaaac3a3a678 in pgsql_execute_log_error (pgsql=pgsql@entry=0xffffd8b87040, result=result@entry=0x0, sql=sql@entry=0xffff81fe9be0 "CREATE UNIQUE INDEX IF NOT EXISTS ocr_pipeline_step_results_version_pkey ON ocr.ocr_pipeline_step_results_version USING btree (id, transaction_id);", debugParameters=debugParameters@entry=0xaaaaec5f92f0, context=context@entry=0x0) at pgsql.c:2322 #2 0x0000aaaac3a3bbec in pgsql_execute_with_params (pgsql=pgsql@entry=0xffffd8b87040, sql=0xffff81fe9be0 "CREATE UNIQUE INDEX IF NOT EXISTS ocr_pipeline_step_results_version_pkey ON ocr.ocr_pipeline_step_results_version USING btree (id, transaction_id);", paramCount=paramCount@entry=0, paramTypes=paramTypes@entry=0x0, paramValues=paramValues@entry=0x0, context=context@entry=0x0, parseFun=parseFun@entry=0x0) at pgsql.c:1649 #3 0x0000aaaac3a3c468 in pgsql_execute (pgsql=pgsql@entry=0xffffd8b87040, sql=) at pgsql.c:1522 #4 0x0000aaaac3a245f4 in copydb_create_index (specs=specs@entry=0xffffd8b8ec98, dst=dst@entry=0xffffd8b87040, index=index@entry=0xffff81f71800, ifNotExists=) at indexes.c:846 #5 0x0000aaaac3a24ca8 in copydb_create_index_by_oid (specs=specs@entry=0xffffd8b8ec98, dst=dst@entry=0xffffd8b87040, indexOid=) at indexes.c:410 #6 0x0000aaaac3a25040 in copydb_index_worker (specs=specs@entry=0xffffd8b8ec98) at indexes.c:297 #7 0x0000aaaac3a25238 in copydb_start_index_workers (specs=specs@entry=0xffffd8b8ec98) at indexes.c:209 #8 0x0000aaaac3a252f4 in copydb_index_supervisor (specs=specs@entry=0xffffd8b8ec98) at indexes.c:112 #9 0x0000aaaac3a253f4 in copydb_start_index_supervisor (specs=0xffffd8b8ec98) at indexes.c:57 #10 copydb_start_index_supervisor (specs=specs@entry=0xffffd8b8ec98) at indexes.c:34 #11 0x0000aaaac3a51ff4 in copydb_process_table_data (specs=specs@entry=0xffffd8b8ec98) at table-data.c:146 #12 0x0000aaaac3a520dc in copydb_copy_all_table_data (specs=specs@entry=0xffffd8b8ec98) at table-data.c:69 #13 0x0000aaaac3a0ccd8 in cloneDB (copySpecs=copySpecs@entry=0xffffd8b8ec98) at cli_clone_follow.c:602 #14 0x0000aaaac3a0d2cc in start_clone_process (pid=0xffffd8b743d8, copySpecs=0xffffd8b8ec98) at cli_clone_follow.c:502 #15 start_clone_process (copySpecs=copySpecs@entry=0xffffd8b8ec98, pid=pid@entry=0xffffd8b89788) at cli_clone_follow.c:482 #16 0x0000aaaac3a0d52c in cli_clone (argc=, argv=) at cli_clone_follow.c:164 #17 0x0000aaaac3a53850 in commandline_run (command=command@entry=0xffffd8b9eb88, argc=0, argc@entry=22, 
argv=0xffffd8b9edf8, argv@entry=0xffffd8b9ed48) at /home/admin/pgcopydb/src/bin/pgcopydb/../lib/subcommands.c/commandline.c:71 #18 0x0000aaaac3a01464 in main (argc=22, argv=0xffffd8b9ed48) at main.c:140 (gdb) ``` The problem is most likely that the following call returned a message in a read-only memory segment where we cannot replace \n with \0 in string_utils.c splitLines() function ```C char *message = PQerrorMessage(pgsql->connection); ``` ## Summary of changes modified the patch to also address this problem --- build_tools/patches/pgcopydbv017.patch | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/build_tools/patches/pgcopydbv017.patch b/build_tools/patches/pgcopydbv017.patch index c309d8fe59..4e68793afc 100644 --- a/build_tools/patches/pgcopydbv017.patch +++ b/build_tools/patches/pgcopydbv017.patch @@ -11,10 +11,30 @@ index d730b03..69a9be9 100644 }; diff --git a/src/bin/pgcopydb/pgsql.c b/src/bin/pgcopydb/pgsql.c -index 94f2f46..86b9448 100644 +index 94f2f46..e051ba8 100644 --- a/src/bin/pgcopydb/pgsql.c +++ b/src/bin/pgcopydb/pgsql.c -@@ -3174,11 +3174,18 @@ pgcopy_log_error(PGSQL *pgsql, PGresult *res, const char *context) +@@ -2319,6 +2319,11 @@ pgsql_execute_log_error(PGSQL *pgsql, + + LinesBuffer lbuf = { 0 }; + ++ if (message != NULL){ ++ // make sure message is writable by splitLines ++ message = strdup(message); ++ } ++ + if (!splitLines(&lbuf, message)) + { + /* errors have already been logged */ +@@ -2332,6 +2337,7 @@ pgsql_execute_log_error(PGSQL *pgsql, + PQbackendPID(pgsql->connection), + lbuf.lines[lineNumber]); + } ++ free(message); // free copy of message we created above + + if (pgsql->logSQL) + { +@@ -3174,11 +3180,18 @@ pgcopy_log_error(PGSQL *pgsql, PGresult *res, const char *context) /* errors have already been logged */ return; } From de0525841903db9d135137d5b4b6f3a2675a52ca Mon Sep 17 00:00:00 2001 From: Alexander Lakhin Date: Fri, 7 Feb 2025 10:56:39 +0200 Subject: [PATCH 399/583] Adjust diesel schema check for build with sanitizers (#10711) We need to disable the detection of memory leaks when running ``neon_local init` for build with sanitizers to avoid an error thrown by AddressSanitizer. --- .github/workflows/_build-and-test-locally.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 3a6fbf4234..a963452523 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -287,6 +287,7 @@ jobs: DATABASE_URL: postgresql://localhost:1235/storage_controller POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install run: | + export ASAN_OPTIONS=detect_leaks=0 /tmp/neon/bin/neon_local init /tmp/neon/bin/neon_local storage_controller start From b5a239c4ae25190bd563427518bc80195b1a444a Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 7 Feb 2025 14:20:49 +0300 Subject: [PATCH 400/583] Add reconciliation details to sk membership change rfc (#10514) ## Problem RFC pointed out the need of reconciliation, but wasn't detailed how it can be done. ## Summary of changes Add these details. 
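For readers who want the gist before diving into the diff: the new RFC section boils down to a per-safekeeper reconciler that drains `safekeeper_timeline_pending_ops`. A minimal sketch of that loop follows; every name, type, and helper in it is a hypothetical illustration rather than code from this patch, and the real implementation lives in storage_controller.

```rust
// Illustrative sketch only: names, types and helpers here are hypothetical,
// not code from this patch.
enum OpType {
    Include, // node is a member: seed the timeline from peers, then switch membership conf
    Exclude, // node is no longer a member: switch conf (404 is ok), local copy is dropped
    Delete,  // timeline was deleted: delete it on this safekeeper as well
}

struct PendingOp {
    timeline_id: String,
    generation: u32,
    op_type: OpType,
}

// Placeholder for the actual safekeeper HTTP calls (pull_timeline, membership
// switch, timeline deletion). Returns true when the op is known to have succeeded.
async fn apply(op: &PendingOp) -> bool {
    match op.op_type {
        OpType::Include => { /* pull_timeline from current members if missing, then switch conf */ }
        OpType::Exclude => { /* switch conf; the safekeeper drops the timeline locally */ }
        OpType::Delete => { /* delete the timeline */ }
    }
    true
}

// One pass of the per-safekeeper reconciler: apply each pending op and, on success,
// clear its row (rows with generation <= op.generation) from
// safekeeper_timeline_pending_ops. Failed ops stay in the table and are retried on
// the next wakeup or periodic scan.
async fn reconcile_safekeeper(pending: Vec<PendingOp>) {
    for op in pending {
        if apply(&op).await {
            // delete the pending row here; for Delete also remove the `timelines`
            // row once nothing is left for that timeline
        }
    }
}
```

Materializing the pending ops in the database, rather than recomputing them from safekeeper state at startup, is what lets such a loop survive controller restarts and keeps outstanding work visible directly in the source of truth.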
--- ...35-safekeeper-dynamic-membership-change.md | 215 ++++++++++++++---- 1 file changed, 166 insertions(+), 49 deletions(-) diff --git a/docs/rfcs/035-safekeeper-dynamic-membership-change.md b/docs/rfcs/035-safekeeper-dynamic-membership-change.md index cea9af34ab..9b320c7285 100644 --- a/docs/rfcs/035-safekeeper-dynamic-membership-change.md +++ b/docs/rfcs/035-safekeeper-dynamic-membership-change.md @@ -285,10 +285,10 @@ To summarize, list of cplane changes: ### storage_controller implementation -Current 'load everything on startup and keep in memory' easy design is fine. -Single timeline shouldn't take more than 100 bytes (it's 16 byte tenant_id, 16 -byte timeline_id, int generation, vec of ~3 safekeeper ids plus some flags), so -10^6 of timelines shouldn't take more than 100MB. +If desired, we may continue using current 'load everything on startup and keep +in memory' approach: single timeline shouldn't take more than 100 bytes (it's 16 +byte tenant_id, 16 byte timeline_id, int generation, vec of ~3 safekeeper ids +plus some flags), so 10^6 of timelines shouldn't take more than 100MB. Similar to pageserver attachment Intents storage_controller would have in-memory `MigrationRequest` (or its absense) for each timeline and pool of tasks trying @@ -296,7 +296,7 @@ to make these request reality; this ensures one instance of storage_controller won't do several migrations on the same timeline concurrently. In the first version it is simpler to have more manual control and no retries, i.e. migration failure removes the request. Later we can build retries and automatic -scheduling/migration. `MigrationRequest` is +scheduling/migration around. `MigrationRequest` is ``` enum MigrationRequest { To(Vec), @@ -313,9 +313,9 @@ similarly, in the first version it is ok to trigger it manually). #### Schema `safekeepers` table mirroring current `nodes` should be added, except that for -`scheduling_policy` field (seems like `status` is a better name for it): it is enough -to have at least in the beginning only 3 fields: 1) `active` 2) `offline` 3) -`decomissioned`. +`scheduling_policy`: it is enough to have at least in the beginning only 3 +fields: 1) `active` 2) `paused` (initially means only not assign new tlis there +3) `decomissioned` (node is removed). `timelines` table: ``` @@ -324,18 +324,24 @@ table! { timelines (tenant_id, timeline_id) { timeline_id -> Varchar, tenant_id -> Varchar, + start_lsn -> pg_lsn, generation -> Int4, sk_set -> Array, // list of safekeeper ids - new_sk_set -> Nullable>, // list of safekeeper ids, null if not joint conf + new_sk_set -> Nullable>, // list of safekeeper ids, null if not joint conf cplane_notified_generation -> Int4, + deleted_at -> Nullable, } } ``` +`start_lsn` is needed to create timeline on safekeepers properly, see below. We +might also want to add ancestor_timeline_id to preserve the hierarchy, but for +this RFC it is not needed. + #### API Node management is similar to pageserver: -1) POST `/control/v1/safekeepers` upserts safekeeper. +1) POST `/control/v1/safekeepers` inserts safekeeper. 2) GET `/control/v1/safekeepers` lists safekeepers. 3) GET `/control/v1/safekeepers/:node_id` gets safekeeper. 4) PUT `/control/v1/safekepers/:node_id/status` changes status to e.g. @@ -345,25 +351,15 @@ Node management is similar to pageserver: Safekeeper deploy scripts should register safekeeper at storage_contorller as they currently do with cplane, under the same id. 
-Timeline creation/deletion: already existing POST `tenant/:tenant_id/timeline` -would 1) choose initial set of safekeepers; 2) write to the db initial -`Configuration` with `INSERT ON CONFLICT DO NOTHING` returning existing row in -case of conflict; 3) create timeline on the majority of safekeepers (already -created is ok). +Timeline creation/deletion will work through already existing POST and DELETE +`tenant/:tenant_id/timeline`. Cplane is expected to retry both until they +succeed. See next section on the implementation details. -We don't want to block timeline creation when one safekeeper is down. Currently -this is solved by compute implicitly creating timeline on any safekeeper it is -connected to. This creates ugly timeline state on safekeeper when timeline is -created, but start LSN is not defined yet. It would be nice to remove this; to -do that, controller can in the background retry to create timeline on -safekeeper(s) which missed that during initial creation call. It can do that -through `pull_timeline` from majority so it doesn't need to remember -`parent_lsn` in its db. - -Timeline deletion removes the row from the db and forwards deletion to the -current configuration members. Without additional actions deletions might leak, -see below on this; initially let's ignore these, reporting to cplane success if -at least one safekeeper deleted the timeline (this will remove s3 data). +We don't want to block timeline creation/deletion when one safekeeper is down. +Currently this is crutched by compute implicitly creating timeline on any +safekeeper it is connected to. This creates ugly timeline state on safekeeper +when timeline is created, but start LSN is not defined yet. Next section +describes dealing with this. Tenant deletion repeats timeline deletion for all timelines. @@ -395,26 +391,6 @@ Similar call should be added for the tenant. It would be great to have some way of subscribing to the results (apart from looking at logs/metrics). -Migration is executed as described above. One subtlety is that (local) deletion on -source safekeeper might fail, which is not a problem if we are going to -decomission the node but leaves garbage otherwise. I'd propose in the first version -1) Don't attempt deletion at all if node status is `offline`. -2) If it failed, just issue warning. -And add PUT `/control/v1/safekeepers/:node_id/scrub` endpoint which would find and -remove garbage timelines for manual use. It will 1) list all timelines on the -safekeeper 2) compare each one against configuration storage: if timeline -doesn't exist at all (had been deleted), it can be deleted. Otherwise, it can -be deleted under generation number if node is not member of current generation. - -Automating this is untrivial; we'd need to register all potential missing -deletions in the same transaction -which switches configurations. Similarly when timeline is fully deleted to -prevent cplane operation from blocking when some safekeeper is not available -deletion should be also registered. - -One more task pool should infinitely retry notifying control plane about changed -safekeeper sets. - 3) GET `/control/v1/tenant/:tenant_id/timeline/:timeline_id/` should return current in memory state of the timeline and pending `MigrationRequest`, if any. @@ -423,12 +399,153 @@ safekeeper sets. migration by switching configuration from the joint to the one with (previous) `sk_set` under CAS (incrementing generation as always). 
+#### API implementation and reconciliation + +For timeline creation/deletion we want to preserve the basic assumption that +unreachable minority (1 sk of 3) doesn't block their completion, but eventually +we want to finish creation/deletion on nodes which missed it (unless they are +removed). Similarly for migration; it may and should finish even though excluded +members missed their exclusion. And of course e.g. such pending exclusion on +node C after migration ABC -> ABD must not prevent next migration ABD -> ABE. As +another example, if some node missed timeline creation it clearly must not block +migration from it. Hence it is natural to have per safekeeper background +reconciler which retries these ops until they succeed. There are 3 possible +operation types, and the type is defined by timeline state (membership +configuration and whether it is deleted) and safekeeper id: we may need to +create timeline on sk (node added), locally delete it (node excluded, somewhat +similar to detach) or globally delete it (timeline is deleted). + +Next, on storage controller restart in principle these pending operations can be +figured out by comparing safekeepers state against storcon state. But it seems +better to me to materialize them in the database; it is not expensive, avoids +these startup scans which themselves can fail etc and makes it very easy to see +outstanding work directly at the source of truth -- the db. So we can add table +`safekeeper_timeline_pending_ops` +``` +table! { + // timeline_id, sk_id is primary key + safekeeper_timeline_pending_ops (sk_id, tenant_id, timeline_id) { + sk_id -> int8, + tenant_id -> Varchar, + timeline_id -> Varchar, + generation -> Int4, + op_type -> Varchar, + } +} +``` + +`op_type` can be `include` (seed from peers and ensure generation is up to +date), `exclude` (remove locally) and `delete`. Field is actually not strictly +needed as it can be computed from current configuration, but gives more explicit +observability. + +`generation` is necessary there because after op is done reconciler must remove +it and not remove another row with higher gen which in theory might appear. + +Any insert of row should overwrite (remove) all rows with the same sk and +timeline id but lower `generation` as next op makes previous obsolete. Insertion +of `op_type` `delete` overwrites all rows. + +About `exclude`: rather than adding explicit safekeeper http endpoint, it is +reasonable to reuse membership switch endpoint: if safekeeper is not member +of the configuration it locally removes the timeline on the switch. In this case +404 should also be considered an 'ok' answer by the caller. + +So, main loop of per sk reconcile reads `safekeeper_timeline_pending_ops` +joined with timeline configuration to get current conf (with generation `n`) +for the safekeeper and does the jobs, infinitely retrying failures: +1) If node is member (`include`): + - Check if timeline exists on it, if not, call pull_timeline on it from + other members + - Call switch configuration to the current +2) If node is not member (`exclude`): + - Call switch configuration to the current, 404 is ok. +3) If timeline is deleted (`delete`), call delete. + +In cases 1 and 2 remove `safekeeper_timeline_pending_ops` for the sk and +timeline with generation <= `n` if `op_type` is not `delete`. +In case 3 also remove `safekeeper_timeline_pending_ops` +entry + remove `timelines` entry if there is nothing left in `safekeeper_timeline_pending_ops` for the timeline. 
+ +Let's consider in details how APIs can be implemented from this angle. + +Timeline creation. It is assumed that cplane retries it until success, so all +actions must be idempotent. Now, a tricky point here is timeline start LSN. For +the initial (tenant creation) call cplane doesn't know it. However, setting +start_lsn on safekeepers during creation is a good thing -- it provides a +guarantee that walproposer can always find a common point in WAL histories of +safekeeper and its own, and so absense of it would be a clear sign of +corruption. The following sequence works: +1) Create timeline (or observe that it exists) on pageserver, + figuring out last_record_lsn in response. +2) Choose safekeepers and insert (ON CONFLICT DO NOTHING) timeline row into the + db. Note that last_record_lsn returned on the previous step is movable as it + changes once ingestion starts, insert must not overwrite it (as well as other + fields like membership conf). On the contrary, start_lsn used in the next + step must be set to the value in the db. cplane_notified_generation can be set + to 1 (initial generation) in insert to avoid notifying cplane about initial + conf as cplane will receive it in timeline creation request anyway. +3) Issue timeline creation calls to at least majority of safekeepers. Using + majority here is not necessary but handy because it guarantees that any live + majority will have at least one sk with created timeline and so + reconciliation task can use pull_timeline shared with migration instead of + create timeline special init case. OFC if timeline is already exists call is + ignored. +4) For minority of safekeepers which could have missed creation insert + entries to `safekeeper_timeline_pending_ops`. We won't miss this insertion + because response to cplane is sent only after it has happened, and cplane + retries the call until 200 response. + + There is a small question how request handler (timeline creation in this + case) would interact with per sk reconciler. As always I prefer to do the + simplest possible thing and here it seems to be just waking it up so it + re-reads the db for work to do. Passing work in memory is faster, but + that shouldn't matter, and path to scan db for work will exist anyway, + simpler to reuse it. + +For pg version / wal segment size: while we may persist them in `timelines` +table, it is not necessary as initial creation at step 3 can take them from +pageserver or cplane creation call and later pull_timeline will carry them +around. + +Timeline migration. +1) CAS to the db to create joint conf, and in the same transaction create + `safekeeper_timeline_pending_ops` `include` entries to initialize new members + as well as deliver this conf to current ones; poke per sk reconcilers to work + on it. Also any conf change should also poke cplane notifier task(s). +2) Once it becomes possible per alg description above, get out of joint conf + with another CAS. Task should get wakeups from per sk reconcilers because + conf switch is required for advancement; however retries should be sleep + based as well as LSN advancement might be needed, though in happy path + it isn't. To see whether further transition is possible on wakup migration + executor polls safekeepers per the algorithm. CAS creating new conf with only + new members should again insert entries to `safekeeper_timeline_pending_ops` + to switch them there, as well as `exclude` rows to remove timeline from + old members. 
+ +Timeline deletion: just set `deleted_at` on the timeline row and insert +`safekeeper_timeline_pending_ops` entries in the same xact, the rest is done by +per sk reconcilers. + +When node is removed (set to `decomissioned`), `safekeeper_timeline_pending_ops` +for it must be cleared in the same transaction. + +One more task pool should infinitely retry notifying control plane about changed +safekeeper sets (trying making `cplane_notified_generation` equal `generation`). + #### Dealing with multiple instances of storage_controller Operations described above executed concurrently might create some errors but do not prevent progress, so while we normally don't want to run multiple instances of storage_controller it is fine to have it temporarily, e.g. during redeploy. +To harden against some controller instance creating some work in +`safekeeper_timeline_pending_ops` and then disappearing without anyone pickup up +the job per sk reconcilers apart from explicit wakups should scan for work +periodically. It is possible to remove that though if all db updates are +protected with leadership token/term -- then such scans are needed only after +leadership is acquired. + Any interactions with db update in-memory controller state, e.g. if migration request failed because different one is in progress, controller remembers that and tries to finish it. @@ -545,7 +662,7 @@ Aurora does this but similarly I don't think this is needed. We should use Compute <-> safekeeper protocol change to include other (long yearned) modifications: -- send data in network order to make arm work. +- send data in network order without putting whole structs to be arch independent - remove term_start_lsn from AppendRequest - add horizon to TermHistory - add to ProposerGreeting number of connection from this wp to sk From 8f651f958278b043a2d46aed9b1722430e452373 Mon Sep 17 00:00:00 2001 From: Fedor Dikarev Date: Fri, 7 Feb 2025 13:25:16 +0100 Subject: [PATCH 401/583] switch from localtest.me to local.neon.build (#10714) ## Problem Ref: https://github.com/neondatabase/neon/issues/10632 We use dns named `*.localtest.me` in our test, and that domain is well-known and widely used for that, with all the records there resolve to the localhost, both IPv4 and IPv6: `127.0.0.1` and `::1` In some cases on our runners these addresses resolves only to `IPv6`, and so components fail to connect when runner doesn't have `IPv6` address. We suspect issue in systemd-resolved here (https://github.com/systemd/systemd/issues/17745) To workaround that and improve test stability, we introduced our own domain `*.local.neon.build` with IPv4 address `127.0.0.1` only See full details and troubleshoot log in referred issue. p.s. If you're FritzBox user, don't forget to add that domain `local.neon.build` to the `DNS Rebind Protection` section under `Home Network -> Network -> Network Settings`, otherwise FritzBox will block addresses, resolving to the local addresses. For other devices/vendors, please check corresponding documentation, if resolving `local.neon.build` will produce empty answer for you. 
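A quick way to see the resolution difference on a given runner is to resolve one host under each wildcard domain and compare the returned addresses. This is only a sketch, not part of this change; the endpoint host names and port below are made-up examples.

```rust
// Quick check of what the wildcard test domains resolve to on this machine.
// Host names and port are illustrative only.
use std::net::ToSocketAddrs;

fn main() {
    for host in ["endpoint.localtest.me:4444", "endpoint.local.neon.build:4444"] {
        match host.to_socket_addrs() {
            Ok(addrs) => {
                let addrs: Vec<_> = addrs.collect();
                // `*.localtest.me` may return 127.0.0.1 and ::1 (or only ::1 on affected
                // runners); `*.local.neon.build` is expected to return 127.0.0.1 only.
                println!("{host} -> {addrs:?}");
            }
            Err(e) => println!("{host} -> resolution failed: {e}"),
        }
    }
}
```

On an affected machine the first entry may come back with only `::1`, while the second should always resolve to `127.0.0.1`.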
## Summary of changes Replace all the occurrences of `localtest.me` with `local.neon.build` --- proxy/README.md | 10 +++++----- test_runner/fixtures/neon_fixtures.py | 8 ++++---- test_runner/regress/test_proxy.py | 4 ++-- test_runner/regress/test_proxy_allowed_ips.py | 4 ++-- test_runner/regress/test_sni_router.py | 6 +++--- test_runner/websocket_tunnel.py | 4 ++-- 6 files changed, 18 insertions(+), 18 deletions(-) diff --git a/proxy/README.md b/proxy/README.md index ecd54fbbd8..1156bfd352 100644 --- a/proxy/README.md +++ b/proxy/README.md @@ -37,8 +37,8 @@ To play with it locally one may start proxy over a local postgres installation If both postgres and proxy are running you may send a SQL query: ```console -curl -k -X POST 'https://proxy.localtest.me:4444/sql' \ - -H 'Neon-Connection-String: postgres://stas:pass@proxy.localtest.me:4444/postgres' \ +curl -k -X POST 'https://proxy.local.neon.build:4444/sql' \ + -H 'Neon-Connection-String: postgres://stas:pass@proxy.local.neon.build:4444/postgres' \ -H 'Content-Type: application/json' \ --data '{ "query":"SELECT $1::int[] as arr, $2::jsonb as obj, 42 as num", @@ -104,7 +104,7 @@ cases where it is hard to use rows represented as objects (e.g. when several fie ## Test proxy locally -Proxy determines project name from the subdomain, request to the `round-rice-566201.somedomain.tld` will be routed to the project named `round-rice-566201`. Unfortunately, `/etc/hosts` does not support domain wildcards, so we can use *.localtest.me` which resolves to `127.0.0.1`. +Proxy determines project name from the subdomain, request to the `round-rice-566201.somedomain.tld` will be routed to the project named `round-rice-566201`. Unfortunately, `/etc/hosts` does not support domain wildcards, so we can use *.local.neon.build` which resolves to `127.0.0.1`. We will need to have a postgres instance. 
Assuming that we have set up docker we can set it up as follows: ```sh @@ -125,7 +125,7 @@ docker exec -it proxy-postgres psql -U postgres -c "CREATE ROLE proxy WITH SUPER Let's create self-signed certificate by running: ```sh -openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key -subj "/CN=*.localtest.me" +openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key -subj "/CN=*.local.neon.build" ``` Then we need to build proxy with 'testing' feature and run, e.g.: @@ -136,5 +136,5 @@ RUST_LOG=proxy cargo run -p proxy --bin proxy --features testing -- --auth-backe Now from client you can start a new session: ```sh -PGSSLROOTCERT=./server.crt psql "postgresql://proxy:password@endpoint.localtest.me:4432/postgres?sslmode=verify-full" +PGSSLROOTCERT=./server.crt psql "postgresql://proxy:password@endpoint.local.neon.build:4432/postgres?sslmode=verify-full" ``` diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 7c4991ffab..690e5cdcc4 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3345,7 +3345,7 @@ class NeonProxy(PgProtocol): metric_collection_interval: str | None = None, ): host = "127.0.0.1" - domain = "proxy.localtest.me" # resolves to 127.0.0.1 + domain = "proxy.local.neon.build" # resolves to 127.0.0.1 super().__init__(dsn=auth_backend.default_conn_url, host=domain, port=proxy_port) self.domain = domain @@ -3368,7 +3368,7 @@ class NeonProxy(PgProtocol): # generate key of it doesn't exist crt_path = self.test_output_dir / "proxy.crt" key_path = self.test_output_dir / "proxy.key" - generate_proxy_tls_certs("*.localtest.me", key_path, crt_path) + generate_proxy_tls_certs("*.local.neon.build", key_path, crt_path) args = [ str(self.neon_binpath / "proxy"), @@ -3569,7 +3569,7 @@ class NeonAuthBroker: external_http_port: int, auth_backend: NeonAuthBroker.ProxyV1, ): - self.domain = "apiauth.localtest.me" # resolves to 127.0.0.1 + self.domain = "apiauth.local.neon.build" # resolves to 127.0.0.1 self.host = "127.0.0.1" self.http_port = http_port self.external_http_port = external_http_port @@ -3586,7 +3586,7 @@ class NeonAuthBroker: # generate key of it doesn't exist crt_path = self.test_output_dir / "proxy.crt" key_path = self.test_output_dir / "proxy.key" - generate_proxy_tls_certs("apiauth.localtest.me", key_path, crt_path) + generate_proxy_tls_certs("apiauth.local.neon.build", key_path, crt_path) args = [ str(self.neon_binpath / "proxy"), diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py index d8df2efc78..3c7fd0b897 100644 --- a/test_runner/regress/test_proxy.py +++ b/test_runner/regress/test_proxy.py @@ -57,7 +57,7 @@ def test_proxy_select_1(static_proxy: NeonProxy): assert out[0][0] == 1 # with SNI - out = static_proxy.safe_psql("select 42", host="generic-project-name.localtest.me") + out = static_proxy.safe_psql("select 42", host="generic-project-name.local.neon.build") assert out[0][0] == 42 @@ -234,7 +234,7 @@ def test_sql_over_http_serverless_driver(static_proxy: NeonProxy): connstr = f"postgresql://http:http@{static_proxy.domain}:{static_proxy.proxy_port}/postgres" response = requests.post( - f"https://api.localtest.me:{static_proxy.external_http_port}/sql", + f"https://api.local.neon.build:{static_proxy.external_http_port}/sql", data=json.dumps({"query": "select 42 as answer", "params": []}), headers={"Content-Type": "application/sql", "Neon-Connection-String": connstr}, verify=str(static_proxy.test_output_dir 
/ "proxy.crt"), diff --git a/test_runner/regress/test_proxy_allowed_ips.py b/test_runner/regress/test_proxy_allowed_ips.py index 902da1942e..c59da8c6b0 100644 --- a/test_runner/regress/test_proxy_allowed_ips.py +++ b/test_runner/regress/test_proxy_allowed_ips.py @@ -35,7 +35,7 @@ async def test_proxy_psql_allowed_ips(static_proxy: NeonProxy, vanilla_pg: Vanil check_cannot_connect(query="select 1", sslsni=0, options="endpoint=private-project") # with SNI - check_cannot_connect(query="select 1", host="private-project.localtest.me") + check_cannot_connect(query="select 1", host="private-project.local.neon.build") # no SNI, deprecated `options=project` syntax (before we had several endpoint in project) out = static_proxy.safe_psql(query="select 1", sslsni=0, options="project=generic-project") @@ -46,7 +46,7 @@ async def test_proxy_psql_allowed_ips(static_proxy: NeonProxy, vanilla_pg: Vanil assert out[0][0] == 1 # with SNI - out = static_proxy.safe_psql(query="select 1", host="generic-project.localtest.me") + out = static_proxy.safe_psql(query="select 1", host="generic-project.local.neon.build") assert out[0][0] == 1 diff --git a/test_runner/regress/test_sni_router.py b/test_runner/regress/test_sni_router.py index 2a26fef59a..3487542d6e 100644 --- a/test_runner/regress/test_sni_router.py +++ b/test_runner/regress/test_sni_router.py @@ -116,7 +116,7 @@ def test_pg_sni_router( test_output_dir: Path, ): generate_tls_cert( - "endpoint.namespace.localtest.me", + "endpoint.namespace.local.neon.build", test_output_dir / "router.crt", test_output_dir / "router.key", ) @@ -130,7 +130,7 @@ def test_pg_sni_router( with PgSniRouter( neon_binpath=neon_binpath, port=router_port, - destination="localtest.me", + destination="local.neon.build", tls_cert=test_output_dir / "router.crt", tls_key=test_output_dir / "router.key", test_output_dir=test_output_dir, @@ -141,7 +141,7 @@ def test_pg_sni_router( "select 1", dbname="postgres", sslmode="require", - host=f"endpoint--namespace--{pg_port}.localtest.me", + host=f"endpoint--namespace--{pg_port}.local.neon.build", hostaddr="127.0.0.1", ) assert out[0][0] == 1 diff --git a/test_runner/websocket_tunnel.py b/test_runner/websocket_tunnel.py index facdb19140..069852468d 100755 --- a/test_runner/websocket_tunnel.py +++ b/test_runner/websocket_tunnel.py @@ -13,12 +13,12 @@ # postgres -D data -p3000 # # ## Launch proxy with WSS enabled: -# openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key -subj '/CN=*.neon.localtest.me' +# openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key -subj '/CN=*.local.neon.build' # ./target/debug/proxy --wss 127.0.0.1:40433 --http 127.0.0.1:28080 --mgmt 127.0.0.1:9099 --proxy 127.0.0.1:4433 --tls-key server.key --tls-cert server.crt --auth-backend postgres # # ## Launch the tunnel: # -# poetry run ./test_runner/websocket_tunnel.py --ws-port 40433 --ws-url "wss://ep-test.neon.localtest.me" +# poetry run ./test_runner/websocket_tunnel.py --ws-port 40433 --ws-url "wss://ep-test.local.neon.build" # # ## Now you can connect with psql: # psql "postgresql://heikki@localhost:40433/postgres" From 08f92bb916bd38045ff9c4a18b04f069b827c9a0 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 7 Feb 2025 13:03:01 +0000 Subject: [PATCH 402/583] pageserver: clean up DeletionQueue push_layers_sync (#10701) ## Problem This is tech debt. 
While we introduced generations for tenants, some legacy situations without generations needed to delete things inline (async operation) instead of enqueing them (sync operation). ## Summary of changes - Remove the async code, replace calls with the sync variant, and assert that the generation is always set --- pageserver/src/deletion_queue.rs | 109 ++++++------------ .../src/tenant/remote_timeline_client.rs | 3 +- test_runner/regress/test_compatibility.py | 8 ++ .../regress/test_pageserver_generations.py | 105 ----------------- 4 files changed, 45 insertions(+), 180 deletions(-) diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index 1d508f5fe9..a2395b0dca 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -8,7 +8,6 @@ use std::time::Duration; use crate::controller_upcall_client::ControlPlaneGenerationsApi; use crate::metrics; -use crate::tenant::remote_timeline_client::remote_layer_path; use crate::tenant::remote_timeline_client::remote_timeline_path; use crate::tenant::remote_timeline_client::LayerFileMetadata; use crate::virtual_file::MaybeFatalIo; @@ -463,45 +462,18 @@ impl DeletionQueueClient { /// /// The `current_generation` is the generation of this pageserver's current attachment. The /// generations in `layers` are the generations in which those layers were written. - pub(crate) async fn push_layers( + pub(crate) fn push_layers( &self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, current_generation: Generation, layers: Vec<(LayerName, LayerFileMetadata)>, ) -> Result<(), DeletionQueueError> { - if current_generation.is_none() { - debug!("Enqueuing deletions in legacy mode, skipping queue"); + // None generations are not valid for attached tenants: they must always be attached in + // a known generation. None generations are still permitted for layers in the index because + // they may be historical. + assert!(!current_generation.is_none()); - let mut layer_paths = Vec::new(); - for (layer, meta) in layers { - layer_paths.push(remote_layer_path( - &tenant_shard_id.tenant_id, - &timeline_id, - meta.shard, - &layer, - meta.generation, - )); - } - self.push_immediate(layer_paths).await?; - return self.flush_immediate().await; - } - - self.push_layers_sync(tenant_shard_id, timeline_id, current_generation, layers) - } - - /// When a Tenant has a generation, push_layers is always synchronous because - /// the ListValidator channel is an unbounded channel. 
- /// - /// This can be merged into push_layers when we remove the Generation-less mode - /// support (``) - pub(crate) fn push_layers_sync( - &self, - tenant_shard_id: TenantShardId, - timeline_id: TimelineId, - current_generation: Generation, - layers: Vec<(LayerName, LayerFileMetadata)>, - ) -> Result<(), DeletionQueueError> { metrics::DELETION_QUEUE .keys_submitted .inc_by(layers.len() as u64); @@ -957,14 +929,12 @@ mod test { // File should still be there after we push it to the queue (we haven't pushed enough to flush anything) info!("Pushing"); - client - .push_layers( - tenant_shard_id, - TIMELINE_ID, - now_generation, - [(layer_file_name_1.clone(), layer_metadata)].to_vec(), - ) - .await?; + client.push_layers( + tenant_shard_id, + TIMELINE_ID, + now_generation, + [(layer_file_name_1.clone(), layer_metadata)].to_vec(), + )?; assert_remote_files(&[&remote_layer_file_name_1], &remote_timeline_path); assert_local_files(&[], &deletion_prefix); @@ -1017,14 +987,12 @@ mod test { assert_remote_files(&[&remote_layer_name], &remote_timeline_path); tracing::debug!("Pushing..."); - client - .push_layers( - tenant_shard_id, - TIMELINE_ID, - stale_generation, - [(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(), - ) - .await?; + client.push_layers( + tenant_shard_id, + TIMELINE_ID, + stale_generation, + [(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(), + )?; // We enqueued the operation in a stale generation: it should have failed validation tracing::debug!("Flushing..."); @@ -1032,14 +1000,12 @@ mod test { assert_remote_files(&[&remote_layer_name], &remote_timeline_path); tracing::debug!("Pushing..."); - client - .push_layers( - tenant_shard_id, - TIMELINE_ID, - latest_generation, - [(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(), - ) - .await?; + client.push_layers( + tenant_shard_id, + TIMELINE_ID, + latest_generation, + [(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(), + )?; // We enqueued the operation in a fresh generation: it should have passed validation tracing::debug!("Flushing..."); @@ -1074,28 +1040,24 @@ mod test { // generation gets that treatment) let remote_layer_file_name_historical = ctx.write_remote_layer(EXAMPLE_LAYER_NAME, layer_generation)?; - client - .push_layers( - tenant_shard_id, - TIMELINE_ID, - now_generation.previous(), - [(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(), - ) - .await?; + client.push_layers( + tenant_shard_id, + TIMELINE_ID, + now_generation.previous(), + [(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(), + )?; // Inject a deletion in the generation before generation_now: after restart, // this deletion should get executed, because we execute deletions in the // immediately previous generation on the same node. 
let remote_layer_file_name_previous = ctx.write_remote_layer(EXAMPLE_LAYER_NAME_ALT, layer_generation)?; - client - .push_layers( - tenant_shard_id, - TIMELINE_ID, - now_generation, - [(EXAMPLE_LAYER_NAME_ALT.clone(), layer_metadata.clone())].to_vec(), - ) - .await?; + client.push_layers( + tenant_shard_id, + TIMELINE_ID, + now_generation, + [(EXAMPLE_LAYER_NAME_ALT.clone(), layer_metadata.clone())].to_vec(), + )?; client.flush().await?; assert_remote_files( @@ -1139,6 +1101,7 @@ pub(crate) mod mock { use tracing::info; use super::*; + use crate::tenant::remote_timeline_client::remote_layer_path; use std::sync::atomic::{AtomicUsize, Ordering}; pub struct ConsumerState { diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index ad6d8dfae8..713efbb9a4 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -517,7 +517,7 @@ impl RemoteTimelineClient { if let Ok(queue) = queue_locked.initialized_mut() { let blocked_deletions = std::mem::take(&mut queue.blocked_deletions); for d in blocked_deletions { - if let Err(e) = self.deletion_queue_client.push_layers_sync( + if let Err(e) = self.deletion_queue_client.push_layers( self.tenant_shard_id, self.timeline_id, self.generation, @@ -2151,7 +2151,6 @@ impl RemoteTimelineClient { self.generation, delete.layers.clone(), ) - .await .map_err(|e| anyhow::anyhow!(e)) } } diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index ba3078d493..823f2185e4 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -474,6 +474,14 @@ HISTORIC_DATA_SETS = [ PgVersion.V16, "https://neon-github-public-dev.s3.eu-central-1.amazonaws.com/compatibility-data-snapshots/2024-07-18-pgv16.tar.zst", ), + # This dataset created on a pageserver running modern code at time of capture, but configured with no generation. This + # is our regression test that we can load data written without generations in layer file names & indices + HistoricDataSet( + "2025-02-07-nogenerations", + TenantId("e1411ca6562d6ff62419f693a5695d67"), + PgVersion.V17, + "https://neon-github-public-dev.s3.eu-central-1.amazonaws.com/compatibility-data-snapshots/2025-02-07-pgv17-nogenerations.tar.zst", + ), ] diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 7e5bb45242..fa1cd61206 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -12,7 +12,6 @@ of the pageserver are: from __future__ import annotations import os -import re import time from enum import StrEnum @@ -29,7 +28,6 @@ from fixtures.pageserver.common_types import parse_layer_file_name from fixtures.pageserver.http import PageserverApiException from fixtures.pageserver.utils import ( assert_tenant_state, - list_prefix, wait_for_last_record_lsn, wait_for_upload, ) @@ -124,109 +122,6 @@ def assert_deletion_queue(ps_http, size_fn) -> None: assert size_fn(v) is True -def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): - """ - Validate behavior when a pageserver is run without generation support enabled, - then started again after activating it: - - Before upgrade, no objects should have generation suffixes - - After upgrade, the bucket should contain a mixture. - - In both cases, postgres I/O should work. 
- """ - neon_env_builder.enable_pageserver_remote_storage( - RemoteStorageKind.MOCK_S3, - ) - - env = neon_env_builder.init_configs() - env.broker.start() - for sk in env.safekeepers: - sk.start() - env.storage_controller.start() - - # We will start a pageserver with no control_plane_api set, so it won't be able to self-register - env.storage_controller.node_register(env.pageserver) - - def remove_control_plane_api_field(config): - return config.pop("control_plane_api") - - control_plane_api = env.pageserver.edit_config_toml(remove_control_plane_api_field) - env.pageserver.start() - env.storage_controller.node_configure(env.pageserver.id, {"availability": "Active"}) - - env.create_tenant( - tenant_id=env.initial_tenant, conf=TENANT_CONF, timeline_id=env.initial_timeline - ) - - generate_uploads_and_deletions(env, pageserver=env.pageserver) - - def parse_generation_suffix(key): - m = re.match(".+-([0-9a-zA-Z]{8})$", key) - if m is None: - return None - else: - log.info(f"match: {m}") - log.info(f"group: {m.group(1)}") - return int(m.group(1), 16) - - assert neon_env_builder.pageserver_remote_storage is not None - pre_upgrade_keys = list( - [ - o["Key"] - for o in list_prefix(neon_env_builder.pageserver_remote_storage, delimiter="")[ - "Contents" - ] - ] - ) - for key in pre_upgrade_keys: - assert parse_generation_suffix(key) is None - - env.pageserver.stop() - # Starting without the override that disabled control_plane_api - env.pageserver.patch_config_toml_nonrecursive( - { - "control_plane_api": control_plane_api, - } - ) - env.pageserver.start() - - generate_uploads_and_deletions(env, pageserver=env.pageserver, init=False) - - legacy_objects: list[str] = [] - suffixed_objects = [] - post_upgrade_keys = list( - [ - o["Key"] - for o in list_prefix(neon_env_builder.pageserver_remote_storage, delimiter="")[ - "Contents" - ] - ] - ) - for key in post_upgrade_keys: - log.info(f"post-upgrade key: {key}") - if parse_generation_suffix(key) is not None: - suffixed_objects.append(key) - else: - legacy_objects.append(key) - - # Bucket now contains a mixture of suffixed and non-suffixed objects - assert len(suffixed_objects) > 0 - assert len(legacy_objects) > 0 - - # Flush through deletions to get a clean state for scrub: we are implicitly validating - # that our generations-enabled pageserver was able to do deletions of layers - # from earlier which don't have a generation. - env.pageserver.http_client().deletion_queue_flush(execute=True) - - assert get_deletion_queue_unexpected_errors(env.pageserver.http_client()) == 0 - - # Having written a mixture of generation-aware and legacy index_part.json, - # ensure the scrubber handles the situation as expected. - healthy, metadata_summary = env.storage_scrubber.scan_metadata() - assert metadata_summary["tenant_count"] == 1 # Scrubber should have seen our timeline - assert metadata_summary["timeline_count"] == 1 - assert metadata_summary["timeline_shard_count"] == 1 - assert healthy - - def test_deferred_deletion(neon_env_builder: NeonEnvBuilder): neon_env_builder.enable_pageserver_remote_storage( RemoteStorageKind.MOCK_S3, From 95220ba43e54666e9a271f0ee9d53c6d976ca33c Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 7 Feb 2025 14:51:36 +0000 Subject: [PATCH 403/583] tests: fix flaky endpoint in test_ingest_logical_message (#10700) ## Problem Endpoint kept running while timeline was deleted, causing forbidden warnings on the pageserver when the tenant is not found. 
## Summary of changes - Explicitly stop the endpoint before the end of the test, so that it isn't trying to talk to the pageserver in the background while things are torn down --- test_runner/fixtures/neon_fixtures.py | 4 +++- .../performance/test_ingest_logical_message.py | 11 ++++++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 690e5cdcc4..3d3a445b97 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -5122,12 +5122,14 @@ def wait_for_last_flush_lsn( timeline: TimelineId, pageserver_id: int | None = None, auth_token: str | None = None, + last_flush_lsn: Lsn | None = None, ) -> Lsn: """Wait for pageserver to catch up the latest flush LSN, returns the last observed lsn.""" shards = tenant_get_shards(env, tenant, pageserver_id) - last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + if last_flush_lsn is None: + last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) results = [] for tenant_shard_id, pageserver in shards: diff --git a/test_runner/performance/test_ingest_logical_message.py b/test_runner/performance/test_ingest_logical_message.py index d3118eb15a..b55cb68b64 100644 --- a/test_runner/performance/test_ingest_logical_message.py +++ b/test_runner/performance/test_ingest_logical_message.py @@ -76,6 +76,9 @@ def test_ingest_logical_message( log.info("Waiting for Pageserver to catch up") wait_for_last_record_lsn(client, env.initial_tenant, env.initial_timeline, end_lsn) + recover_to_lsn = Lsn(endpoint.safe_psql("select pg_current_wal_lsn()")[0][0]) + endpoint.stop() + # Now that all data is ingested, delete and recreate the tenant in the pageserver. This will # reingest all the WAL from the safekeeper without any other constraints. This gives us a # baseline of how fast the pageserver can ingest this WAL in isolation. @@ -88,7 +91,13 @@ def test_ingest_logical_message( with zenbenchmark.record_duration("pageserver_recover_ingest"): log.info("Recovering WAL into pageserver") client.timeline_create(env.pg_version, env.initial_tenant, env.initial_timeline) - wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline) + wait_for_last_flush_lsn( + env, endpoint, env.initial_tenant, env.initial_timeline, last_flush_lsn=recover_to_lsn + ) + + # Check endpoint can start, i.e. we really recovered + endpoint.start() + wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline) # Emit metrics. wal_written_mb = round((end_lsn - start_lsn) / (1024 * 1024)) From f5243992fad52f3fa144d2d6e387ad1b3c7d6ace Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 7 Feb 2025 16:06:26 +0100 Subject: [PATCH 404/583] safekeeper: make timeline deletions a bit more verbose (#10721) Make timeline deletion print the sub-steps, so that we can narrow down some stuck timeline deletion issues we are observing. https://neondb.slack.com/archives/C08C2G15M6U/p1738930694716009 --- safekeeper/src/timeline.rs | 2 ++ safekeeper/src/timelines_global_map.rs | 2 ++ 2 files changed, 4 insertions(+) diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 5eb0bd7146..3702a096e0 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -592,6 +592,8 @@ impl Timeline { assert!(self.cancel.is_cancelled()); assert!(self.gate.close_complete()); + info!("deleting timeline {} from disk", self.ttid); + // Close associated FDs. 
Nobody will be able to touch timeline data once // it is cancelled, so WAL storage won't be opened again. shared_state.sk.close_wal_store(); diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs index 01c6aff6c3..1ff6a72bce 100644 --- a/safekeeper/src/timelines_global_map.rs +++ b/safekeeper/src/timelines_global_map.rs @@ -475,6 +475,8 @@ impl GlobalTimelines { info!("deleting timeline {}, only_local={}", ttid, only_local); timeline.shutdown().await; + info!("timeline {ttid} shut down for deletion"); + // Take a lock and finish the deletion holding this mutex. let mut shared_state = timeline.write_shared_state().await; From d6e87a3a9cfcaee7b4e37dd8c3aeef1e3f862cee Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Fri, 7 Feb 2025 16:11:31 +0100 Subject: [PATCH 405/583] pageserver: add separate, disabled compaction semaphore (#10716) ## Problem L0 compaction can get starved by other background tasks. It needs to be responsive to avoid read amp blowing up during heavy write workloads. Touches #10694. ## Summary of changes Add a separate semaphore for compaction, configurable via `use_compaction_semaphore` (disabled by default). This is primarily for testing in staging; it needs further work (in particular to split image/L0 compaction jobs) before it can be enabled. --- libs/pageserver_api/src/config.rs | 2 + pageserver/src/config.rs | 6 ++ pageserver/src/tenant/tasks.rs | 77 +++++++++++-------- pageserver/src/tenant/timeline.rs | 6 +- .../src/tenant/timeline/eviction_task.rs | 3 +- 5 files changed, 59 insertions(+), 35 deletions(-) diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index a0b5feea94..b806bd391c 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -94,6 +94,7 @@ pub struct ConfigToml { pub ondemand_download_behavior_treat_error_as_warn: bool, #[serde(with = "humantime_serde")] pub background_task_maximum_delay: Duration, + pub use_compaction_semaphore: bool, pub control_plane_api: Option, pub control_plane_api_token: Option, pub control_plane_emergency_mode: bool, @@ -470,6 +471,7 @@ impl Default for ConfigToml { DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY, ) .unwrap()), + use_compaction_semaphore: false, control_plane_api: (None), control_plane_api_token: (None), diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index ce480c70a0..3c86b73933 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -140,6 +140,10 @@ pub struct PageServerConf { /// not terrible. pub background_task_maximum_delay: Duration, + /// If true, use a separate semaphore for compaction tasks instead of the common background task + /// semaphore. Defaults to false. + pub use_compaction_semaphore: bool, + pub control_plane_api: Option, /// JWT token for use with the control plane API. @@ -332,6 +336,7 @@ impl PageServerConf { test_remote_failures, ondemand_download_behavior_treat_error_as_warn, background_task_maximum_delay, + use_compaction_semaphore, control_plane_api, control_plane_api_token, control_plane_emergency_mode, @@ -385,6 +390,7 @@ impl PageServerConf { test_remote_failures, ondemand_download_behavior_treat_error_as_warn, background_task_maximum_delay, + use_compaction_semaphore, control_plane_api, control_plane_emergency_mode, heatmap_upload_concurrency, diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 0f10dd7e10..d562f7b783 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -1,47 +1,56 @@ //! 
This module contains functions to serve per-tenant background processes, //! such as compaction and GC +use std::cmp::max; use std::ops::ControlFlow; use std::str::FromStr; use std::sync::{Arc, Mutex}; use std::time::{Duration, Instant}; +use once_cell::sync::Lazy; +use rand::Rng; +use tokio::sync::Semaphore; +use tokio_util::sync::CancellationToken; +use tracing::*; + use crate::context::{DownloadBehavior, RequestContext}; use crate::metrics::{BackgroundLoopSemaphoreMetricsRecorder, TENANT_TASK_EVENTS}; -use crate::task_mgr; -use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME}; +use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME, TOKIO_WORKER_THREADS}; use crate::tenant::throttle::Stats; use crate::tenant::timeline::compaction::CompactionOutcome; use crate::tenant::timeline::CompactionError; use crate::tenant::{Tenant, TenantState}; -use once_cell::sync::Lazy; -use rand::Rng; -use tokio_util::sync::CancellationToken; -use tracing::*; use utils::rate_limit::RateLimit; use utils::{backoff, completion, pausable_failpoint}; -static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy = - once_cell::sync::Lazy::new(|| { - let total_threads = task_mgr::TOKIO_WORKER_THREADS.get(); - let permits = usize::max( - 1, - // while a lot of the work is done on spawn_blocking, we still do - // repartitioning in the async context. this should give leave us some workers - // unblocked to be blocked on other work, hopefully easing any outside visible - // effects of restarts. - // - // 6/8 is a guess; previously we ran with unlimited 8 and more from - // spawn_blocking. - (total_threads * 3).checked_div(4).unwrap_or(0), - ); - assert_ne!(permits, 0, "we will not be adding in permits later"); - assert!( - permits < total_threads, - "need threads avail for shorter work" - ); - tokio::sync::Semaphore::new(permits) - }); +/// Semaphore limiting concurrent background tasks (across all tenants). +/// +/// We use 3/4 Tokio threads, to avoid blocking all threads in case we do any CPU-heavy work. +static CONCURRENT_BACKGROUND_TASKS: Lazy = Lazy::new(|| { + let total_threads = TOKIO_WORKER_THREADS.get(); + let permits = max(1, (total_threads * 3).checked_div(4).unwrap_or(0)); + assert_ne!(permits, 0, "we will not be adding in permits later"); + assert!(permits < total_threads, "need threads for other work"); + Semaphore::new(permits) +}); + +/// Semaphore limiting concurrent compaction tasks (across all tenants). This is disabled by +/// default, see `use_compaction_semaphore`. +/// +/// We use 3/4 Tokio threads, to avoid blocking all threads in case we do any CPU-heavy work. +/// +/// This is a separate semaphore from background tasks, because L0 compaction needs to be responsive +/// to avoid high read amp during heavy write workloads. +/// +/// TODO: split image compaction and L0 compaction, and move image compaction to background tasks. +/// Only L0 compaction needs to be responsive, and it shouldn't block on image compaction. +static CONCURRENT_COMPACTION_TASKS: Lazy = Lazy::new(|| { + let total_threads = TOKIO_WORKER_THREADS.get(); + let permits = max(1, (total_threads * 3).checked_div(4).unwrap_or(0)); + assert_ne!(permits, 0, "we will not be adding in permits later"); + assert!(permits < total_threads, "need threads for other work"); + Semaphore::new(permits) +}); #[derive( Debug, @@ -73,8 +82,9 @@ pub struct BackgroundLoopSemaphorePermit<'a> { /// Cancellation safe. 
pub(crate) async fn concurrent_background_tasks_rate_limit_permit( - loop_kind: BackgroundLoopKind, _ctx: &RequestContext, + loop_kind: BackgroundLoopKind, + use_compaction_semaphore: bool, ) -> BackgroundLoopSemaphorePermit<'static> { // TODO: use a lower threshold and remove the pacer once we resolve some blockage. const WARN_THRESHOLD: Duration = Duration::from_secs(600); @@ -88,10 +98,13 @@ pub(crate) async fn concurrent_background_tasks_rate_limit_permit( } // TODO: assert that we run on BACKGROUND_RUNTIME; requires tokio_unstable Handle::id(); - let permit = CONCURRENT_BACKGROUND_TASKS - .acquire() - .await - .expect("should never close"); + let permit = if loop_kind == BackgroundLoopKind::Compaction && use_compaction_semaphore { + CONCURRENT_COMPACTION_TASKS.acquire().await + } else { + assert!(!use_compaction_semaphore); + CONCURRENT_BACKGROUND_TASKS.acquire().await + } + .expect("should never close"); let waited = recorder.acquired(); if waited >= WARN_THRESHOLD { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 908356c459..770ea418d1 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1719,8 +1719,9 @@ impl Timeline { let guard = self.compaction_lock.lock().await; let permit = super::tasks::concurrent_background_tasks_rate_limit_permit( - BackgroundLoopKind::Compaction, ctx, + BackgroundLoopKind::Compaction, + self.conf.use_compaction_semaphore, ) .await; @@ -3057,8 +3058,9 @@ impl Timeline { let skip_concurrency_limiter = &skip_concurrency_limiter; async move { let wait_for_permit = super::tasks::concurrent_background_tasks_rate_limit_permit( - BackgroundLoopKind::InitialLogicalSizeCalculation, background_ctx, + BackgroundLoopKind::InitialLogicalSizeCalculation, + false, ); use crate::metrics::initial_logical_size::StartCircumstances; diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 9836aafecb..985329136e 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -335,8 +335,9 @@ impl Timeline { ctx: &RequestContext, ) -> ControlFlow<(), BackgroundLoopSemaphorePermit<'static>> { let acquire_permit = crate::tenant::tasks::concurrent_background_tasks_rate_limit_permit( - BackgroundLoopKind::Eviction, ctx, + BackgroundLoopKind::Eviction, + false, ); tokio::select! { From 9609f7547ea6aba294e3614aa047cbe5f209f15f Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 7 Feb 2025 15:29:34 +0000 Subject: [PATCH 406/583] tests: address warnings in timeline shutdown (#10702) ## Problem There are a couple of log warnings tripping up `test_timeline_archival_chaos` - `[stopping left-over name="timeline_delete" tenant_shard_id=2d526292b67dac0e6425266d7079c253 timeline_id=Some(44ba36bfdee5023672c93778985facd9) kind=TimelineDeletionWorker\n')](https://neon-github-public-dev.s3.amazonaws.com/reports/pr-10672/13161357302/index.html#/testresult/716b997bb1d8a021)` - `ignoring attempt to restart exited flush_loop 503d8f401d8887cfaae873040a6cc193/d5eed0673ba37d8992f7ec411363a7e3\n')` Related: https://github.com/neondatabase/neon/issues/10389 ## Summary of changes - Downgrade the 'ignoring attempt to restart' to info -- there's nothing in the design that forbids this happening, i.e. someone calling maybe_spawn_flush_loop concurrently with shutdown() - Prevent timeline deletion tasks outliving tenants by carrying a gateguard. 
This logically makes sense because the deletion process does call into Tenant to update manifests. --- pageserver/src/tenant/timeline.rs | 2 +- pageserver/src/tenant/timeline/delete.rs | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 770ea418d1..2be6fc1e59 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2633,7 +2633,7 @@ impl Timeline { return; } FlushLoopState::Exited => { - warn!( + info!( "ignoring attempt to restart exited flush_loop {}/{}", self.tenant_shard_id, self.timeline_id ); diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 3c828c8a9e..5eb2d3aa24 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -341,6 +341,13 @@ impl DeleteTimelineFlow { let tenant_shard_id = timeline.tenant_shard_id(); let timeline_id = timeline.timeline_id(); + // Take a tenant gate guard, because timeline deletion needs access to the tenant to update its manifest. + let Ok(tenant_guard) = tenant.gate.enter() else { + // It is safe to simply skip here, because we only schedule background work once the timeline is durably marked for deletion. + info!("Tenant is shutting down, timeline deletion will be resumed when it next starts"); + return; + }; + task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), TaskKind::TimelineDeletionWorker, @@ -348,6 +355,8 @@ impl DeleteTimelineFlow { Some(timeline_id), "timeline_delete", async move { + let _guard = tenant_guard; + if let Err(err) = Self::background(guard, conf, &tenant, &timeline, remote_client).await { // Only log as an error if it's not a cancellation. if matches!(err, DeleteTimelineError::Cancelled) { From 0abff59e97415f35622fededca178d0f93f6b96d Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 7 Feb 2025 18:03:01 +0200 Subject: [PATCH 407/583] compute: Allow postgres user to power off the VM (#10710) I plan to use this when launching a fast_import job in a VM. There's currently no good way for an executable running in a NeonVM to exit gracefully and have the VM shut down. The inittab we use always respawns the payload command. The idea is that the control plane can use "fast_import ... && poweroff" as the command, so that when fast_import completes successfully, the VM is terminated, and the k8s Pod and VirtualMachine object are marked as completed successfully. I'm working on bigger changes to how we launch VMs, and will try to come up with a nicer system for that, but in the meanwhile, this quick hack allows us to proceed with using VMs for one-off jobs like fast_import. --- compute/vm-image-spec-bookworm.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/compute/vm-image-spec-bookworm.yaml b/compute/vm-image-spec-bookworm.yaml index 005143fff3..86caa95f38 100644 --- a/compute/vm-image-spec-bookworm.yaml +++ b/compute/vm-image-spec-bookworm.yaml @@ -47,7 +47,9 @@ files: # Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap # and /neonvm/bin/set-disk-quota as root without requiring entering a password (NOPASSWD), # regardless of hostname (ALL) - postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota + # + # Also allow it to shut down the VM. The fast_import job does that when it's finished. 
+ postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff - filename: cgconfig.conf content: | # Configuration for cgroups in VM compute nodes From 5e95860e708ee1cd09f740b964f9a8c369ddd0bc Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 7 Feb 2025 16:27:39 +0000 Subject: [PATCH 408/583] tests: wait for manifest persistence in test_timeline_archival_chaos (#10719) ## Problem This test would sometimes fail its assertion that a timeline does not revert to active once archived. That's because it was using the in-memory offload state, not the persistent state, so this was sometimes lost across a pageserver restart. Closes: https://github.com/neondatabase/neon/issues/10389 ## Summary of changes - When reading offload status, read from pageserver API _and_ remote storage before considering the timeline offloaded --- test_runner/fixtures/remote_storage.py | 52 +++++++++++++++++--- test_runner/regress/test_timeline_archive.py | 29 ++++++++++- 2 files changed, 73 insertions(+), 8 deletions(-) diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py index d969971a35..4df2b2df2b 100644 --- a/test_runner/fixtures/remote_storage.py +++ b/test_runner/fixtures/remote_storage.py @@ -282,18 +282,35 @@ class S3Storage: def timeline_path(self, tenant_id: TenantShardId | TenantId, timeline_id: TimelineId) -> str: return f"{self.tenant_path(tenant_id)}/timelines/{timeline_id}" + def get_latest_generation_key(self, prefix: str, suffix: str, keys: list[str]) -> str: + """ + Gets the latest generation key from a list of keys. + + @param index_keys: A list of keys of different generations, which start with `prefix` + """ + + def parse_gen(key: str) -> int: + shortname = key.split("/")[-1] + generation_str = shortname.removeprefix(prefix).removesuffix(suffix) + try: + return int(generation_str, base=16) + except ValueError: + log.info(f"Ignoring non-matching key: {key}") + return -1 + + if len(keys) == 0: + raise IndexError("No keys found") + + return max(keys, key=parse_gen) + def get_latest_index_key(self, index_keys: list[str]) -> str: """ Gets the latest index file key. @param index_keys: A list of index keys of different generations. 
""" - - def parse_gen(index_key: str) -> int: - parts = index_key.split("index_part.json-") - return int(parts[-1], base=16) if len(parts) == 2 else -1 - - return max(index_keys, key=parse_gen) + key = self.get_latest_generation_key(prefix="index_part.json-", suffix="", keys=index_keys) + return key def download_index_part(self, index_key: str) -> IndexPartDump: """ @@ -306,6 +323,29 @@ class S3Storage: log.info(f"index_part.json: {body}") return IndexPartDump.from_json(json.loads(body)) + def download_tenant_manifest(self, tenant_id: TenantId) -> dict[str, Any] | None: + tenant_prefix = self.tenant_path(tenant_id) + + objects = self.client.list_objects_v2(Bucket=self.bucket_name, Prefix=f"{tenant_prefix}/")[ + "Contents" + ] + keys = [obj["Key"] for obj in objects if obj["Key"].find("tenant-manifest") != -1] + try: + manifest_key = self.get_latest_generation_key("tenant-manifest-", ".json", keys) + except IndexError: + log.info( + f"No manifest found for tenant {tenant_id}, this is normal if it didn't offload anything yet" + ) + return None + + response = self.client.get_object(Bucket=self.bucket_name, Key=manifest_key) + body = response["Body"].read().decode("utf-8") + log.info(f"Downloaded manifest {manifest_key}: {body}") + + manifest = json.loads(body) + assert isinstance(manifest, dict) + return manifest + def heatmap_key(self, tenant_id: TenantId) -> str: return f"{self.tenant_path(tenant_id)}/{TENANT_HEATMAP_FILE_NAME}" diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index 306e971657..50f674f539 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -554,8 +554,33 @@ def test_timeline_archival_chaos(neon_env_builder: NeonEnvBuilder): log.info(f"Timeline {state.timeline_id} is still active") shutdown.wait(0.5) elif state.timeline_id in offloaded_ids: - log.info(f"Timeline {state.timeline_id} is now offloaded") - state.offloaded = True + log.info(f"Timeline {state.timeline_id} is now offloaded in memory") + + # Hack: when we see something offloaded in the API, it doesn't guarantee that the offload + # is persistent (it is marked offloaded first, then that is persisted to the tenant manifest). + # So we wait until we see the manifest update before considering it offloaded, that way + # subsequent checks that it doesn't revert to active on a restart will pass reliably. + time.sleep(0.1) + assert isinstance(env.pageserver_remote_storage, S3Storage) + manifest = env.pageserver_remote_storage.download_tenant_manifest( + tenant_id + ) + if manifest is None: + log.info( + f"Timeline {state.timeline_id} is not yet offloaded persistently (no manifest)" + ) + elif str(state.timeline_id) in [ + t["timeline_id"] for t in manifest["offloaded_timelines"] + ]: + log.info( + f"Timeline {state.timeline_id} is now offloaded persistently" + ) + state.offloaded = True + else: + log.info( + f"Timeline {state.timeline_id} is not yet offloaded persistently (manifest: {manifest})" + ) + break else: # Timeline is neither offloaded nor active, this is unexpected: the pageserver From 2656c713a46b87c2b2f91bd1f1c813850a7b8372 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 7 Feb 2025 18:37:53 +0100 Subject: [PATCH 409/583] Revert recent AWS SDK update (#10724) We've been seeing some regressions in staging since the AWS SDK updates: https://github.com/neondatabase/neon/issues/10695 . 
We aren't sure the regression was caused by the SDK update, but the issues do involve S3, so it's not unlikely. By reverting the SDK update we find out whether it was really the SDK update, or something else. Reverts the two PRs: * https://github.com/neondatabase/neon/pull/10588 * https://github.com/neondatabase/neon/pull/10699 https://neondb.slack.com/archives/C08C2G15M6U/p1738576986047179 --- Cargo.lock | 87 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 48 insertions(+), 39 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2c5b0a113f..e73f1f9cdb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -300,9 +300,9 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "aws-config" -version = "1.5.15" +version = "1.5.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc47e70fc35d054c8fcd296d47a61711f043ac80534a10b4f741904f81e73a90" +checksum = "9b49afaa341e8dd8577e1a2200468f98956d6eda50bcf4a53246cc00174ba924" dependencies = [ "aws-credential-types", "aws-runtime", @@ -311,7 +311,7 @@ dependencies = [ "aws-sdk-sts", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json", + "aws-smithy-json 0.60.7", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -342,9 +342,9 @@ dependencies = [ [[package]] name = "aws-runtime" -version = "1.5.4" +version = "1.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bee7643696e7fdd74c10f9eb42848a87fe469d35eae9c3323f80aa98f350baac" +checksum = "b5ac934720fbb46206292d2c75b57e67acfc56fe7dfd34fb9a02334af08409ea" dependencies = [ "aws-credential-types", "aws-sigv4", @@ -368,15 +368,15 @@ dependencies = [ [[package]] name = "aws-sdk-iam" -version = "1.60.0" +version = "1.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a43daa438f8e7e4ebbbcb5c712b3b85db50d62e637a7da4ba9da51095d327460" +checksum = "fb8a6fea8d335cde419176b1f2c6d2d6e97997719e7df4b51e59064310f48e4a" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json", + "aws-smithy-json 0.61.1", "aws-smithy-query", "aws-smithy-runtime", "aws-smithy-runtime-api", @@ -391,15 +391,15 @@ dependencies = [ [[package]] name = "aws-sdk-kms" -version = "1.58.0" +version = "1.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "40b7a24700ac548025a47a5c579886f5198895bb1eccd8964dfd71cd66c16912" +checksum = "3c30f6fd5646b99d9b45ec3a0c22e67112c175b2383100c960d7ee39d96c8d96" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json", + "aws-smithy-json 0.61.1", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -413,9 +413,9 @@ dependencies = [ [[package]] name = "aws-sdk-s3" -version = "1.68.0" +version = "1.65.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc5ddf1dc70287dc9a2f953766a1fe15e3e74aef02fd1335f2afa475c9b4f4fc" +checksum = "d3ba2c5c0f2618937ce3d4a5ad574b86775576fa24006bcb3128c6e2cbf3c34e" dependencies = [ "aws-credential-types", "aws-runtime", @@ -424,7 +424,7 @@ dependencies = [ "aws-smithy-checksums", "aws-smithy-eventstream", "aws-smithy-http", - "aws-smithy-json", + "aws-smithy-json 0.61.1", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -447,15 +447,15 @@ dependencies = [ [[package]] name = "aws-sdk-sso" -version = "1.57.0" +version = "1.50.0" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "c54bab121fe1881a74c338c5f723d1592bf3b53167f80268a1274f404e1acc38" +checksum = "05ca43a4ef210894f93096039ef1d6fa4ad3edfabb3be92b80908b9f2e4b4eab" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json", + "aws-smithy-json 0.61.1", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -469,15 +469,15 @@ dependencies = [ [[package]] name = "aws-sdk-ssooidc" -version = "1.58.0" +version = "1.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c8234fd024f7ac61c4e44ea008029bde934250f371efe7d4a39708397b1080c" +checksum = "abaf490c2e48eed0bb8e2da2fb08405647bd7f253996e0f93b981958ea0f73b0" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json", + "aws-smithy-json 0.61.1", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -491,15 +491,15 @@ dependencies = [ [[package]] name = "aws-sdk-sts" -version = "1.58.0" +version = "1.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba60e1d519d6f23a9df712c04fdeadd7872ac911c84b2f62a8bda92e129b7962" +checksum = "b68fde0d69c8bfdc1060ea7da21df3e39f6014da316783336deff0a9ec28f4bf" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json", + "aws-smithy-json 0.61.1", "aws-smithy-query", "aws-smithy-runtime", "aws-smithy-runtime-api", @@ -514,9 +514,9 @@ dependencies = [ [[package]] name = "aws-sigv4" -version = "1.2.8" +version = "1.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0bc5bbd1e4a2648fd8c5982af03935972c24a2f9846b396de661d351ee3ce837" +checksum = "7d3820e0c08d0737872ff3c7c1f21ebbb6693d832312d6152bf18ef50a5471c2" dependencies = [ "aws-credential-types", "aws-smithy-eventstream", @@ -543,9 +543,9 @@ dependencies = [ [[package]] name = "aws-smithy-async" -version = "1.2.4" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa59d1327d8b5053c54bf2eaae63bf629ba9e904434d0835a28ed3c0ed0a614e" +checksum = "62220bc6e97f946ddd51b5f1361f78996e704677afc518a4ff66b7a72ea1378c" dependencies = [ "futures-util", "pin-project-lite", @@ -575,9 +575,9 @@ dependencies = [ [[package]] name = "aws-smithy-eventstream" -version = "0.60.6" +version = "0.60.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b18559a41e0c909b77625adf2b8c50de480a8041e5e4a3f5f7d177db70abc5a" +checksum = "cef7d0a272725f87e51ba2bf89f8c21e4df61b9e49ae1ac367a6d69916ef7c90" dependencies = [ "aws-smithy-types", "bytes", @@ -586,9 +586,9 @@ dependencies = [ [[package]] name = "aws-smithy-http" -version = "0.60.12" +version = "0.60.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7809c27ad8da6a6a68c454e651d4962479e81472aa19ae99e59f9aba1f9713cc" +checksum = "5c8bc3e8fdc6b8d07d976e301c02fe553f72a39b7a9fea820e023268467d7ab6" dependencies = [ "aws-smithy-eventstream", "aws-smithy-runtime-api", @@ -607,9 +607,18 @@ dependencies = [ [[package]] name = "aws-smithy-json" -version = "0.61.2" +version = "0.60.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "623a51127f24c30776c8b374295f2df78d92517386f77ba30773f15a30ce1422" +checksum = "4683df9469ef09468dad3473d129960119a0d3593617542b7d52086c8486f2d6" +dependencies = [ + "aws-smithy-types", +] + +[[package]] +name = "aws-smithy-json" +version = "0.61.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee4e69cc50921eb913c6b662f8d909131bb3e6ad6cb6090d3a39b66fc5c52095" dependencies = [ "aws-smithy-types", ] @@ -626,9 +635,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime" -version = "1.7.7" +version = "1.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "865f7050bbc7107a6c98a397a9fcd9413690c27fa718446967cf03b2d3ac517e" +checksum = "9f20685047ca9d6f17b994a07f629c813f08b5bce65523e47124879e60103d45" dependencies = [ "aws-smithy-async", "aws-smithy-http", @@ -670,9 +679,9 @@ dependencies = [ [[package]] name = "aws-smithy-types" -version = "1.2.13" +version = "1.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7b8a53819e42f10d0821f56da995e1470b199686a1809168db6ca485665f042" +checksum = "4fbd94a32b3a7d55d3806fe27d98d3ad393050439dd05eb53ece36ec5e3d3510" dependencies = [ "base64-simd", "bytes", @@ -705,9 +714,9 @@ dependencies = [ [[package]] name = "aws-types" -version = "1.3.5" +version = "1.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dfbd0a668309ec1f66c0f6bda4840dd6d4796ae26d699ebc266d7cc95c6d040f" +checksum = "5221b91b3e441e6675310829fd8984801b772cb1546ef6c0e54dec9f1ac13fef" dependencies = [ "aws-credential-types", "aws-smithy-async", From bf20d78292618355b160cc74fab474d8d956e92e Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 7 Feb 2025 20:45:39 +0100 Subject: [PATCH 410/583] fix(page_service): page reconstruct error log does not include `shard_id` label (#10680) # Problem Before this PR, the `shard_id` field was missing when page_service logs a reconstruct error. This was caused by batching-related refactorings. Example from staging: ``` 2025-01-30T07:10:04.346022Z ERROR page_service_conn_main{peer_addr=...}:process_query{tenant_id=... timeline_id=...}:handle_pagerequests:request:handle_get_page_at_lsn_request_batched{req_lsn=FFFFFFFF/FFFFFFFF}: error reading relation or page version: Read error: whole vectored get request failed because one or more of the requested keys were missing: could not find data for key ... ``` # Changes Delay creation of the handler-specific span until after shard routing This also avoids the need for the record() call in the pagestream hot path. # Testing Manual testing with a failpoint that is part of this PR's history but will be squashed away. 
# Refs - fixes https://github.com/neondatabase/neon/issues/10599 --- pageserver/src/page_service.rs | 74 ++++++++++++++++++++-------------- 1 file changed, 43 insertions(+), 31 deletions(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 24a350399d..db8c428795 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -489,7 +489,6 @@ impl timeline::handle::TenantManager for TenantManagerWrappe let timeline = tenant_shard .get_timeline(timeline_id, true) .map_err(GetActiveTimelineError::Timeline)?; - set_tracing_field_shard_id(&timeline); Ok(timeline) } } @@ -774,11 +773,11 @@ impl PageServerHandler { let batched_msg = match neon_fe_msg { PagestreamFeMessage::Exists(req) => { - let span = tracing::info_span!(parent: parent_span, "handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn); let shard = timeline_handles .get(tenant_id, timeline_id, ShardSelector::Zero) - .instrument(span.clone()) // sets `shard_id` field .await?; + debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id(); + let span = tracing::info_span!(parent: &parent_span, "handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn, shard_id = %shard.tenant_shard_id.shard_slug()); let timer = record_op_start_and_throttle( &shard, metrics::SmgrQueryType::GetRelExists, @@ -793,11 +792,10 @@ impl PageServerHandler { } } PagestreamFeMessage::Nblocks(req) => { - let span = tracing::info_span!(parent: parent_span, "handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn); let shard = timeline_handles .get(tenant_id, timeline_id, ShardSelector::Zero) - .instrument(span.clone()) // sets `shard_id` field .await?; + let span = tracing::info_span!(parent: &parent_span, "handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn, shard_id = %shard.tenant_shard_id.shard_slug()); let timer = record_op_start_and_throttle( &shard, metrics::SmgrQueryType::GetRelSize, @@ -812,11 +810,10 @@ impl PageServerHandler { } } PagestreamFeMessage::DbSize(req) => { - let span = tracing::info_span!(parent: parent_span, "handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.hdr.request_lsn); let shard = timeline_handles .get(tenant_id, timeline_id, ShardSelector::Zero) - .instrument(span.clone()) // sets `shard_id` field .await?; + let span = tracing::info_span!(parent: &parent_span, "handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.hdr.request_lsn, shard_id = %shard.tenant_shard_id.shard_slug()); let timer = record_op_start_and_throttle( &shard, metrics::SmgrQueryType::GetDbSize, @@ -831,11 +828,10 @@ impl PageServerHandler { } } PagestreamFeMessage::GetSlruSegment(req) => { - let span = tracing::info_span!(parent: parent_span, "handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.hdr.request_lsn); let shard = timeline_handles .get(tenant_id, timeline_id, ShardSelector::Zero) - .instrument(span.clone()) // sets `shard_id` field .await?; + let span = tracing::info_span!(parent: &parent_span, "handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.hdr.request_lsn, shard_id = %shard.tenant_shard_id.shard_slug()); let timer = record_op_start_and_throttle( &shard, metrics::SmgrQueryType::GetSlruSegment, @@ -850,12 +846,20 @@ impl PageServerHandler { } } PagestreamFeMessage::GetPage(req) => { - let span = tracing::info_span!(parent: parent_span, "handle_get_page_at_lsn_request_batched", req_lsn = %req.hdr.request_lsn); + // 
avoid a somewhat costly Span::record() by constructing the entire span in one go. + macro_rules! mkspan { + (before shard routing) => {{ + tracing::info_span!(parent: &parent_span, "handle_get_page_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.hdr.request_lsn) + }}; + ($shard_id:expr) => {{ + tracing::info_span!(parent: &parent_span, "handle_get_page_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.hdr.request_lsn, shard_id = %$shard_id) + }}; + } macro_rules! respond_error { - ($error:expr) => {{ + ($span:expr, $error:expr) => {{ let error = BatchedFeMessage::RespondError { - span, + span: $span, error: BatchedPageStreamError { req: req.hdr, err: $error, @@ -868,27 +872,35 @@ impl PageServerHandler { let key = rel_block_to_key(req.rel, req.blkno); let shard = match timeline_handles .get(tenant_id, timeline_id, ShardSelector::Page(key)) - .instrument(span.clone()) // sets `shard_id` field .await { Ok(tl) => tl, - Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => { - // We already know this tenant exists in general, because we resolved it at - // start of connection. Getting a NotFound here indicates that the shard containing - // the requested page is not present on this node: the client's knowledge of shard->pageserver - // mapping is out of date. - // - // Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via - // client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration - // and talk to a different pageserver. - return respond_error!(PageStreamError::Reconnect( - "getpage@lsn request routed to wrong shard".into() - )); - } Err(e) => { - return respond_error!(e.into()); + let span = mkspan!(before shard routing); + match e { + GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_)) => { + // We already know this tenant exists in general, because we resolved it at + // start of connection. Getting a NotFound here indicates that the shard containing + // the requested page is not present on this node: the client's knowledge of shard->pageserver + // mapping is out of date. + // + // Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via + // client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration + // and talk to a different pageserver. 
+ return respond_error!( + span, + PageStreamError::Reconnect( + "getpage@lsn request routed to wrong shard".into() + ) + ); + } + e => { + return respond_error!(span, e.into()); + } + } } }; + let span = mkspan!(shard.tenant_shard_id.shard_slug()); let timer = record_op_start_and_throttle( &shard, @@ -910,7 +922,7 @@ impl PageServerHandler { { Ok(lsn) => lsn, Err(e) => { - return respond_error!(e); + return respond_error!(span, e); } }; BatchedFeMessage::GetPage { @@ -922,11 +934,10 @@ impl PageServerHandler { } #[cfg(feature = "testing")] PagestreamFeMessage::Test(req) => { - let span = tracing::info_span!(parent: parent_span, "handle_test_request"); let shard = timeline_handles .get(tenant_id, timeline_id, ShardSelector::Zero) - .instrument(span.clone()) // sets `shard_id` field .await?; + let span = tracing::info_span!(parent: &parent_span, "handle_test_request", shard_id = %shard.tenant_shard_id.shard_slug()); let timer = record_op_start_and_throttle(&shard, metrics::SmgrQueryType::Test, received_at) .await?; @@ -1340,7 +1351,7 @@ impl PageServerHandler { .take() .expect("implementation error: timeline_handles should not be locked"); - let request_span = info_span!("request", shard_id = tracing::field::Empty); + let request_span = info_span!("request"); let ((pgb_reader, timeline_handles), result) = match self.pipelining_config.clone() { PageServicePipeliningConfig::Pipelined(pipelining_config) => { self.handle_pagerequests_pipelined( @@ -2034,6 +2045,7 @@ impl PageServerHandler { .unwrap() .get(tenant_id, timeline_id, ShardSelector::Zero) .await?; + set_tracing_field_shard_id(&timeline); if timeline.is_archived() == Some(true) { // TODO after a grace period, turn this log line into a hard error From 6cd3b501ec8f9100e9eeb092f64f3de18c47c5ea Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Sat, 8 Feb 2025 10:28:09 +0100 Subject: [PATCH 411/583] fix(page_service / batching): smgr op latency metrics includes the flush time of preceding requests (#10728) Before this PR, if a batch contains N responses, the smgr op latency reported for response (N-i) would include the time we spent flushing the preceding requests. refs: - fixup of https://github.com/neondatabase/neon/pull/10042 - fixes https://github.com/neondatabase/neon/issues/10674 --- pageserver/src/page_service.rs | 37 ++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index db8c428795..69f1f1c051 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1201,6 +1201,29 @@ impl PageServerHandler { } }; + // We purposefully don't count flush time into the smgr operaiton timer. + // + // The reason is that current compute client will not perform protocol processing + // if the postgres backend process is doing things other than `->smgr_read()`. + // This is especially the case for prefetch. + // + // If the compute doesn't read from the connection, eventually TCP will backpressure + // all the way into our flush call below. + // + // The timer's underlying metric is used for a storage-internal latency SLO and + // we don't want to include latency in it that we can't control. + // And as pointed out above, in this case, we don't control the time that flush will take. 
+ // + // We put each response in the batch onto the wire in a separate pgb_writer.flush() + // call, which (all unmeasured) adds syscall overhead but reduces time to first byte + // and avoids building up a "giant" contiguous userspace buffer to hold the entire response. + // TODO: vectored socket IO would be great, but pgb_writer doesn't support that. + // + // Since we're flushing multiple times in the loop, but only have access to the per-op + // timers inside the loop, we capture the flush start time here and reuse it to finish + // each op timer. + let flushing_start_time = Instant::now(); + // Map handler result to protocol behavior. // Some handler errors cause exit from pagestream protocol. // Other handler errors are sent back as an error message and we stay in pagestream protocol. @@ -1249,21 +1272,9 @@ impl PageServerHandler { &response_msg.serialize(protocol_version), ))?; - // We purposefully don't count flush time into the timer. - // - // The reason is that current compute client will not perform protocol processing - // if the postgres backend process is doing things other than `->smgr_read()`. - // This is especially the case for prefetch. - // - // If the compute doesn't read from the connection, eventually TCP will backpressure - // all the way into our flush call below. - // - // The timer's underlying metric is used for a storage-internal latency SLO and - // we don't want to include latency in it that we can't control. - // And as pointed out above, in this case, we don't control the time that flush will take. let flushing_timer = timer.map(|mut timer| { timer - .observe_execution_end_flush_start(Instant::now()) + .observe_execution_end_flush_start(flushing_start_time) .expect("we are the first caller") }); From 874accd6ede7231e5e4e1f562a83862e2286f6cd Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Sat, 8 Feb 2025 12:02:13 +0100 Subject: [PATCH 412/583] pageserver: misc task cleanups (#10723) This patch does a bunch of superficial cleanups of `tenant::tasks` to avoid noise in subsequent PRs. There are no functional changes. PS: enable "hide whitespace" when reviewing, due to the unindentation of large async blocks. --- pageserver/src/disk_usage_eviction_task.rs | 11 +- pageserver/src/tenant/tasks.rs | 714 ++++++++---------- pageserver/src/tenant/timeline.rs | 8 +- .../src/tenant/timeline/eviction_task.rs | 10 +- 4 files changed, 342 insertions(+), 401 deletions(-) diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index ca44fbe6ae..738a783813 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -61,6 +61,7 @@ use crate::{ remote_timeline_client::LayerFileMetadata, secondary::SecondaryTenant, storage_layer::{AsLayerDesc, EvictionError, Layer, LayerName, LayerVisibilityHint}, + tasks::sleep_random, }, CancellableTask, DiskUsageEvictionTask, }; @@ -210,14 +211,8 @@ async fn disk_usage_eviction_task( info!("disk usage based eviction task finishing"); }; - use crate::tenant::tasks::random_init_delay; - { - if random_init_delay(task_config.period, &cancel) - .await - .is_err() - { - return; - } + if sleep_random(task_config.period, &cancel).await.is_err() { + return; } let mut iteration_no = 0; diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index d562f7b783..a45eb002bd 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -1,15 +1,17 @@ -//! 
This module contains functions to serve per-tenant background processes, -//! such as compaction and GC +//! This module contains per-tenant background processes, e.g. compaction and GC. use std::cmp::max; -use std::ops::ControlFlow; +use std::future::Future; +use std::ops::{ControlFlow, RangeInclusive}; +use std::pin::pin; use std::str::FromStr; use std::sync::{Arc, Mutex}; use std::time::{Duration, Instant}; use once_cell::sync::Lazy; use rand::Rng; -use tokio::sync::Semaphore; +use scopeguard::defer; +use tokio::sync::{Semaphore, SemaphorePermit}; use tokio_util::sync::CancellationToken; use tracing::*; @@ -20,8 +22,10 @@ use crate::tenant::throttle::Stats; use crate::tenant::timeline::compaction::CompactionOutcome; use crate::tenant::timeline::CompactionError; use crate::tenant::{Tenant, TenantState}; +use pageserver_api::config::tenant_conf_defaults::DEFAULT_COMPACTION_PERIOD; +use utils::completion::Barrier; use utils::rate_limit::RateLimit; -use utils::{backoff, completion, pausable_failpoint}; +use utils::{backoff, pausable_failpoint}; /// Semaphore limiting concurrent background tasks (across all tenants). /// @@ -52,6 +56,10 @@ static CONCURRENT_COMPACTION_TASKS: Lazy = Lazy::new(|| { Semaphore::new(permits) }); +/// Background jobs. +/// +/// NB: not all of these acquire a CONCURRENT_BACKGROUND_TASKS semaphore permit, only the ones that +/// do any significant IO. #[derive( Debug, PartialEq, @@ -76,15 +84,15 @@ pub(crate) enum BackgroundLoopKind { } pub struct BackgroundLoopSemaphorePermit<'a> { - _permit: tokio::sync::SemaphorePermit<'static>, + _permit: SemaphorePermit<'static>, _recorder: BackgroundLoopSemaphoreMetricsRecorder<'a>, } -/// Cancellation safe. -pub(crate) async fn concurrent_background_tasks_rate_limit_permit( - _ctx: &RequestContext, +/// Acquires a semaphore permit, to limit concurrent background jobs. +pub(crate) async fn acquire_concurrency_permit( loop_kind: BackgroundLoopKind, use_compaction_semaphore: bool, + _ctx: &RequestContext, ) -> BackgroundLoopSemaphorePermit<'static> { // TODO: use a lower threshold and remove the pacer once we resolve some blockage. const WARN_THRESHOLD: Duration = Duration::from_secs(600); @@ -121,12 +129,10 @@ pub(crate) async fn concurrent_background_tasks_rate_limit_permit( } } -/// Start per tenant background loops: compaction and gc. -pub fn start_background_loops( - tenant: &Arc, - background_jobs_can_start: Option<&completion::Barrier>, -) { +/// Start per tenant background loops: compaction, GC, and ingest housekeeping. +pub fn start_background_loops(tenant: &Arc, can_start: Option<&Barrier>) { let tenant_shard_id = tenant.tenant_shard_id; + task_mgr::spawn( BACKGROUND_RUNTIME.handle(), TaskKind::Compaction, @@ -135,13 +141,15 @@ pub fn start_background_loops( &format!("compactor for tenant {tenant_shard_id}"), { let tenant = Arc::clone(tenant); - let background_jobs_can_start = background_jobs_can_start.cloned(); + let can_start = can_start.cloned(); async move { - let cancel = task_mgr::shutdown_token(); + let cancel = task_mgr::shutdown_token(); // NB: must be in async context tokio::select! 
{ - _ = cancel.cancelled() => { return Ok(()) }, - _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {} + _ = cancel.cancelled() => return Ok(()), + _ = Barrier::maybe_wait(can_start) => {} }; + TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); + defer!(TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc()); compaction_loop(tenant, cancel) // If you rename this span, change the RUST_LOG env variable in test_runner/performance/test_branch_creation.py .instrument(info_span!("compaction_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug())) @@ -150,6 +158,7 @@ pub fn start_background_loops( } }, ); + task_mgr::spawn( BACKGROUND_RUNTIME.handle(), TaskKind::GarbageCollector, @@ -158,13 +167,15 @@ pub fn start_background_loops( &format!("garbage collector for tenant {tenant_shard_id}"), { let tenant = Arc::clone(tenant); - let background_jobs_can_start = background_jobs_can_start.cloned(); + let can_start = can_start.cloned(); async move { - let cancel = task_mgr::shutdown_token(); + let cancel = task_mgr::shutdown_token(); // NB: must be in async context tokio::select! { - _ = cancel.cancelled() => { return Ok(()) }, - _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {} + _ = cancel.cancelled() => return Ok(()), + _ = Barrier::maybe_wait(can_start) => {} }; + TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); + defer!(TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc()); gc_loop(tenant, cancel) .instrument(info_span!("gc_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug())) .await; @@ -181,13 +192,15 @@ pub fn start_background_loops( &format!("ingest housekeeping for tenant {tenant_shard_id}"), { let tenant = Arc::clone(tenant); - let background_jobs_can_start = background_jobs_can_start.cloned(); + let can_start = can_start.cloned(); async move { - let cancel = task_mgr::shutdown_token(); + let cancel = task_mgr::shutdown_token(); // NB: must be in async context tokio::select! { - _ = cancel.cancelled() => { return Ok(()) }, - _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {} + _ = cancel.cancelled() => return Ok(()), + _ = Barrier::maybe_wait(can_start) => {} }; + TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); + defer!(TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc()); ingest_housekeeping_loop(tenant, cancel) .instrument(info_span!("ingest_housekeeping_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug())) .await; @@ -197,372 +210,309 @@ pub fn start_background_loops( ); } -/// -/// Compaction task's main loop -/// +/// Compaction task's main loop. async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { const MAX_BACKOFF_SECS: f64 = 300.0; - // How many errors we have seen consequtively - let mut error_run_count = 0; - TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); - async { - let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download); - let mut first = true; - loop { - tokio::select! 
{ - _ = cancel.cancelled() => { - return; - }, - tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result { - ControlFlow::Break(()) => return, - ControlFlow::Continue(()) => (), - }, - } + let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download); + let mut first = true; + let mut error_run = 0; // consecutive errors - let period = tenant.get_compaction_period(); + loop { + if wait_for_active_tenant(&tenant, &cancel).await.is_break() { + return; + } - // TODO: we shouldn't need to await to find tenant and this could be moved outside of - // loop, #3501. There are also additional "allowed_errors" in tests. - if first { - first = false; - if random_init_delay(period, &cancel).await.is_err() { - break; - } - } + let period = tenant.get_compaction_period(); - let sleep_duration; - if period == Duration::ZERO { - #[cfg(not(feature = "testing"))] - info!("automatic compaction is disabled"); - // check again in 10 seconds, in case it's been enabled again. - sleep_duration = Duration::from_secs(10) - } else { - let iteration = Iteration { - started_at: Instant::now(), - period, - kind: BackgroundLoopKind::Compaction, - }; - - // Run compaction - let IterationResult { output, elapsed } = iteration - .run(tenant.compaction_iteration(&cancel, &ctx)) - .await; - match output { - Ok(outcome) => { - error_run_count = 0; - // schedule the next compaction immediately in case there is a pending compaction task - sleep_duration = if let CompactionOutcome::Pending = outcome { - Duration::from_secs(1) - } else { - period - }; - } - Err(e) => { - let wait_duration = backoff::exponential_backoff_duration_seconds( - error_run_count + 1, - 1.0, - MAX_BACKOFF_SECS, - ); - error_run_count += 1; - let wait_duration = Duration::from_secs_f64(wait_duration); - log_compaction_error( - &e, - error_run_count, - &wait_duration, - cancel.is_cancelled(), - ); - sleep_duration = wait_duration; - } - } - - // the duration is recorded by performance tests by enabling debug in this function - tracing::debug!( - elapsed_ms = elapsed.as_millis(), - "compaction iteration complete" - ); - }; - - // Perhaps we did no work and the walredo process has been idle for some time: - // give it a chance to shut down to avoid leaving walredo process running indefinitely. - // TODO: move this to a separate task (housekeeping loop) that isn't affected by the back-off, - // so we get some upper bound guarantee on when walredo quiesce / this throttling reporting here happens. - if let Some(walredo_mgr) = &tenant.walredo_mgr { - walredo_mgr.maybe_quiesce(period * 10); - } - - // Sleep - if tokio::time::timeout(sleep_duration, cancel.cancelled()) - .await - .is_ok() - { + // TODO: we shouldn't need to await to find tenant and this could be moved outside of + // loop, #3501. There are also additional "allowed_errors" in tests. + if first { + first = false; + if sleep_random(period, &cancel).await.is_err() { break; } } + + let sleep_duration; + if period == Duration::ZERO { + #[cfg(not(feature = "testing"))] + info!("automatic compaction is disabled"); + // check again in 10 seconds, in case it's been enabled again. 
+ sleep_duration = Duration::from_secs(10) + } else { + let iteration = Iteration { + started_at: Instant::now(), + period, + kind: BackgroundLoopKind::Compaction, + }; + + // Run compaction + let IterationResult { output, elapsed } = iteration + .run(tenant.compaction_iteration(&cancel, &ctx)) + .await; + match output { + Ok(outcome) => { + error_run = 0; + // schedule the next compaction immediately in case there is a pending compaction task + sleep_duration = if let CompactionOutcome::Pending = outcome { + Duration::from_secs(1) + } else { + period + }; + } + Err(err) => { + let wait_duration = backoff::exponential_backoff_duration_seconds( + error_run + 1, + 1.0, + MAX_BACKOFF_SECS, + ); + error_run += 1; + let wait_duration = Duration::from_secs_f64(wait_duration); + log_compaction_error(&err, error_run, &wait_duration, cancel.is_cancelled()); + sleep_duration = wait_duration; + } + } + + // the duration is recorded by performance tests by enabling debug in this function + debug!( + elapsed_ms = elapsed.as_millis(), + "compaction iteration complete" + ); + }; + + // Perhaps we did no work and the walredo process has been idle for some time: + // give it a chance to shut down to avoid leaving walredo process running indefinitely. + // TODO: move this to a separate task (housekeeping loop) that isn't affected by the back-off, + // so we get some upper bound guarantee on when walredo quiesce / this throttling reporting here happens. + if let Some(walredo_mgr) = &tenant.walredo_mgr { + walredo_mgr.maybe_quiesce(period * 10); + } + + // Sleep + if tokio::time::timeout(sleep_duration, cancel.cancelled()) + .await + .is_ok() + { + break; + } } - .await; - TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc(); } fn log_compaction_error( - e: &CompactionError, - error_run_count: u32, - sleep_duration: &std::time::Duration, + err: &CompactionError, + error_count: u32, + sleep_duration: &Duration, task_cancelled: bool, ) { use crate::tenant::upload_queue::NotInitialized; use crate::tenant::PageReconstructError; use CompactionError::*; - enum LooksLike { - Info, - Error, - } + let level = match err { + ShuttingDown => return, + Offload(_) => Level::ERROR, + _ if task_cancelled => Level::INFO, + Other(err) => { + let root_cause = err.root_cause(); - let decision = match e { - ShuttingDown => None, - Offload(_) => Some(LooksLike::Error), - _ if task_cancelled => Some(LooksLike::Info), - Other(e) => { - let root_cause = e.root_cause(); - - let is_stopping = { - let upload_queue = root_cause - .downcast_ref::() - .is_some_and(|e| e.is_stopping()); - - let timeline = root_cause - .downcast_ref::() - .is_some_and(|e| e.is_stopping()); - - upload_queue || timeline - }; + let upload_queue = root_cause + .downcast_ref::() + .is_some_and(|e| e.is_stopping()); + let timeline = root_cause + .downcast_ref::() + .is_some_and(|e| e.is_stopping()); + let is_stopping = upload_queue || timeline; if is_stopping { - Some(LooksLike::Info) + Level::INFO } else { - Some(LooksLike::Error) + Level::ERROR } } }; - match decision { - Some(LooksLike::Info) => info!( - "Compaction failed {error_run_count} times, retrying in {sleep_duration:?}: {e:#}", - ), - Some(LooksLike::Error) => error!( - "Compaction failed {error_run_count} times, retrying in {sleep_duration:?}: {e:?}", - ), - None => {} + match level { + Level::ERROR => { + error!("Compaction failed {error_count} times, retrying in {sleep_duration:?}: {err:#}") + } + Level::INFO => { + info!("Compaction failed {error_count} times, retrying in {sleep_duration:?}: 
{err:#}") + } + level => unimplemented!("unexpected level {level:?}"), } } -/// -/// GC task's main loop -/// +/// GC task's main loop. async fn gc_loop(tenant: Arc, cancel: CancellationToken) { const MAX_BACKOFF_SECS: f64 = 300.0; - // How many errors we have seen consequtively - let mut error_run_count = 0; + let mut error_run = 0; // consecutive errors - TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); - async { - // GC might require downloading, to find the cutoff LSN that corresponds to the - // cutoff specified as time. - let ctx = - RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download); + // GC might require downloading, to find the cutoff LSN that corresponds to the + // cutoff specified as time. + let ctx = RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download); + let mut first = true; - let mut first = true; - loop { - tokio::select! { - _ = cancel.cancelled() => { - return; - }, - tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result { - ControlFlow::Break(()) => return, - ControlFlow::Continue(()) => (), - }, - } + loop { + if wait_for_active_tenant(&tenant, &cancel).await.is_break() { + return; + } - let period = tenant.get_gc_period(); + let period = tenant.get_gc_period(); - if first { - first = false; - - let delays = async { - random_init_delay(period, &cancel).await?; - Ok::<_, Cancelled>(()) - }; - - if delays.await.is_err() { - break; - } - } - - let gc_horizon = tenant.get_gc_horizon(); - let sleep_duration; - if period == Duration::ZERO || gc_horizon == 0 { - #[cfg(not(feature = "testing"))] - info!("automatic GC is disabled"); - // check again in 10 seconds, in case it's been enabled again. - sleep_duration = Duration::from_secs(10); - } else { - let iteration = Iteration { - started_at: Instant::now(), - period, - kind: BackgroundLoopKind::Gc, - }; - // Run gc - let IterationResult { output, elapsed: _ } = - iteration.run(tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &cancel, &ctx)) - .await; - match output { - Ok(_) => { - error_run_count = 0; - sleep_duration = period; - } - Err(crate::tenant::GcError::TenantCancelled) => { - return; - } - Err(e) => { - let wait_duration = backoff::exponential_backoff_duration_seconds( - error_run_count + 1, - 1.0, - MAX_BACKOFF_SECS, - ); - error_run_count += 1; - let wait_duration = Duration::from_secs_f64(wait_duration); - - if matches!(e, crate::tenant::GcError::TimelineCancelled) { - // Timeline was cancelled during gc. We might either be in an event - // that affects the entire tenant (tenant deletion, pageserver shutdown), - // or in one that affects the timeline only (timeline deletion). - // Therefore, don't exit the loop. - info!("Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}"); - } else { - error!("Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}"); - } - - sleep_duration = wait_duration; - } - } - }; - - if tokio::time::timeout(sleep_duration, cancel.cancelled()) - .await - .is_ok() - { + if first { + first = false; + if sleep_random(period, &cancel).await.is_err() { break; } } - } - .await; - TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc(); -} - -async fn ingest_housekeeping_loop(tenant: Arc, cancel: CancellationToken) { - TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); - async { - let mut last_throttle_flag_reset_at = Instant::now(); - loop { - tokio::select! 
{ - _ = cancel.cancelled() => { - return; - }, - tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result { - ControlFlow::Break(()) => return, - ControlFlow::Continue(()) => (), - }, - } - - // We run ingest housekeeping with the same frequency as compaction: it is not worth - // having a distinct setting. But we don't run it in the same task, because compaction - // blocks on acquiring the background job semaphore. - let period = tenant.get_compaction_period(); - - // If compaction period is set to zero (to disable it), then we will use a reasonable default - let period = if period == Duration::ZERO { - humantime::Duration::from_str( - pageserver_api::config::tenant_conf_defaults::DEFAULT_COMPACTION_PERIOD, - ) - .unwrap() - .into() - } else { - period - }; - - // Jitter the period by +/- 5% - let period = - rand::thread_rng().gen_range((period * (95)) / 100..(period * (105)) / 100); - - // Always sleep first: we do not need to do ingest housekeeping early in the lifetime of - // a tenant, since it won't have started writing any ephemeral files yet. - if tokio::time::timeout(period, cancel.cancelled()) - .await - .is_ok() - { - break; - } + let gc_horizon = tenant.get_gc_horizon(); + let sleep_duration; + if period == Duration::ZERO || gc_horizon == 0 { + #[cfg(not(feature = "testing"))] + info!("automatic GC is disabled"); + // check again in 10 seconds, in case it's been enabled again. + sleep_duration = Duration::from_secs(10); + } else { let iteration = Iteration { started_at: Instant::now(), period, - kind: BackgroundLoopKind::IngestHouseKeeping, + kind: BackgroundLoopKind::Gc, }; - iteration.run(tenant.ingest_housekeeping()).await; - - // TODO: rename the background loop kind to something more generic, like, tenant housekeeping. - // Or just spawn another background loop for this throttle, it's not like it's super costly. 
- info_span!(parent: None, "pagestream_throttle", tenant_id=%tenant.tenant_shard_id, shard_id=%tenant.tenant_shard_id.shard_slug()).in_scope(|| { - let now = Instant::now(); - let prev = std::mem::replace(&mut last_throttle_flag_reset_at, now); - let Stats { count_accounted_start, count_accounted_finish, count_throttled, sum_throttled_usecs} = tenant.pagestream_throttle.reset_stats(); - if count_throttled == 0 { + // Run gc + let IterationResult { output, elapsed: _ } = iteration + .run(tenant.gc_iteration( + None, + gc_horizon, + tenant.get_pitr_interval(), + &cancel, + &ctx, + )) + .await; + match output { + Ok(_) => { + error_run = 0; + sleep_duration = period; + } + Err(crate::tenant::GcError::TenantCancelled) => { return; } - let allowed_rps = tenant.pagestream_throttle.steady_rps(); - let delta = now - prev; - info!( - n_seconds=%format_args!("{:.3}", delta.as_secs_f64()), - count_accounted = count_accounted_finish, // don't break existing log scraping - count_throttled, - sum_throttled_usecs, - count_accounted_start, // log after pre-existing fields to not break existing log scraping - allowed_rps=%format_args!("{allowed_rps:.0}"), - "shard was throttled in the last n_seconds" - ); - }); - } - } - .await; - TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc(); -} + Err(e) => { + let wait_duration = backoff::exponential_backoff_duration_seconds( + error_run + 1, + 1.0, + MAX_BACKOFF_SECS, + ); + error_run += 1; + let wait_duration = Duration::from_secs_f64(wait_duration); -async fn wait_for_active_tenant(tenant: &Arc) -> ControlFlow<()> { - // if the tenant has a proper status already, no need to wait for anything - if tenant.current_state() == TenantState::Active { - ControlFlow::Continue(()) - } else { - let mut tenant_state_updates = tenant.subscribe_for_state_updates(); - loop { - match tenant_state_updates.changed().await { - Ok(()) => { - let new_state = &*tenant_state_updates.borrow(); - match new_state { - TenantState::Active => { - debug!("Tenant state changed to active, continuing the task loop"); - return ControlFlow::Continue(()); - } - state => { - debug!("Not running the task loop, tenant is not active: {state:?}"); - continue; - } + if matches!(e, crate::tenant::GcError::TimelineCancelled) { + // Timeline was cancelled during gc. We might either be in an event + // that affects the entire tenant (tenant deletion, pageserver shutdown), + // or in one that affects the timeline only (timeline deletion). + // Therefore, don't exit the loop. + info!("Gc failed {error_run} times, retrying in {wait_duration:?}: {e:?}"); + } else { + error!("Gc failed {error_run} times, retrying in {wait_duration:?}: {e:?}"); } - } - Err(_sender_dropped_error) => { - return ControlFlow::Break(()); + + sleep_duration = wait_duration; } } + }; + + if tokio::time::timeout(sleep_duration, cancel.cancelled()) + .await + .is_ok() + { + break; + } + } +} + +/// Ingest housekeeping's main loop. +async fn ingest_housekeeping_loop(tenant: Arc, cancel: CancellationToken) { + let mut last_throttle_flag_reset_at = Instant::now(); + loop { + if wait_for_active_tenant(&tenant, &cancel).await.is_break() { + return; + } + + // We run ingest housekeeping with the same frequency as compaction: it is not worth + // having a distinct setting. But we don't run it in the same task, because compaction + // blocks on acquiring the background job semaphore. 
+ let mut period = tenant.get_compaction_period(); + + // If compaction period is set to zero (to disable it), then we will use a reasonable default + if period == Duration::ZERO { + period = humantime::Duration::from_str(DEFAULT_COMPACTION_PERIOD) + .unwrap() + .into() + } + + // Always sleep first: we do not need to do ingest housekeeping early in the lifetime of + // a tenant, since it won't have started writing any ephemeral files yet. Jitter the + // period by ±5%. + let Ok(period) = sleep_jitter(period, period * 5 / 100, &cancel).await else { + break; + }; + + let iteration = Iteration { + started_at: Instant::now(), + period, + kind: BackgroundLoopKind::IngestHouseKeeping, + }; + iteration.run(tenant.ingest_housekeeping()).await; + + // TODO: rename the background loop kind to something more generic, like, tenant housekeeping. + // Or just spawn another background loop for this throttle, it's not like it's super costly. + info_span!(parent: None, "pagestream_throttle", tenant_id=%tenant.tenant_shard_id, shard_id=%tenant.tenant_shard_id.shard_slug()).in_scope(|| { + let now = Instant::now(); + let prev = std::mem::replace(&mut last_throttle_flag_reset_at, now); + let Stats { count_accounted_start, count_accounted_finish, count_throttled, sum_throttled_usecs} = tenant.pagestream_throttle.reset_stats(); + if count_throttled == 0 { + return; + } + let allowed_rps = tenant.pagestream_throttle.steady_rps(); + let delta = now - prev; + info!( + n_seconds=%format_args!("{:.3}", delta.as_secs_f64()), + count_accounted = count_accounted_finish, // don't break existing log scraping + count_throttled, + sum_throttled_usecs, + count_accounted_start, // log after pre-existing fields to not break existing log scraping + allowed_rps=%format_args!("{allowed_rps:.0}"), + "shard was throttled in the last n_seconds" + ); + }); + } +} + +/// Waits until the tenant becomes active, or returns `ControlFlow::Break()` to shut down. +async fn wait_for_active_tenant( + tenant: &Arc, + cancel: &CancellationToken, +) -> ControlFlow<()> { + if tenant.current_state() == TenantState::Active { + return ControlFlow::Continue(()); + } + + let mut update_rx = tenant.subscribe_for_state_updates(); + loop { + tokio::select! { + _ = cancel.cancelled() => return ControlFlow::Break(()), + result = update_rx.changed() => if result.is_err() { + return ControlFlow::Break(()); + } + } + + match &*update_rx.borrow() { + TenantState::Active => { + debug!("Tenant state changed to active, continuing the task loop"); + return ControlFlow::Continue(()); + } + state => debug!("Not running the task loop, tenant is not active: {state:?}"), } } } @@ -571,26 +521,41 @@ async fn wait_for_active_tenant(tenant: &Arc) -> ControlFlow<()> { #[error("cancelled")] pub(crate) struct Cancelled; -/// Provide a random delay for background task initialization. +/// Sleeps for a random interval up to the given max value. /// /// This delay prevents a thundering herd of background tasks and will likely keep them running on /// different periods for more stable load. 
-pub(crate) async fn random_init_delay( - period: Duration, +pub(crate) async fn sleep_random( + max: Duration, cancel: &CancellationToken, -) -> Result<(), Cancelled> { - if period == Duration::ZERO { - return Ok(()); - } +) -> Result { + sleep_random_range(Duration::ZERO..=max, cancel).await +} - let d = { - let mut rng = rand::thread_rng(); - rng.gen_range(Duration::ZERO..=period) - }; - match tokio::time::timeout(d, cancel.cancelled()).await { - Ok(_) => Err(Cancelled), - Err(_) => Ok(()), +/// Sleeps for a random interval in the given range. Returns the duration. +pub(crate) async fn sleep_random_range( + interval: RangeInclusive, + cancel: &CancellationToken, +) -> Result { + let delay = rand::thread_rng().gen_range(interval); + if delay == Duration::ZERO { + return Ok(delay); } + tokio::select! { + _ = cancel.cancelled() => Err(Cancelled), + _ = tokio::time::sleep(delay) => Ok(delay), + } +} + +/// Sleeps for an interval with a random jitter. +pub(crate) async fn sleep_jitter( + duration: Duration, + jitter: Duration, + cancel: &CancellationToken, +) -> Result { + let from = duration.saturating_sub(jitter); + let to = duration.saturating_add(jitter); + sleep_random_range(from..=to, cancel).await } struct Iteration { @@ -606,42 +571,25 @@ struct IterationResult { impl Iteration { #[instrument(skip_all)] - pub(crate) async fn run(self, fut: Fut) -> IterationResult - where - Fut: std::future::Future, - { - let Self { - started_at, - period, - kind, - } = self; - - let mut fut = std::pin::pin!(fut); + pub(crate) async fn run, O>(self, fut: F) -> IterationResult { + let mut fut = pin!(fut); // Wrap `fut` into a future that logs a message every `period` so that we get a // very obvious breadcrumb in the logs _while_ a slow iteration is happening. - let liveness_logger = async move { - loop { - match tokio::time::timeout(period, &mut fut).await { - Ok(x) => return x, - Err(_) => { - // info level as per the same rationale why warn_when_period_overrun is info - // => https://github.com/neondatabase/neon/pull/5724 - info!("still running"); - } - } + let output = loop { + match tokio::time::timeout(self.period, &mut fut).await { + Ok(r) => break r, + Err(_) => info!("still running"), } }; - - let output = liveness_logger.await; - - let elapsed = started_at.elapsed(); - warn_when_period_overrun(elapsed, period, kind); + let elapsed = self.started_at.elapsed(); + warn_when_period_overrun(elapsed, self.period, self.kind); IterationResult { output, elapsed } } } -/// Attention: the `task` and `period` beocme labels of a pageserver-wide prometheus metric. + +// NB: the `task` and `period` are used for metrics labels. 
pub(crate) fn warn_when_period_overrun( elapsed: Duration, period: Duration, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 2be6fc1e59..f1843b4e96 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1718,10 +1718,10 @@ impl Timeline { let prepare = async move { let guard = self.compaction_lock.lock().await; - let permit = super::tasks::concurrent_background_tasks_rate_limit_permit( - ctx, + let permit = super::tasks::acquire_concurrency_permit( BackgroundLoopKind::Compaction, self.conf.use_compaction_semaphore, + ctx, ) .await; @@ -3057,10 +3057,10 @@ impl Timeline { let self_ref = &self; let skip_concurrency_limiter = &skip_concurrency_limiter; async move { - let wait_for_permit = super::tasks::concurrent_background_tasks_rate_limit_permit( - background_ctx, + let wait_for_permit = super::tasks::acquire_concurrency_permit( BackgroundLoopKind::InitialLogicalSizeCalculation, false, + background_ctx, ); use crate::metrics::initial_logical_size::StartCircumstances; diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 985329136e..42e5f1496d 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -32,7 +32,7 @@ use crate::{ tenant::{ size::CalculateSyntheticSizeError, storage_layer::LayerVisibilityHint, - tasks::{BackgroundLoopKind, BackgroundLoopSemaphorePermit}, + tasks::{sleep_random, BackgroundLoopKind, BackgroundLoopSemaphorePermit}, timeline::EvictionError, LogicalSizeCalculationCause, Tenant, }, @@ -83,8 +83,6 @@ impl Timeline { #[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))] async fn eviction_task(self: Arc, tenant: Arc) { - use crate::tenant::tasks::random_init_delay; - // acquire the gate guard only once within a useful span let Ok(guard) = self.gate.enter() else { return; @@ -97,7 +95,7 @@ impl Timeline { EvictionPolicy::OnlyImitiate(lat) => lat.period, EvictionPolicy::NoEviction => Duration::from_secs(10), }; - if random_init_delay(period, &self.cancel).await.is_err() { + if sleep_random(period, &self.cancel).await.is_err() { return; } } @@ -334,10 +332,10 @@ impl Timeline { cancel: &CancellationToken, ctx: &RequestContext, ) -> ControlFlow<(), BackgroundLoopSemaphorePermit<'static>> { - let acquire_permit = crate::tenant::tasks::concurrent_background_tasks_rate_limit_permit( - ctx, + let acquire_permit = crate::tenant::tasks::acquire_concurrency_permit( BackgroundLoopKind::Eviction, false, + ctx, ); tokio::select! { From ac55e2dbe5f0a270ff99ee2bea5425d6eaddfaa9 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Sat, 8 Feb 2025 13:42:55 +0100 Subject: [PATCH 413/583] pageserver: improve tenant housekeeping task (#10725) # Problem walredo shutdown is done in the compaction task. Let's move it to tenant housekeeping. # Summary of changes * Rename "ingest housekeeping" to "tenant housekeeping". * Move walredo shutdown into tenant housekeeping. * Add a constant `WALREDO_IDLE_TIMEOUT` set to 3 minutes (previously 10x compaction threshold). 
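To make the move concrete, here is a minimal, self-contained sketch of the resulting housekeeping pass. The `Tenant`/walredo types below are illustrative stand-ins rather than the pageserver's real ones; only `maybe_quiesce` and the new `WALREDO_IDLE_TIMEOUT` constant come from this patch:

```rust
use std::time::{Duration, Instant};

// Stand-in for the pageserver's walredo manager; only the quiesce shape matters here.
struct WalRedoManager {
    last_used: Instant,
}

impl WalRedoManager {
    /// Shut the walredo process down once it has been idle for `idle_timeout`.
    fn maybe_quiesce(&self, idle_timeout: Duration) {
        if self.last_used.elapsed() >= idle_timeout {
            println!("quiescing idle walredo process");
        }
    }
}

// Stand-in for `Tenant`.
struct Tenant {
    walredo_mgr: Option<WalRedoManager>,
}

impl Tenant {
    /// Periodic housekeeping: walredo shutdown now happens here rather than in
    /// the compaction loop, so it is no longer delayed by compaction back-off.
    fn housekeeping(&self) {
        const WALREDO_IDLE_TIMEOUT: Duration = Duration::from_secs(180);
        if let Some(walredo_mgr) = &self.walredo_mgr {
            walredo_mgr.maybe_quiesce(WALREDO_IDLE_TIMEOUT);
        }
    }
}

fn main() {
    let tenant = Tenant {
        walredo_mgr: Some(WalRedoManager {
            last_used: Instant::now(),
        }),
    };
    // Freshly used in this toy run, so nothing is quiesced.
    tenant.housekeeping();
}
```

The rationale for the relocation (per the TODO removed from the compaction loop) is that housekeeping runs on its own jittered period, so walredo quiesce gets an upper-bound guarantee instead of being postponed by compaction error back-off.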
--- pageserver/src/task_mgr.rs | 4 +-- pageserver/src/tenant.rs | 43 +++++++++++++--------------- pageserver/src/tenant/tasks.rs | 51 ++++++++++++---------------------- 3 files changed, 39 insertions(+), 59 deletions(-) diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 622738022a..cc93a06ccd 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -328,8 +328,8 @@ pub enum TaskKind { // Eviction. One per timeline. Eviction, - // Ingest housekeeping (flushing ephemeral layers on time threshold or disk pressure) - IngestHousekeeping, + // Tenant housekeeping (flush idle ephemeral layers, shut down idle walredo, etc.). + TenantHousekeeping, /// See [`crate::disk_usage_eviction_task`]. DiskUsageEviction, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 3c6996dd51..d84cd4d278 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -20,6 +20,7 @@ use chrono::NaiveDateTime; use enumset::EnumSet; use futures::stream::FuturesUnordered; use futures::StreamExt; +use itertools::Itertools as _; use pageserver_api::models; use pageserver_api::models::CompactInfoResponse; use pageserver_api::models::LsnLease; @@ -3088,32 +3089,28 @@ impl Tenant { Ok(rx) } - // Call through to all timelines to freeze ephemeral layers if needed. Usually - // this happens during ingest: this background housekeeping is for freezing layers - // that are open but haven't been written to for some time. - async fn ingest_housekeeping(&self) { - // Scan through the hashmap and collect a list of all the timelines, - // while holding the lock. Then drop the lock and actually perform the - // compactions. We don't want to block everything else while the - // compaction runs. - let timelines = { - self.timelines - .lock() - .unwrap() - .values() - .filter_map(|timeline| { - if timeline.is_active() { - Some(timeline.clone()) - } else { - None - } - }) - .collect::>() - }; + /// Performs periodic housekeeping, via the tenant housekeeping background task. + async fn housekeeping(&self) { + // Call through to all timelines to freeze ephemeral layers as needed. This usually happens + // during ingest, but we don't want idle timelines to hold open layers for too long. + let timelines = self + .timelines + .lock() + .unwrap() + .values() + .filter(|tli| tli.is_active()) + .cloned() + .collect_vec(); - for timeline in &timelines { + for timeline in timelines { timeline.maybe_freeze_ephemeral_layer().await; } + + // Shut down walredo if idle. 
+ const WALREDO_IDLE_TIMEOUT: Duration = Duration::from_secs(180); + if let Some(ref walredo_mgr) = self.walredo_mgr { + walredo_mgr.maybe_quiesce(WALREDO_IDLE_TIMEOUT); + } } pub fn timeline_has_no_attached_children(&self, timeline_id: TimelineId) -> bool { diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index a45eb002bd..1a6311dd9c 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -4,7 +4,6 @@ use std::cmp::max; use std::future::Future; use std::ops::{ControlFlow, RangeInclusive}; use std::pin::pin; -use std::str::FromStr; use std::sync::{Arc, Mutex}; use std::time::{Duration, Instant}; @@ -75,7 +74,7 @@ pub(crate) enum BackgroundLoopKind { Compaction, Gc, Eviction, - IngestHouseKeeping, + TenantHouseKeeping, ConsumptionMetricsCollectMetrics, ConsumptionMetricsSyntheticSizeWorker, InitialLogicalSizeCalculation, @@ -186,10 +185,10 @@ pub fn start_background_loops(tenant: &Arc, can_start: Option<&Barrier>) task_mgr::spawn( BACKGROUND_RUNTIME.handle(), - TaskKind::IngestHousekeeping, + TaskKind::TenantHousekeeping, tenant_shard_id, None, - &format!("ingest housekeeping for tenant {tenant_shard_id}"), + &format!("housekeeping for tenant {tenant_shard_id}"), { let tenant = Arc::clone(tenant); let can_start = can_start.cloned(); @@ -201,8 +200,8 @@ pub fn start_background_loops(tenant: &Arc, can_start: Option<&Barrier>) }; TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); defer!(TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc()); - ingest_housekeeping_loop(tenant, cancel) - .instrument(info_span!("ingest_housekeeping_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug())) + tenant_housekeeping_loop(tenant, cancel) + .instrument(info_span!("tenant_housekeeping_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug())) .await; Ok(()) } @@ -281,14 +280,6 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { ); }; - // Perhaps we did no work and the walredo process has been idle for some time: - // give it a chance to shut down to avoid leaving walredo process running indefinitely. - // TODO: move this to a separate task (housekeeping loop) that isn't affected by the back-off, - // so we get some upper bound guarantee on when walredo quiesce / this throttling reporting here happens. - if let Some(walredo_mgr) = &tenant.walredo_mgr { - walredo_mgr.maybe_quiesce(period * 10); - } - // Sleep if tokio::time::timeout(sleep_duration, cancel.cancelled()) .await @@ -431,42 +422,34 @@ async fn gc_loop(tenant: Arc, cancel: CancellationToken) { } } -/// Ingest housekeeping's main loop. -async fn ingest_housekeeping_loop(tenant: Arc, cancel: CancellationToken) { +/// Tenant housekeeping's main loop. +async fn tenant_housekeeping_loop(tenant: Arc, cancel: CancellationToken) { let mut last_throttle_flag_reset_at = Instant::now(); loop { if wait_for_active_tenant(&tenant, &cancel).await.is_break() { return; } - // We run ingest housekeeping with the same frequency as compaction: it is not worth - // having a distinct setting. But we don't run it in the same task, because compaction - // blocks on acquiring the background job semaphore. - let mut period = tenant.get_compaction_period(); + // Use the same period as compaction; it's not worth a separate setting. But if it's set to + // zero (to disable compaction), then use a reasonable default. Jitter it by 5%. 
+ let period = match tenant.get_compaction_period() { + Duration::ZERO => humantime::parse_duration(DEFAULT_COMPACTION_PERIOD).unwrap(), + period => period, + }; - // If compaction period is set to zero (to disable it), then we will use a reasonable default - if period == Duration::ZERO { - period = humantime::Duration::from_str(DEFAULT_COMPACTION_PERIOD) - .unwrap() - .into() - } - - // Always sleep first: we do not need to do ingest housekeeping early in the lifetime of - // a tenant, since it won't have started writing any ephemeral files yet. Jitter the - // period by ±5%. let Ok(period) = sleep_jitter(period, period * 5 / 100, &cancel).await else { break; }; + // Do tenant housekeeping. let iteration = Iteration { started_at: Instant::now(), period, - kind: BackgroundLoopKind::IngestHouseKeeping, + kind: BackgroundLoopKind::TenantHouseKeeping, }; - iteration.run(tenant.ingest_housekeeping()).await; + iteration.run(tenant.housekeeping()).await; - // TODO: rename the background loop kind to something more generic, like, tenant housekeeping. - // Or just spawn another background loop for this throttle, it's not like it's super costly. + // Log any getpage throttling. info_span!(parent: None, "pagestream_throttle", tenant_id=%tenant.tenant_shard_id, shard_id=%tenant.tenant_shard_id.shard_slug()).in_scope(|| { let now = Instant::now(); let prev = std::mem::replace(&mut last_throttle_flag_reset_at, now); From d204d51faf065cab66b44831a5a430de96171bb3 Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Mon, 10 Feb 2025 10:56:46 +0100 Subject: [PATCH 414/583] Fix the upgrade test for pg_jwt by adding the database name (#10738) ## Problem The upgrade test for pg_jwt does not work correctly. ## Summary of changes The script for the upgrade test is modified to use the database `contrib_regression`. 
--- docker-compose/ext-src/pgjwt-src/test-upgrade.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker-compose/ext-src/pgjwt-src/test-upgrade.sh b/docker-compose/ext-src/pgjwt-src/test-upgrade.sh index b7158d2340..efb8bfc184 100755 --- a/docker-compose/ext-src/pgjwt-src/test-upgrade.sh +++ b/docker-compose/ext-src/pgjwt-src/test-upgrade.sh @@ -2,4 +2,4 @@ set -ex cd "$(dirname ${0})" patch -p1 Date: Mon, 10 Feb 2025 12:51:53 +0200 Subject: [PATCH 415/583] impr(proxy): Set TTL for Redis cancellation map keys (#10671) Use expire() op to set TTL for Redis cancellation key --- proxy/src/cancellation.rs | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index 4d919f374a..e84f1676e2 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -69,17 +69,35 @@ pub async fn handle_cancel_messages( value, resp_tx, _guard, - expire: _, + expire, } => { + let res = client.hset(&key, field, value).await; if let Some(resp_tx) = resp_tx { - resp_tx - .send(client.hset(key, field, value).await) - .inspect_err(|e| { - tracing::debug!("failed to send StoreCancelKey response: {:?}", e); - }) - .ok(); + if res.is_ok() { + resp_tx + .send(client.expire(key, expire).await) + .inspect_err(|e| { + tracing::debug!( + "failed to send StoreCancelKey response: {:?}", + e + ); + }) + .ok(); + } else { + resp_tx + .send(res) + .inspect_err(|e| { + tracing::debug!( + "failed to send StoreCancelKey response: {:?}", + e + ); + }) + .ok(); + } + } else if res.is_ok() { + drop(client.expire(key, expire).await); } else { - drop(client.hset(key, field, value).await); + tracing::warn!("failed to store cancel key: {:?}", res); } } CancelKeyOp::GetCancelData { @@ -436,7 +454,7 @@ impl Session { &self.key } - // Send the store key op to the cancellation handler + // Send the store key op to the cancellation handler and set TTL for the key pub(crate) async fn write_cancel_key( &self, cancel_closure: CancelClosure, From 2f36bdb218f60e5e2ca85d30d9c4fc094579e64f Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 10 Feb 2025 12:29:39 +0000 Subject: [PATCH 416/583] CI(build-neon): fix duplicated builds (#10731) ## Problem Parameterising `build-neon` job with `test-cfg` makes it to build exactly the same thing several times. 
See - https://github.com/neondatabase/neon/blob/874accd6ede7231e5e4e1f562a83862e2286f6cd/.github/workflows/_build-and-test-locally.yml#L51-L52 - https://github.com/neondatabase/neon/actions/runs/13215068271/job/36893373038 ## Summary of changes - Extract `sanitizers` to a separate input from `test-cfg` and set it separately - Don't parametrise `build-neon` with `test-cfg` --- .github/workflows/_build-and-test-locally.yml | 21 +++++++++++-------- .../build_and_test_with_sanitizers.yml | 3 ++- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index a963452523..86a791497c 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -20,9 +20,14 @@ on: required: true type: string test-cfg: - description: 'a json object of postgres versions and lfc/sanitizers states to build and run regression tests on' + description: 'a json object of postgres versions and lfc states to run regression tests on' required: true type: string + sanitizers: + description: 'enabled or disabled' + required: false + default: 'disabled' + type: string defaults: run: @@ -48,8 +53,6 @@ jobs: # io_uring will account the memory of the CQ and SQ as locked. # More details: https://github.com/neondatabase/neon/issues/6373#issuecomment-1905814391 options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 - strategy: - matrix: ${{ fromJSON(format('{{"include":{0}}}', inputs.test-cfg)) }} env: BUILD_TYPE: ${{ inputs.build-type }} GIT_VERSION: ${{ github.event.pull_request.head.sha || github.sha }} @@ -89,7 +92,7 @@ jobs: - name: Set env variables env: ARCH: ${{ inputs.arch }} - SANITIZERS: ${{ matrix.sanitizers }} + SANITIZERS: ${{ inputs.sanitizers }} run: | CARGO_FEATURES="--features testing" if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' ]]; then @@ -167,7 +170,7 @@ jobs: - name: Run cargo build env: - WITH_TESTS: ${{ matrix.sanitizers != 'enabled' && '--tests' || '' }} + WITH_TESTS: ${{ inputs.sanitizers != 'enabled' && '--tests' || '' }} run: | export ASAN_OPTIONS=detect_leaks=0 ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins ${WITH_TESTS} @@ -177,7 +180,7 @@ jobs: - name: Install rust binaries env: ARCH: ${{ inputs.arch }} - SANITIZERS: ${{ matrix.sanitizers }} + SANITIZERS: ${{ inputs.sanitizers }} run: | # Install target binaries mkdir -p /tmp/neon/bin/ @@ -225,7 +228,7 @@ jobs: role-duration-seconds: 18000 # 5 hours - name: Run rust tests - if: ${{ matrix.sanitizers != 'enabled' }} + if: ${{ inputs.sanitizers != 'enabled' }} env: NEXTEST_RETRIES: 3 run: | @@ -334,7 +337,7 @@ jobs: - name: Pytest regression tests continue-on-error: ${{ matrix.lfc_state == 'with-lfc' && inputs.build-type == 'debug' }} uses: ./.github/actions/run-python-test-set - timeout-minutes: ${{ matrix.sanitizers != 'enabled' && 60 || 180 }} + timeout-minutes: ${{ inputs.sanitizers != 'enabled' && 60 || 180 }} with: build_type: ${{ inputs.build-type }} test_selection: regress @@ -352,7 +355,7 @@ jobs: PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task USE_LFC: ${{ matrix.lfc_state == 'with-lfc' && 'true' || 'false' }} - SANITIZERS: ${{ matrix.sanitizers }} + SANITIZERS: ${{ inputs.sanitizers }} # Temporary disable this step until we figure out why it's so flaky # Ref https://github.com/neondatabase/neon/issues/4540 diff --git a/.github/workflows/build_and_test_with_sanitizers.yml 
b/.github/workflows/build_and_test_with_sanitizers.yml index cf0de3f8dc..2bc938509f 100644 --- a/.github/workflows/build_and_test_with_sanitizers.yml +++ b/.github/workflows/build_and_test_with_sanitizers.yml @@ -74,7 +74,8 @@ jobs: build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm build-tag: ${{ needs.tag.outputs.build-tag }} build-type: ${{ matrix.build-type }} - test-cfg: '[{"pg_version":"v17", "sanitizers": "enabled"}]' + test-cfg: '[{"pg_version":"v17"}]' + sanitizers: enabled secrets: inherit From 443c8d0b4bfead651ebbbade5dcb49c6cba00ee6 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Mon, 10 Feb 2025 09:25:48 -0500 Subject: [PATCH 417/583] feat(pageserver): repartition on L0-L1 boundary (#10548) ## Problem Reduce the read amplification when doing `repartition`. ## Summary of changes Compute the L0-L1 boundary LSN and do repartition here. --------- Signed-off-by: Alex Chi Z --- pageserver/src/tenant.rs | 18 +- pageserver/src/tenant/timeline/compaction.rs | 157 ++++++++++-------- .../regress/test_layers_from_future.py | 3 + 3 files changed, 108 insertions(+), 70 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index d84cd4d278..79d61ec389 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -7698,6 +7698,18 @@ mod tests { } tline.freeze_and_flush().await?; + // Force layers to L1 + tline + .compact( + &cancel, + { + let mut flags = EnumSet::new(); + flags.insert(CompactFlags::ForceL0Compaction); + flags + }, + &ctx, + ) + .await?; if iter % 5 == 0 { let (_, before_delta_file_accessed) = @@ -7710,6 +7722,7 @@ mod tests { let mut flags = EnumSet::new(); flags.insert(CompactFlags::ForceImageLayerCreation); flags.insert(CompactFlags::ForceRepartition); + flags.insert(CompactFlags::ForceL0Compaction); flags }, &ctx, @@ -8156,6 +8169,8 @@ mod tests { let cancel = CancellationToken::new(); + // Image layer creation happens on the disk_consistent_lsn so we need to force set it now. + tline.force_set_disk_consistent_lsn(Lsn(0x40)); tline .compact( &cancel, @@ -8169,8 +8184,7 @@ mod tests { ) .await .unwrap(); - - // Image layers are created at last_record_lsn + // Image layers are created at repartition LSN let images = tline .inspect_image_layers(Lsn(0x40), &ctx, io_concurrency.clone()) .await diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index b9f4954453..19f9cbc665 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -687,6 +687,20 @@ impl Timeline { // Define partitioning schema if needed + let l0_l1_boundary_lsn = { + // We do the repartition on the L0-L1 boundary. All data below the boundary + // are compacted by L0 with low read amplification, thus making the `repartition` + // function run fast. + let guard = self.layers.read().await; + let l0_min_lsn = guard + .layer_map()? + .level0_deltas() + .iter() + .map(|l| l.get_lsn_range().start) + .min() + .unwrap_or(self.get_disk_consistent_lsn()); + l0_min_lsn.max(self.get_ancestor_lsn()) + }; // 1. L0 Compact let l0_compaction_outcome = { let timer = self.metrics.compact_time_histo.start_timer(); @@ -709,80 +723,87 @@ impl Timeline { return Ok(CompactionOutcome::Pending); } - // 2. 
Repartition and create image layers if necessary - let partition_count = match self - .repartition( - self.get_last_record_lsn(), // TODO: use L0-L1 boundary - self.get_compaction_target_size(), - options.flags, - ctx, - ) - .await - { - Ok(((dense_partitioning, sparse_partitioning), lsn)) => { - // Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them - let image_ctx = RequestContextBuilder::extend(ctx) - .access_stats_behavior(AccessStatsBehavior::Skip) - .build(); + if l0_l1_boundary_lsn < self.partitioning.read().1 { + // We never go backwards when repartition and create image layers. + info!("skipping image layer generation because repartition LSN is greater than L0-L1 boundary LSN."); + } else { + // 2. Repartition and create image layers if necessary + match self + .repartition( + l0_l1_boundary_lsn, + self.get_compaction_target_size(), + options.flags, + ctx, + ) + .await + { + Ok(((dense_partitioning, sparse_partitioning), lsn)) => { + // Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them + let image_ctx = RequestContextBuilder::extend(ctx) + .access_stats_behavior(AccessStatsBehavior::Skip) + .build(); - let mut partitioning = dense_partitioning; - partitioning - .parts - .extend(sparse_partitioning.into_dense().parts); + let mut partitioning = dense_partitioning; + partitioning + .parts + .extend(sparse_partitioning.into_dense().parts); - // 3. Create new image layers for partitions that have been modified "enough". - let (image_layers, outcome) = self - .create_image_layers( - &partitioning, - lsn, - if options - .flags - .contains(CompactFlags::ForceImageLayerCreation) - { - ImageLayerCreationMode::Force - } else { - ImageLayerCreationMode::Try - }, - &image_ctx, - self.last_image_layer_creation_status - .load() - .as_ref() - .clone(), - ) - .await - .inspect_err(|err| { - if let CreateImageLayersError::GetVectoredError( - GetVectoredError::MissingKey(_), - ) = err - { - critical!("missing key during compaction: {err:?}"); - } - })?; + // 3. Create new image layers for partitions that have been modified "enough". + let (image_layers, outcome) = self + .create_image_layers( + &partitioning, + lsn, + if options + .flags + .contains(CompactFlags::ForceImageLayerCreation) + { + ImageLayerCreationMode::Force + } else { + ImageLayerCreationMode::Try + }, + &image_ctx, + self.last_image_layer_creation_status + .load() + .as_ref() + .clone(), + ) + .await + .inspect_err(|err| { + if let CreateImageLayersError::GetVectoredError( + GetVectoredError::MissingKey(_), + ) = err + { + critical!("missing key during compaction: {err:?}"); + } + })?; - self.last_image_layer_creation_status - .store(Arc::new(outcome.clone())); + self.last_image_layer_creation_status + .store(Arc::new(outcome.clone())); - self.upload_new_image_layers(image_layers)?; - if let LastImageLayerCreationStatus::Incomplete { .. } = outcome { - // Yield and do not do any other kind of compaction. - info!("skipping shard ancestor compaction due to pending image layer generation tasks (preempted by L0 compaction)."); - return Ok(CompactionOutcome::Pending); + self.upload_new_image_layers(image_layers)?; + if let LastImageLayerCreationStatus::Incomplete { .. } = outcome { + // Yield and do not do any other kind of compaction. 
+ info!("skipping shard ancestor compaction due to pending image layer generation tasks (preempted by L0 compaction)."); + return Ok(CompactionOutcome::Pending); + } } - partitioning.parts.len() - } - Err(err) => { - // no partitioning? This is normal, if the timeline was just created - // as an empty timeline. Also in unit tests, when we use the timeline - // as a simple key-value store, ignoring the datadir layout. Log the - // error but continue. - // - // Suppress error when it's due to cancellation - if !self.cancel.is_cancelled() && !err.is_cancelled() { - tracing::error!("could not compact, repartitioning keyspace failed: {err:?}"); + Err(err) => { + // no partitioning? This is normal, if the timeline was just created + // as an empty timeline. Also in unit tests, when we use the timeline + // as a simple key-value store, ignoring the datadir layout. Log the + // error but continue. + // + // Suppress error when it's due to cancellation + if !self.cancel.is_cancelled() && !err.is_cancelled() { + tracing::error!( + "could not compact, repartitioning keyspace failed: {err:?}" + ); + } } - 1 - } - }; + }; + } + + let partition_count = self.partitioning.read().0 .0.parts.len(); // 4. Shard ancestor compaction diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py index 872d3dc4cf..3ac4ed1a3e 100644 --- a/test_runner/regress/test_layers_from_future.py +++ b/test_runner/regress/test_layers_from_future.py @@ -20,6 +20,9 @@ from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind from fixtures.utils import query_scalar, wait_until +@pytest.mark.skip( + reason="We won't create future layers any more after https://github.com/neondatabase/neon/pull/10548" +) @pytest.mark.parametrize( "attach_mode", ["default_generation", "same_generation"], From b37f52fdf13486ae768630bd9b03880535832f2c Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Mon, 10 Feb 2025 09:25:56 -0500 Subject: [PATCH 418/583] feat(pageserver): dump read path on missing key error (#10528) ## Problem helps investigate https://github.com/neondatabase/neon/issues/10482 ## Summary of changes In debug mode and testing mode, we will record all files visited by a read operation, and print it out when it errors. 
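As a rough, self-contained illustration of the recording pattern (the types and layer names below are simplified stand-ins; the real `ReadPath` tracks layer identifiers, keyspaces and LSN ranges, and is added in the diff below):

```rust
use std::fmt;

// Simplified stand-in: one string per visited layer is enough to show the shape.
struct ReadPath {
    visited: Vec<String>,
}

impl ReadPath {
    fn new() -> Self {
        Self { visited: Vec::new() }
    }

    // Called once for every layer the read descends into.
    fn record_layer_visit(&mut self, layer: &str) {
        self.visited.push(layer.to_string());
    }
}

impl fmt::Display for ReadPath {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        writeln!(f, "Read path:")?;
        for (idx, layer) in self.visited.iter().enumerate() {
            writeln!(f, "{idx}: {layer}")?;
        }
        Ok(())
    }
}

fn main() {
    let mut read_path = ReadPath::new();
    // Illustrative layer names only.
    read_path.record_layer_visit("in-mem 0/1696838..0/16A1590");
    read_path.record_layer_visit("image layer at 0/1600000");
    // On a missing-key error the accumulated path is appended to the error message:
    eprintln!("could not find data for key ...\n{read_path}");
}
```

The recorder is only constructed when the new `enable_read_path_debugging` option is set, which defaults to on in test/testing builds and stays off otherwise.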
--------- Signed-off-by: Alex Chi Z --- libs/pageserver_api/src/config.rs | 6 ++ pageserver/src/config.rs | 6 ++ pageserver/src/tenant/storage_layer.rs | 5 +- pageserver/src/tenant/timeline.rs | 84 ++++++++++++++++++++++++++ 4 files changed, 100 insertions(+), 1 deletion(-) diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index b806bd391c..ae3e0385cf 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -122,6 +122,7 @@ pub struct ConfigToml { pub wal_receiver_protocol: PostgresClientProtocol, pub page_service_pipelining: PageServicePipeliningConfig, pub get_vectored_concurrent_io: GetVectoredConcurrentIo, + pub enable_read_path_debugging: Option, } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -512,6 +513,11 @@ impl Default for ConfigToml { } else { GetVectoredConcurrentIo::SidecarTask }, + enable_read_path_debugging: if cfg!(test) || cfg!(feature = "testing") { + Some(true) + } else { + None + }, } } } diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 3c86b73933..3dd519de75 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -197,6 +197,10 @@ pub struct PageServerConf { pub page_service_pipelining: pageserver_api::config::PageServicePipeliningConfig, pub get_vectored_concurrent_io: pageserver_api::config::GetVectoredConcurrentIo, + + /// Enable read path debugging. If enabled, read key errors will print a backtrace of the layer + /// files read. + pub enable_read_path_debugging: bool, } /// Token for authentication to safekeepers @@ -360,6 +364,7 @@ impl PageServerConf { wal_receiver_protocol, page_service_pipelining, get_vectored_concurrent_io, + enable_read_path_debugging, } = config_toml; let mut conf = PageServerConf { @@ -446,6 +451,7 @@ impl PageServerConf { .unwrap_or_default(), virtual_file_io_mode: virtual_file_io_mode.unwrap_or(virtual_file::IoMode::preferred()), no_sync: no_sync.unwrap_or(false), + enable_read_path_debugging: enable_read_path_debugging.unwrap_or(false), }; // ------------------------------------------------------------ diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 3800852ccc..f9f843ef6b 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -44,7 +44,7 @@ pub(crate) use layer::{EvictionError, Layer, ResidentLayer}; use self::inmemory_layer::InMemoryLayerFileId; -use super::timeline::GetVectoredError; +use super::timeline::{GetVectoredError, ReadPath}; use super::PageReconstructError; pub fn range_overlaps(a: &Range, b: &Range) -> bool @@ -262,6 +262,8 @@ pub(crate) struct ValuesReconstructState { pub(crate) io_concurrency: IoConcurrency, num_active_ios: Arc, + + pub(crate) read_path: Option, } /// The level of IO concurrency to be used on the read path @@ -609,6 +611,7 @@ impl ValuesReconstructState { delta_layers_visited: 0, io_concurrency, num_active_ios: Arc::new(AtomicUsize::new(0)), + read_path: None, } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index f1843b4e96..6a7781038f 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -626,6 +626,71 @@ impl From for GetVectoredError { } } +/// A layer identifier when used in the [`ReadPath`] structure. This enum is for observability purposes +/// only and not used by the "real read path". 
+pub enum ReadPathLayerId { + PersistentLayer(PersistentLayerKey), + InMemoryLayer(Range), +} + +impl std::fmt::Display for ReadPathLayerId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + ReadPathLayerId::PersistentLayer(key) => write!(f, "{}", key), + ReadPathLayerId::InMemoryLayer(range) => { + write!(f, "in-mem {}..{}", range.start, range.end) + } + } + } +} +pub struct ReadPath { + keyspace: KeySpace, + lsn: Lsn, + path: Vec<(ReadPathLayerId, KeySpace, Range)>, +} + +impl ReadPath { + pub fn new(keyspace: KeySpace, lsn: Lsn) -> Self { + Self { + keyspace, + lsn, + path: Vec::new(), + } + } + + pub fn record_layer_visit( + &mut self, + layer_to_read: &ReadableLayer, + keyspace_to_read: &KeySpace, + lsn_range: &Range, + ) { + let id = match layer_to_read { + ReadableLayer::PersistentLayer(layer) => { + ReadPathLayerId::PersistentLayer(layer.layer_desc().key()) + } + ReadableLayer::InMemoryLayer(layer) => { + ReadPathLayerId::InMemoryLayer(layer.get_lsn_range()) + } + }; + self.path + .push((id, keyspace_to_read.clone(), lsn_range.clone())); + } +} + +impl std::fmt::Display for ReadPath { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + writeln!(f, "Read path for {} at lsn {}:", self.keyspace, self.lsn)?; + for (idx, (layer_id, keyspace, lsn_range)) in self.path.iter().enumerate() { + writeln!( + f, + "{}: {} {}..{} {}", + idx, layer_id, lsn_range.start, lsn_range.end, keyspace + )?; + } + Ok(()) + } +} + #[derive(thiserror::Error)] pub struct MissingKeyError { key: Key, @@ -633,6 +698,8 @@ pub struct MissingKeyError { cont_lsn: Lsn, request_lsn: Lsn, ancestor_lsn: Option, + /// Debug information about the read path if there's an error + read_path: Option, backtrace: Option, } @@ -649,10 +716,15 @@ impl std::fmt::Display for MissingKeyError { "could not find data for key {} (shard {:?}) at LSN {}, request LSN {}", self.key, self.shard, self.cont_lsn, self.request_lsn )?; + if let Some(ref ancestor_lsn) = self.ancestor_lsn { write!(f, ", ancestor {}", ancestor_lsn)?; } + if let Some(ref read_path) = self.read_path { + write!(f, "\n{}", read_path)?; + } + if let Some(ref backtrace) = self.backtrace { write!(f, "\n{}", backtrace)?; } @@ -1069,6 +1141,7 @@ impl Timeline { request_lsn: lsn, ancestor_lsn: None, backtrace: None, + read_path: None, })), } } @@ -1195,6 +1268,13 @@ impl Timeline { reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, ) -> Result>, GetVectoredError> { + let read_path = if self.conf.enable_read_path_debugging { + Some(ReadPath::new(keyspace.clone(), lsn)) + } else { + None + }; + reconstruct_state.read_path = read_path; + let traversal_res: Result<(), _> = self .get_vectored_reconstruct_data(keyspace.clone(), lsn, reconstruct_state, ctx) .await; @@ -3504,6 +3584,7 @@ impl Timeline { request_lsn, ancestor_lsn: Some(timeline.ancestor_lsn), backtrace: None, + read_path: std::mem::take(&mut reconstruct_state.read_path), })); } @@ -3622,6 +3703,9 @@ impl Timeline { } if let Some((layer_to_read, keyspace_to_read, lsn_range)) = fringe.next_layer() { + if let Some(ref mut read_path) = reconstruct_state.read_path { + read_path.record_layer_visit(&layer_to_read, &keyspace_to_read, &lsn_range); + } let next_cont_lsn = lsn_range.start; layer_to_read .get_values_reconstruct_data( From 0cf01197518f2b65488d4538915ece4957a71f46 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 10 Feb 2025 17:48:03 +0200 Subject: [PATCH 419/583] Add --save_records option to pg_waldump (#10626) ## Problem 
Make it possible to dump WAL records in format recognised by walredo process. Intended usage: ``` pg_waldump -R 1663/5/16396 -B 771727 000000010000000100000034 --save-records=/tmp/walredo.records postgres --wal-redo < /tmp/walredo.records > /tmp/page.img ``` ## Summary of changes Related Postgres PRs: https://github.com/neondatabase/postgres/pull/575 https://github.com/neondatabase/postgres/pull/572 --------- Co-authored-by: Konstantin Knizhnik --- vendor/postgres-v16 | 2 +- vendor/postgres-v17 | 2 +- vendor/revisions.json | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 86d9ea96eb..13cf5d06c9 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 86d9ea96ebb9088eac62f57f1f5ace68e70e0d1c +Subproject commit 13cf5d06c98a8e9b0590ce6cdfd193a08d0a7792 diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index 8dfd5a7030..4c45d78ad5 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit 8dfd5a7030d3e8a98b60265ebe045788892ac7f3 +Subproject commit 4c45d78ad587e4bcb4a5a7ef6931b88c6a3d575d diff --git a/vendor/revisions.json b/vendor/revisions.json index efddaef46a..5f60e1d690 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,11 +1,11 @@ { "v17": [ "17.2", - "8dfd5a7030d3e8a98b60265ebe045788892ac7f3" + "4c45d78ad587e4bcb4a5a7ef6931b88c6a3d575d" ], "v16": [ "16.6", - "86d9ea96ebb9088eac62f57f1f5ace68e70e0d1c" + "13cf5d06c98a8e9b0590ce6cdfd193a08d0a7792" ], "v15": [ "15.10", From 73633e27edda6bf6bf7b3a7d829f1ebeee32adcc Mon Sep 17 00:00:00 2001 From: Ivan Efremov Date: Mon, 10 Feb 2025 18:06:13 +0200 Subject: [PATCH 420/583] fix(proxy): Log errors from the local proxy in auth-broker (#10659) Handle errors from local proxy by parsing HTTP response in auth broker code Closes [#19476](https://github.com/neondatabase/cloud/issues/19476) --- proxy/src/auth/backend/mod.rs | 4 +++ proxy/src/serverless/backend.rs | 4 +-- proxy/src/serverless/sql_over_http.rs | 52 +++++++++++++++++++++++++-- 3 files changed, 55 insertions(+), 5 deletions(-) diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs index 7ef096207a..dc595844c5 100644 --- a/proxy/src/auth/backend/mod.rs +++ b/proxy/src/auth/backend/mod.rs @@ -108,6 +108,10 @@ impl Backend<'_, T> { Self::Local(_) => panic!("Local backend has no API"), } } + + pub(crate) fn is_local_proxy(&self) -> bool { + matches!(self, Self::Local(_)) + } } impl<'a, T> Backend<'a, T> { diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 0fb4a8a6cc..edc2935618 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -400,9 +400,9 @@ fn create_random_jwk() -> (SigningKey, jose_jwk::Key) { pub(crate) enum HttpConnError { #[error("pooled connection closed at inconsistent state")] ConnectionClosedAbruptly(#[from] tokio::sync::watch::error::SendError), - #[error("could not connection to postgres in compute")] + #[error("could not connect to postgres in compute")] PostgresConnectionError(#[from] postgres_client::Error), - #[error("could not connection to local-proxy in compute")] + #[error("could not connect to local-proxy in compute")] LocalProxyConnectionError(#[from] LocalProxyConnError), #[error("could not parse JWT payload")] JwtPayloadError(serde_json::Error), diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 3e42787a09..8739ce49f9 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ 
b/proxy/src/serverless/sql_over_http.rs @@ -11,10 +11,12 @@ use http_body_util::{BodyExt, Full}; use hyper::body::Incoming; use hyper::http::{HeaderName, HeaderValue}; use hyper::{header, HeaderMap, Request, Response, StatusCode}; +use indexmap::IndexMap; use postgres_client::error::{DbError, ErrorPosition, SqlState}; use postgres_client::{GenericClient, IsolationLevel, NoTls, ReadyForQueryStatus, Transaction}; use pq_proto::StartupMessageParamsBuilder; use serde::Serialize; +use serde_json::value::RawValue; use serde_json::Value; use tokio::time::{self, Instant}; use tokio_util::sync::CancellationToken; @@ -249,6 +251,50 @@ pub(crate) async fn handle( let mut response = match result { Ok(r) => { ctx.set_success(); + + // Handling the error response from local proxy here + if config.authentication_config.is_auth_broker && r.status().is_server_error() { + let status = r.status(); + + let body_bytes = r + .collect() + .await + .map_err(|e| { + ApiError::InternalServerError(anyhow::Error::msg(format!( + "could not collect http body: {e}" + ))) + })? + .to_bytes(); + + if let Ok(mut json_map) = + serde_json::from_slice::>(&body_bytes) + { + let message = json_map.get("message"); + if let Some(message) = message { + let msg: String = match serde_json::from_str(message.get()) { + Ok(msg) => msg, + Err(_) => { + "Unable to parse the response message from server".to_string() + } + }; + + error!("Error response from local_proxy: {status} {msg}"); + + json_map.retain(|key, _| !key.starts_with("neon:")); // remove all the neon-related keys + + let resp_json = serde_json::to_string(&json_map) + .unwrap_or("failed to serialize the response message".to_string()); + + return json_response(status, resp_json); + } + } + + error!("Unable to parse the response message from local_proxy"); + return json_response( + status, + json!({ "message": "Unable to parse the response message from server".to_string() }), + ); + } r } Err(e @ SqlOverHttpError::Cancelled(_)) => { @@ -618,8 +664,6 @@ async fn handle_db_inner( let authenticate_and_connect = Box::pin( async { - let is_local_proxy = matches!(backend.auth_backend, crate::auth::Backend::Local(_)); - let keys = match auth { AuthData::Password(pw) => { backend @@ -634,7 +678,9 @@ async fn handle_db_inner( }; let client = match keys.keys { - ComputeCredentialKeys::JwtPayload(payload) if is_local_proxy => { + ComputeCredentialKeys::JwtPayload(payload) + if backend.auth_backend.is_local_proxy() => + { let mut client = backend.connect_to_local_postgres(ctx, conn_info).await?; let (cli_inner, _dsc) = client.client_inner(); cli_inner.set_jwt_session(&payload).await?; From 946da3f7e2baa0140c635f628c560286eb63a900 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Mon, 10 Feb 2025 10:46:20 -0600 Subject: [PATCH 421/583] Require --compute-id when running compute_ctl (#10523) The compute_id will be used when verifying claims sent by the control plane. 
Signed-off-by: Tristan Partin Signed-off-by: Tristan Partin --- compute_tools/src/bin/compute_ctl.rs | 20 ++++++------------- compute_tools/src/compute.rs | 2 ++ control_plane/src/endpoint.rs | 14 +++++++++++++ .../compute_wrapper/shell/compute.sh | 1 + 4 files changed, 23 insertions(+), 14 deletions(-) diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 47fc9cb7fe..522743d7bb 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -130,10 +130,10 @@ struct Cli { #[arg(short = 'S', long, group = "spec-path")] pub spec_path: Option, - #[arg(short = 'i', long, group = "compute-id", conflicts_with_all = ["spec", "spec-path"])] - pub compute_id: Option, + #[arg(short = 'i', long, group = "compute-id")] + pub compute_id: String, - #[arg(short = 'p', long, conflicts_with_all = ["spec", "spec-path"], requires = "compute-id", value_name = "CONTROL_PLANE_API_BASE_URL")] + #[arg(short = 'p', long, conflicts_with_all = ["spec", "spec-path"], value_name = "CONTROL_PLANE_API_BASE_URL")] pub control_plane_uri: Option, } @@ -259,20 +259,11 @@ fn try_spec_from_cli(cli: &Cli) -> Result { }); } - if cli.compute_id.is_none() { - panic!( - "compute spec should be provided by one of the following ways: \ - --spec OR --spec-path OR --control-plane-uri and --compute-id" - ); - }; if cli.control_plane_uri.is_none() { - panic!("must specify both --control-plane-uri and --compute-id or none"); + panic!("must specify --control-plane-uri"); }; - match get_spec_from_control_plane( - cli.control_plane_uri.as_ref().unwrap(), - cli.compute_id.as_ref().unwrap(), - ) { + match get_spec_from_control_plane(cli.control_plane_uri.as_ref().unwrap(), &cli.compute_id) { Ok(spec) => Ok(CliSpecParams { spec, live_config_allowed: true, @@ -319,6 +310,7 @@ fn wait_spec( let tokio_conn_conf = tokio_postgres::config::Config::from_str(connstr.as_str()) .context("cannot build tokio postgres config from connstr")?; let compute_node = ComputeNode { + compute_id: cli.compute_id.clone(), connstr, conn_conf, tokio_conn_conf, diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index fd76e404c6..5fc5615d61 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -59,6 +59,8 @@ pub static PG_PID: AtomicU32 = AtomicU32::new(0); /// Compute node info shared across several `compute_ctl` threads. pub struct ComputeNode { + /// The ID of the compute + pub compute_id: String, // Url type maintains proper escaping pub connstr: url::Url, // We connect to Postgres from many different places, so build configs once diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index bc86d09103..869f48a243 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -44,6 +44,8 @@ use std::process::Command; use std::str::FromStr; use std::sync::Arc; use std::time::Duration; +use std::time::SystemTime; +use std::time::UNIX_EPOCH; use anyhow::{anyhow, bail, Context, Result}; use compute_api::spec::Database; @@ -665,6 +667,18 @@ impl Endpoint { .to_str() .unwrap(), ]) + // TODO: It would be nice if we generated compute IDs with the same + // algorithm as the real control plane. + .args([ + "--compute-id", + &format!( + "compute-{}", + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_secs() + ), + ]) .stdin(std::process::Stdio::null()) .stderr(logfile.try_clone()?) 
.stdout(logfile); diff --git a/docker-compose/compute_wrapper/shell/compute.sh b/docker-compose/compute_wrapper/shell/compute.sh index b4f8d3d66a..9dbdcce69f 100755 --- a/docker-compose/compute_wrapper/shell/compute.sh +++ b/docker-compose/compute_wrapper/shell/compute.sh @@ -77,4 +77,5 @@ echo "Start compute node" /usr/local/bin/compute_ctl --pgdata /var/db/postgres/compute \ -C "postgresql://cloud_admin@localhost:55433/postgres" \ -b /usr/local/bin/postgres \ + --compute-id "compute-$RANDOM" \ -S ${SPEC_FILE} From aba61a371236ab215c6821ac6e23dfa7a17e627b Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 10 Feb 2025 18:48:28 +0200 Subject: [PATCH 422/583] Download awscli in separate layer in Dockerfile, to allow caching (#10733) The awscli was downloaded at the last stages of the overall compute image build, which meant that if you modified any part of the build, it would trigger a re-download of the awscli. That's a bit annoying when developing locally and rebuilding the compute image repeatedly. Move it to a separate layer, to cache separately and to avoid the spurious rebuilds. --- compute/compute-node.Dockerfile | 47 ++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 43910f2622..7dccc0a067 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1641,6 +1641,29 @@ RUN echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 30 && echo "${pgbouncer_exporter_sha256} pgbouncer_exporter" | sha256sum -c -\ && echo "${sql_exporter_sha256} sql_exporter" | sha256sum -c - +######################################################################################### +# +# Layer "awscli" +# +######################################################################################### +FROM alpine/curl:${ALPINE_CURL_VERSION} AS awscli +ARG TARGETARCH +RUN set -ex; \ + if [ "${TARGETARCH}" = "amd64" ]; then \ + TARGETARCH_ALT="x86_64"; \ + CHECKSUM="c9a9df3770a3ff9259cb469b6179e02829687a464e0824d5c32d378820b53a00"; \ + elif [ "${TARGETARCH}" = "arm64" ]; then \ + TARGETARCH_ALT="aarch64"; \ + CHECKSUM="8181730be7891582b38b028112e81b4899ca817e8c616aad807c9e9d1289223a"; \ + else \ + echo "Unsupported architecture: ${TARGETARCH}"; exit 1; \ + fi; \ + curl --retry 5 -L "https://awscli.amazonaws.com/awscli-exe-linux-${TARGETARCH_ALT}-2.17.5.zip" -o /tmp/awscliv2.zip; \ + echo "${CHECKSUM} /tmp/awscliv2.zip" | sha256sum -c -; \ + unzip /tmp/awscliv2.zip -d /tmp/awscliv2; \ + /tmp/awscliv2/aws/install; \ + rm -rf /tmp/awscliv2.zip /tmp/awscliv2 + ######################################################################################### # # Clean up postgres folder before inclusion @@ -1754,6 +1777,9 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ # create folder for file cache mkdir -p -m 777 /neon/cache +# aws cli is used by fast_import +COPY --from=awscli /usr/local/aws-cli /usr/local/aws-cli + COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/fast_import /usr/local/bin/fast_import @@ -1831,31 +1857,10 @@ RUN apt update && \ locales \ procps \ ca-certificates \ - curl \ - unzip \ $VERSION_INSTALLS && \ apt clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ 
localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8 -# aws cli is used by fast_import (curl and unzip above are at this time only used for this installation step) -ARG TARGETARCH -RUN set -ex; \ - if [ "${TARGETARCH}" = "amd64" ]; then \ - TARGETARCH_ALT="x86_64"; \ - CHECKSUM="c9a9df3770a3ff9259cb469b6179e02829687a464e0824d5c32d378820b53a00"; \ - elif [ "${TARGETARCH}" = "arm64" ]; then \ - TARGETARCH_ALT="aarch64"; \ - CHECKSUM="8181730be7891582b38b028112e81b4899ca817e8c616aad807c9e9d1289223a"; \ - else \ - echo "Unsupported architecture: ${TARGETARCH}"; exit 1; \ - fi; \ - curl --retry 5 -L "https://awscli.amazonaws.com/awscli-exe-linux-${TARGETARCH_ALT}-2.17.5.zip" -o /tmp/awscliv2.zip; \ - echo "${CHECKSUM} /tmp/awscliv2.zip" | sha256sum -c -; \ - unzip /tmp/awscliv2.zip -d /tmp/awscliv2; \ - /tmp/awscliv2/aws/install; \ - rm -rf /tmp/awscliv2.zip /tmp/awscliv2; \ - true - ENV LANG=en_US.utf8 USER postgres ENTRYPOINT ["/usr/local/bin/compute_ctl"] From c368b0fe143b46f59651df9cbba41e861b5c14f5 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 10 Feb 2025 18:58:29 +0200 Subject: [PATCH 423/583] Use a cache mount to speed up rebuilding compute node image (#10737) Building the compute rust binaries from scratch is pretty slow, it takes between 4-15 minutes on my laptop, depending on which compiler flags and other tricks I use. A cache mount allows caching the dependencies and incremental builds, which speeds up rebuilding significantly when you only makes a small change in a source file. --- compute/compute-node.Dockerfile | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 7dccc0a067..a251f0c549 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1578,7 +1578,15 @@ ENV BUILD_TAG=$BUILD_TAG USER nonroot # Copy entire project to get Cargo.* files with proper dependencies for the whole project COPY --chown=nonroot . . 
-RUN mold -run cargo build --locked --profile release-line-debug-size-lto --bin compute_ctl --bin fast_import --bin local_proxy +RUN --mount=type=cache,uid=1000,target=/home/nonroot/.cargo/registry \ + --mount=type=cache,uid=1000,target=/home/nonroot/.cargo/git \ + --mount=type=cache,uid=1000,target=/home/nonroot/target \ + mold -run cargo build --locked --profile release-line-debug-size-lto --bin compute_ctl --bin fast_import --bin local_proxy && \ + mkdir target-bin && \ + cp target/release-line-debug-size-lto/compute_ctl \ + target/release-line-debug-size-lto/fast_import \ + target/release-line-debug-size-lto/local_proxy \ + target-bin ######################################################################################### # @@ -1781,15 +1789,15 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ COPY --from=awscli /usr/local/aws-cli /usr/local/aws-cli COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local -COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl -COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/fast_import /usr/local/bin/fast_import +COPY --from=compute-tools --chown=postgres /home/nonroot/target-bin/compute_ctl /usr/local/bin/compute_ctl +COPY --from=compute-tools --chown=postgres /home/nonroot/target-bin/fast_import /usr/local/bin/fast_import # pgbouncer and its config COPY --from=pgbouncer /usr/local/pgbouncer/bin/pgbouncer /usr/local/bin/pgbouncer COPY --chmod=0666 --chown=postgres compute/etc/pgbouncer.ini /etc/pgbouncer.ini # local_proxy and its config -COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/local_proxy /usr/local/bin/local_proxy +COPY --from=compute-tools --chown=postgres /home/nonroot/target-bin/local_proxy /usr/local/bin/local_proxy RUN mkdir -p /etc/local_proxy && chown postgres:postgres /etc/local_proxy # Metrics exporter binaries and configuration files From 8c4e94107d0d5d5243116bbb70833df9f60b6434 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 10 Feb 2025 18:48:09 +0100 Subject: [PATCH 424/583] pageserver: notify compaction loop at threshold (#10740) ## Problem The compaction loop currently runs periodically, which can cause it to wait for up to 20 seconds before starting L0 compaction by default. Also, when we later separate the semaphores for L0 compaction and image compaction, we want to give up waiting for the image compaction semaphore if L0 compaction is needed on any timeline. Touches #10694. ## Summary of changes Notify the compaction loop when an L0 flush (on any timeline) exceeds `compaction_threshold`. Also do some opportunistic cleanups in the area. 
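For illustration, a minimal sketch of the wakeup pattern this introduces (names like `compaction_loop` and `on_l0_flush` are stand-ins for this sketch only; the real loop in the diff below additionally handles errors, backoff, and config reloads):

```
use std::sync::Arc;
use std::time::Duration;
use tokio::sync::Notify;

/// Compaction side: wake on the periodic timer *or* on an explicit trigger,
/// whichever fires first.
async fn compaction_loop(trigger: Arc<Notify>, period: Duration) {
    loop {
        tokio::select! {
            _ = tokio::time::sleep(period) => {} // regular periodic run
            _ = trigger.notified() => {}         // an L0 flush crossed the threshold
        }
        // ... run one compaction iteration ...
    }
}

/// Flush side: only notify once the L0 count reaches the compaction threshold,
/// so ordinary flushes do not wake the loop early.
fn on_l0_flush(l0_count: usize, threshold: usize, trigger: &Notify) {
    if l0_count >= threshold {
        trigger.notify_one();
    }
}
```

Because `Notify::notify_one` stores a permit when no task is currently waiting, a threshold crossing that happens while a compaction iteration is in flight is not lost: the next `notified().await` completes immediately.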
--- libs/utils/src/backoff.rs | 6 + pageserver/src/tenant.rs | 23 ++-- pageserver/src/tenant/tasks.rs | 133 ++++++++++++----------- pageserver/src/tenant/timeline.rs | 17 ++- pageserver/src/tenant/timeline/delete.rs | 11 +- 5 files changed, 103 insertions(+), 87 deletions(-) diff --git a/libs/utils/src/backoff.rs b/libs/utils/src/backoff.rs index 096c7e5854..e6503fe377 100644 --- a/libs/utils/src/backoff.rs +++ b/libs/utils/src/backoff.rs @@ -1,4 +1,5 @@ use std::fmt::{Debug, Display}; +use std::time::Duration; use futures::Future; use tokio_util::sync::CancellationToken; @@ -29,6 +30,11 @@ pub async fn exponential_backoff( } } +pub fn exponential_backoff_duration(n: u32, base_increment: f64, max_seconds: f64) -> Duration { + let seconds = exponential_backoff_duration_seconds(n, base_increment, max_seconds); + Duration::from_secs_f64(seconds) +} + pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 { if n == 0 { 0.0 diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 79d61ec389..91df47b250 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -56,6 +56,7 @@ use timeline::CompactOptions; use timeline::ShutdownMode; use tokio::io::BufReader; use tokio::sync::watch; +use tokio::sync::Notify; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::*; @@ -350,6 +351,9 @@ pub struct Tenant { /// Overhead of mutex is acceptable because compaction is done with a multi-second period. compaction_circuit_breaker: std::sync::Mutex, + /// Signals the tenant compaction loop that there is L0 compaction work to be done. + pub(crate) l0_compaction_trigger: Arc, + /// Scheduled gc-compaction tasks. scheduled_compaction_tasks: std::sync::Mutex>>, @@ -1691,12 +1695,7 @@ impl Tenant { timeline_id, index_part, remote_metadata, - TimelineResources { - remote_client, - pagestream_throttle: self.pagestream_throttle.clone(), - pagestream_throttle_metrics: self.pagestream_throttle_metrics.clone(), - l0_flush_global_state: self.l0_flush_global_state.clone(), - }, + self.get_timeline_resources_for(remote_client), LoadTimelineCause::Attach, ctx, ) @@ -4112,6 +4111,7 @@ impl Tenant { // use an extremely long backoff. Some(Duration::from_secs(3600 * 24)), )), + l0_compaction_trigger: Arc::new(Notify::new()), scheduled_compaction_tasks: Mutex::new(Default::default()), activate_now_sem: tokio::sync::Semaphore::new(0), attach_wal_lag_cooldown: Arc::new(std::sync::OnceLock::new()), @@ -5020,12 +5020,19 @@ impl Tenant { ) } - /// Call this before constructing a timeline, to build its required structures + /// Builds required resources for a new timeline. fn build_timeline_resources(&self, timeline_id: TimelineId) -> TimelineResources { + let remote_client = self.build_timeline_remote_client(timeline_id); + self.get_timeline_resources_for(remote_client) + } + + /// Builds timeline resources for the given remote client. 
+ fn get_timeline_resources_for(&self, remote_client: RemoteTimelineClient) -> TimelineResources { TimelineResources { - remote_client: self.build_timeline_remote_client(timeline_id), + remote_client, pagestream_throttle: self.pagestream_throttle.clone(), pagestream_throttle_metrics: self.pagestream_throttle_metrics.clone(), + l0_compaction_trigger: self.l0_compaction_trigger.clone(), l0_flush_global_state: self.l0_flush_global_state.clone(), } } diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 1a6311dd9c..5df7351216 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -22,9 +22,10 @@ use crate::tenant::timeline::compaction::CompactionOutcome; use crate::tenant::timeline::CompactionError; use crate::tenant::{Tenant, TenantState}; use pageserver_api::config::tenant_conf_defaults::DEFAULT_COMPACTION_PERIOD; +use utils::backoff::exponential_backoff_duration; use utils::completion::Barrier; +use utils::pausable_failpoint; use utils::rate_limit::RateLimit; -use utils::{backoff, pausable_failpoint}; /// Semaphore limiting concurrent background tasks (across all tenants). /// @@ -211,89 +212,93 @@ pub fn start_background_loops(tenant: &Arc, can_start: Option<&Barrier>) /// Compaction task's main loop. async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { + const BASE_BACKOFF_SECS: f64 = 1.0; const MAX_BACKOFF_SECS: f64 = 300.0; + const RECHECK_CONFIG_INTERVAL: Duration = Duration::from_secs(10); let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download); - let mut first = true; + let mut period = tenant.get_compaction_period(); let mut error_run = 0; // consecutive errors + // Stagger the compaction loop across tenants. + if wait_for_active_tenant(&tenant, &cancel).await.is_break() { + return; + } + if sleep_random(period, &cancel).await.is_err() { + return; + } + loop { + // Recheck that we're still active. if wait_for_active_tenant(&tenant, &cancel).await.is_break() { return; } - let period = tenant.get_compaction_period(); - - // TODO: we shouldn't need to await to find tenant and this could be moved outside of - // loop, #3501. There are also additional "allowed_errors" in tests. - if first { - first = false; - if sleep_random(period, &cancel).await.is_err() { - break; - } - } - - let sleep_duration; + // Refresh the period. If compaction is disabled, check again in a bit. + period = tenant.get_compaction_period(); if period == Duration::ZERO { #[cfg(not(feature = "testing"))] info!("automatic compaction is disabled"); - // check again in 10 seconds, in case it's been enabled again. - sleep_duration = Duration::from_secs(10) - } else { - let iteration = Iteration { - started_at: Instant::now(), - period, - kind: BackgroundLoopKind::Compaction, - }; + tokio::select! 
{ + _ = tokio::time::sleep(RECHECK_CONFIG_INTERVAL) => {}, + _ = cancel.cancelled() => return, + } + continue; + } - // Run compaction - let IterationResult { output, elapsed } = iteration - .run(tenant.compaction_iteration(&cancel, &ctx)) - .await; - match output { - Ok(outcome) => { - error_run = 0; - // schedule the next compaction immediately in case there is a pending compaction task - sleep_duration = if let CompactionOutcome::Pending = outcome { - Duration::from_secs(1) - } else { - period - }; - } - Err(err) => { - let wait_duration = backoff::exponential_backoff_duration_seconds( - error_run + 1, - 1.0, - MAX_BACKOFF_SECS, - ); - error_run += 1; - let wait_duration = Duration::from_secs_f64(wait_duration); - log_compaction_error(&err, error_run, &wait_duration, cancel.is_cancelled()); - sleep_duration = wait_duration; + // Wait for the next compaction run. + let backoff = exponential_backoff_duration(error_run, BASE_BACKOFF_SECS, MAX_BACKOFF_SECS); + tokio::select! { + _ = tokio::time::sleep(backoff), if error_run > 0 => {}, + _ = tokio::time::sleep(period), if error_run == 0 => {}, + _ = tenant.l0_compaction_trigger.notified(), if error_run == 0 => {}, + _ = cancel.cancelled() => return, + } + + // Run compaction. + let iteration = Iteration { + started_at: Instant::now(), + period, + kind: BackgroundLoopKind::Compaction, + }; + let IterationResult { output, elapsed } = iteration + .run(tenant.compaction_iteration(&cancel, &ctx)) + .await; + + match output { + Ok(outcome) => { + error_run = 0; + // If there's more compaction work pending, reschedule immediately. This isn't + // necessarily L0 compaction, but that's fine for now. + // + // TODO: differentiate between L0 compaction and other compaction. The former needs + // to be responsive, the latter doesn't. + if outcome == CompactionOutcome::Pending { + tenant.l0_compaction_trigger.notify_one(); } } - // the duration is recorded by performance tests by enabling debug in this function - debug!( - elapsed_ms = elapsed.as_millis(), - "compaction iteration complete" - ); - }; - - // Sleep - if tokio::time::timeout(sleep_duration, cancel.cancelled()) - .await - .is_ok() - { - break; + Err(err) => { + error_run += 1; + let backoff = + exponential_backoff_duration(error_run, BASE_BACKOFF_SECS, MAX_BACKOFF_SECS); + log_compaction_error(&err, error_run, backoff, cancel.is_cancelled()); + continue; + } } + + // NB: this log entry is recorded by performance tests. + debug!( + elapsed_ms = elapsed.as_millis(), + "compaction iteration complete" + ); } } fn log_compaction_error( err: &CompactionError, error_count: u32, - sleep_duration: &Duration, + sleep_duration: Duration, task_cancelled: bool, ) { use crate::tenant::upload_queue::NotInitialized; @@ -390,13 +395,9 @@ async fn gc_loop(tenant: Arc, cancel: CancellationToken) { return; } Err(e) => { - let wait_duration = backoff::exponential_backoff_duration_seconds( - error_run + 1, - 1.0, - MAX_BACKOFF_SECS, - ); error_run += 1; - let wait_duration = Duration::from_secs_f64(wait_duration); + let wait_duration = + exponential_backoff_duration(error_run, 1.0, MAX_BACKOFF_SECS); if matches!(e, crate::tenant::GcError::TimelineCancelled) { // Timeline was cancelled during gc. 
We might either be in an event diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 6a7781038f..1fbcd6bceb 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -45,11 +45,9 @@ use rand::Rng; use remote_storage::DownloadError; use serde_with::serde_as; use storage_broker::BrokerClientChannel; +use tokio::runtime::Handle; use tokio::sync::mpsc::Sender; -use tokio::{ - runtime::Handle, - sync::{oneshot, watch}, -}; +use tokio::sync::{oneshot, watch, Notify}; use tokio_util::sync::CancellationToken; use tracing::*; use utils::critical; @@ -227,6 +225,7 @@ pub struct TimelineResources { pub remote_client: RemoteTimelineClient, pub pagestream_throttle: Arc, pub pagestream_throttle_metrics: Arc, + pub l0_compaction_trigger: Arc, pub l0_flush_global_state: l0_flush::L0FlushGlobalState, } @@ -426,6 +425,9 @@ pub struct Timeline { /// If true, the last compaction failed. compaction_failed: AtomicBool, + /// Notifies the tenant compaction loop that there is pending L0 compaction work. + l0_compaction_trigger: Arc, + /// Make sure we only have one running gc at a time. /// /// Must only be taken in two places: @@ -2664,6 +2666,7 @@ impl Timeline { compaction_lock: tokio::sync::Mutex::default(), compaction_failed: AtomicBool::default(), + l0_compaction_trigger: resources.l0_compaction_trigger, gc_lock: tokio::sync::Mutex::default(), standby_horizon: AtomicLsn::new(0), @@ -4006,6 +4009,12 @@ impl Timeline { } let flush_duration = flush_timer.stop_and_record(); + // Notify the tenant compaction loop if L0 compaction is needed. + let l0_count = *watch_l0.borrow(); + if l0_count >= self.get_compaction_threshold() { + self.l0_compaction_trigger.notify_one(); + } + // Delay the next flush to backpressure if compaction can't keep up. We delay by the // flush duration such that the flush takes 2x as long. This is propagated up to WAL // ingestion by having ephemeral layer rolls wait for flushes. diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 5eb2d3aa24..93b7efedb8 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -17,13 +17,11 @@ use crate::{ metadata::TimelineMetadata, remote_timeline_client::{PersistIndexPartWithDeletedFlagError, RemoteTimelineClient}, CreateTimelineCause, DeleteTimelineError, MaybeDeletedIndexPart, Tenant, - TenantManifestError, TimelineOrOffloaded, + TenantManifestError, Timeline, TimelineOrOffloaded, }, virtual_file::MaybeFatalIo, }; -use super::{Timeline, TimelineResources}; - /// Mark timeline as deleted in S3 so we won't pick it up next time /// during attach or pageserver restart. /// See comment in persist_index_part_with_deleted_flag. @@ -296,12 +294,7 @@ impl DeleteTimelineFlow { timeline_id, local_metadata, None, // Ancestor is not needed for deletion. - TimelineResources { - remote_client, - pagestream_throttle: tenant.pagestream_throttle.clone(), - pagestream_throttle_metrics: tenant.pagestream_throttle_metrics.clone(), - l0_flush_global_state: tenant.l0_flush_global_state.clone(), - }, + tenant.get_timeline_resources_for(remote_client), // Important. We dont pass ancestor above because it can be missing. // Thus we need to skip the validation here. CreateTimelineCause::Delete, From b0c7ee017526928e0375d5f4df318c1fe554ba5c Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." 
<4198311+skyzh@users.noreply.github.com> Date: Mon, 10 Feb 2025 14:33:34 -0500 Subject: [PATCH 425/583] feat(pageserver): better gc_compaction_split heuristics (#10727) ## Problem close https://github.com/neondatabase/neon/issues/10213 `range_search` only returns the top-most layers that may satisfy the search, so it doesn't include all layers that might be accessed (the user needs to recursively call this function). We need to retrieve the full layer map and find overlaps in order to have a correct heuristics of the job split. ## Summary of changes Retrieve all layers and find overlaps instead of doing `range_search`. The patch also reduces the time holding the layer map read guard. Signed-off-by: Alex Chi Z --- pageserver/src/tenant/timeline/compaction.rs | 29 ++++++++++++++------ 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 19f9cbc665..4cbc344669 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -2259,8 +2259,11 @@ impl Timeline { split_key_ranges.push((start, end)); } split_key_ranges.sort(); - let guard = self.layers.read().await; - let layer_map = guard.layer_map()?; + let all_layers = { + let guard = self.layers.read().await; + let layer_map = guard.layer_map()?; + layer_map.iter_historic_layers().collect_vec() + }; let mut current_start = None; let ranges_num = split_key_ranges.len(); for (idx, (start, end)) in split_key_ranges.into_iter().enumerate() { @@ -2272,14 +2275,23 @@ impl Timeline { // We have already processed this partition. continue; } - let res = layer_map.range_search(start..end, compact_below_lsn); - let total_size = res.found.keys().map(|x| x.layer.file_size()).sum::(); + let overlapping_layers = { + let mut desc = Vec::new(); + for layer in all_layers.iter() { + if overlaps_with(&layer.get_key_range(), &(start..end)) + && layer.get_lsn_range().start <= compact_below_lsn + { + desc.push(layer.clone()); + } + } + desc + }; + let total_size = overlapping_layers.iter().map(|x| x.file_size).sum::(); if total_size > sub_compaction_max_job_size_mb * 1024 * 1024 || ranges_num == idx + 1 { // Try to extend the compaction range so that we include at least one full layer file. - let extended_end = res - .found - .keys() - .map(|layer| layer.layer.key_range.end) + let extended_end = overlapping_layers + .iter() + .map(|layer| layer.key_range.end) .min(); // It is possible that the search range does not contain any layer files when we reach the end of the loop. // In this case, we simply use the specified key range end. @@ -2306,7 +2318,6 @@ impl Timeline { current_start = Some(end); } } - drop(guard); Ok(compact_jobs) } From 3d143ad79981c66f705b7be106d4e846d2448fb8 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Mon, 10 Feb 2025 16:22:10 -0600 Subject: [PATCH 426/583] Unbrick the forward compatibility test failures (#10747) Since the merge of https://github.com/neondatabase/neon/pull/10523, forward compatibility tests have been broken everywhere. 
Signed-off-by: Tristan Partin --- compute_tools/src/bin/compute_ctl.rs | 16 +++++++++++- control_plane/src/endpoint.rs | 26 ++++++++++--------- .../compute_wrapper/shell/compute.sh | 1 - 3 files changed, 29 insertions(+), 14 deletions(-) diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 522743d7bb..aaf0f7f213 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -41,6 +41,7 @@ use std::process::exit; use std::str::FromStr; use std::sync::atomic::Ordering; use std::sync::{mpsc, Arc, Condvar, Mutex, RwLock}; +use std::time::SystemTime; use std::{thread, time::Duration}; use anyhow::{Context, Result}; @@ -85,6 +86,19 @@ fn parse_remote_ext_config(arg: &str) -> Result { } } +/// Generate a compute ID if one is not supplied. This exists to keep forward +/// compatibility tests working, but will be removed in a future iteration. +fn generate_compute_id() -> String { + let now = SystemTime::now(); + + format!( + "compute-{}", + now.duration_since(SystemTime::UNIX_EPOCH) + .unwrap() + .as_secs() + ) +} + #[derive(Parser)] #[command(rename_all = "kebab-case")] struct Cli { @@ -130,7 +144,7 @@ struct Cli { #[arg(short = 'S', long, group = "spec-path")] pub spec_path: Option, - #[arg(short = 'i', long, group = "compute-id")] + #[arg(short = 'i', long, group = "compute-id", default_value = generate_compute_id())] pub compute_id: String, #[arg(short = 'p', long, conflicts_with_all = ["spec", "spec-path"], value_name = "CONTROL_PLANE_API_BASE_URL")] diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 869f48a243..6ee6f8f1ec 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -44,8 +44,6 @@ use std::process::Command; use std::str::FromStr; use std::sync::Arc; use std::time::Duration; -use std::time::SystemTime; -use std::time::UNIX_EPOCH; use anyhow::{anyhow, bail, Context, Result}; use compute_api::spec::Database; @@ -669,16 +667,20 @@ impl Endpoint { ]) // TODO: It would be nice if we generated compute IDs with the same // algorithm as the real control plane. - .args([ - "--compute-id", - &format!( - "compute-{}", - SystemTime::now() - .duration_since(UNIX_EPOCH) - .unwrap() - .as_secs() - ), - ]) + // + // TODO: Add this back when + // https://github.com/neondatabase/neon/pull/10747 is merged. + // + //.args([ + // "--compute-id", + // &format!( + // "compute-{}", + // SystemTime::now() + // .duration_since(UNIX_EPOCH) + // .unwrap() + // .as_secs() + // ), + //]) .stdin(std::process::Stdio::null()) .stderr(logfile.try_clone()?) .stdout(logfile); diff --git a/docker-compose/compute_wrapper/shell/compute.sh b/docker-compose/compute_wrapper/shell/compute.sh index 9dbdcce69f..b4f8d3d66a 100755 --- a/docker-compose/compute_wrapper/shell/compute.sh +++ b/docker-compose/compute_wrapper/shell/compute.sh @@ -77,5 +77,4 @@ echo "Start compute node" /usr/local/bin/compute_ctl --pgdata /var/db/postgres/compute \ -C "postgresql://cloud_admin@localhost:55433/postgres" \ -b /usr/local/bin/postgres \ - --compute-id "compute-$RANDOM" \ -S ${SPEC_FILE} From 98883e4b30a76720a6802f1e5076976c0b5b6795 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 11 Feb 2025 02:39:44 +0200 Subject: [PATCH 427/583] compute_ctl: Use a single tokio runtime (#10743) compute_ctl is mostly written in synchronous fashion, intended to run in a single thread. However various parts had become async, and they launched their own tokio runtimes to run the async code. 
For example, VM monitor ran in its own multi-threaded runtime, and apply_spec_sql() launched another multi-threaded runtime to run the per-database SQL commands in parallel. In addition to that, a few places used a current-thread runtime to run async code in the main thread, or launched a current-thread runtime in a *different* thread to run background tasks. Unify the runtimes so that there is only one tokio runtime. It's created very early at process startup, and the main thread "enters" the runtime, so that it's always available for tokio::spawn() and runtime.block_on() calls. All code that needs to run async code uses the same runtime. The main thread still mostly runs in a synchronous fashion. When it needs to run async code, it uses rt.block_on(). Spawn fewer additional threads, prefer to spawn tokio tasks instead. Convert some code that ran synchronously in background threads into async. I didn't go all the way, though, some background threads are still spawned. --- compute_tools/src/bin/compute_ctl.rs | 64 +++++++---------- compute_tools/src/compute.rs | 102 +++++++++++++-------------- compute_tools/src/configurator.rs | 3 + compute_tools/src/http/server.rs | 11 +-- compute_tools/src/logger.rs | 4 +- compute_tools/src/migration.rs | 47 +++++++----- compute_tools/src/pg_helpers.rs | 22 ++---- compute_tools/src/spec.rs | 34 ++++----- 8 files changed, 135 insertions(+), 152 deletions(-) diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index aaf0f7f213..275f345897 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -154,7 +154,16 @@ struct Cli { fn main() -> Result<()> { let cli = Cli::parse(); - let build_tag = init()?; + // For historical reasons, the main thread that processes the spec and launches postgres + // is synchronous, but we always have this tokio runtime available and we "enter" it so + // that you can use tokio::spawn() and tokio::runtime::Handle::current().block_on(...) + // from all parts of compute_ctl. + let runtime = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build()?; + let _rt_guard = runtime.enter(); + + let build_tag = runtime.block_on(init())?; let scenario = failpoint_support::init(); @@ -186,8 +195,8 @@ fn main() -> Result<()> { deinit_and_exit(wait_pg_result); } -fn init() -> Result { - init_tracing_and_logging(DEFAULT_LOG_LEVEL)?; +async fn init() -> Result { + init_tracing_and_logging(DEFAULT_LOG_LEVEL).await?; let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?; thread::spawn(move || { @@ -351,8 +360,7 @@ fn wait_spec( // Launch http service first, so that we can serve control-plane requests // while configuration is still in progress. - let _http_handle = - launch_http_server(cli.http_port, &compute).expect("cannot launch http endpoint thread"); + let _http_handle = launch_http_server(cli.http_port, &compute); if !spec_set { // No spec provided, hang waiting for it. @@ -490,21 +498,6 @@ fn start_postgres( use std::env; use tokio_util::sync::CancellationToken; - // Note: it seems like you can make a runtime in an inner scope and - // if you start a task in it it won't be dropped. However, make it - // in the outermost scope just to be safe. 
- let rt = if env::var_os("AUTOSCALING").is_some() { - Some( - tokio::runtime::Builder::new_multi_thread() - .worker_threads(4) - .enable_all() - .build() - .expect("failed to create tokio runtime for monitor") - ) - } else { - None - }; - // This token is used internally by the monitor to clean up all threads let token = CancellationToken::new(); @@ -515,16 +508,19 @@ fn start_postgres( Some(cli.filecache_connstr.clone()) }; - let vm_monitor = rt.as_ref().map(|rt| { - rt.spawn(vm_monitor::start( + let vm_monitor = if env::var_os("AUTOSCALING").is_some() { + let vm_monitor = tokio::spawn(vm_monitor::start( Box::leak(Box::new(vm_monitor::Args { cgroup: Some(cli.cgroup.clone()), pgconnstr, addr: cli.vm_monitor_addr.clone(), })), token.clone(), - )) - }); + )); + Some(vm_monitor) + } else { + None + }; } } @@ -534,8 +530,6 @@ fn start_postgres( delay_exit, compute, #[cfg(target_os = "linux")] - rt, - #[cfg(target_os = "linux")] token, #[cfg(target_os = "linux")] vm_monitor, @@ -543,15 +537,13 @@ fn start_postgres( )) } -type PostgresHandle = (std::process::Child, std::thread::JoinHandle<()>); +type PostgresHandle = (std::process::Child, tokio::task::JoinHandle>); struct StartPostgresResult { delay_exit: bool, // passed through from WaitSpecResult compute: Arc, - #[cfg(target_os = "linux")] - rt: Option, #[cfg(target_os = "linux")] token: tokio_util::sync::CancellationToken, #[cfg(target_os = "linux")] @@ -570,10 +562,10 @@ fn wait_postgres(pg: Option) -> Result { .expect("failed to start waiting on Postgres process"); PG_PID.store(0, Ordering::SeqCst); - // Process has exited, so we can join the logs thread. - let _ = logs_handle - .join() - .map_err(|e| tracing::error!("log thread panicked: {:?}", e)); + // Process has exited. Wait for the log collecting task to finish. + let _ = tokio::runtime::Handle::current() + .block_on(logs_handle) + .map_err(|e| tracing::error!("log task panicked: {:?}", e)); info!("Postgres exited with code {}, shutting down", ecode); exit_code = ecode.code() @@ -594,8 +586,6 @@ fn cleanup_after_postgres_exit( vm_monitor, #[cfg(target_os = "linux")] token, - #[cfg(target_os = "linux")] - rt, }: StartPostgresResult, ) -> Result { // Terminate the vm_monitor so it releases the file watcher on @@ -608,10 +598,6 @@ fn cleanup_after_postgres_exit( token.cancel(); // Kills the actual task running the monitor handle.abort(); - - // If handle is some, rt must have been used to produce it, and - // hence is also some - rt.unwrap().shutdown_timeout(Duration::from_secs(2)); } } } diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 5fc5615d61..7fc54bb490 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -9,7 +9,6 @@ use std::str::FromStr; use std::sync::atomic::AtomicU32; use std::sync::atomic::Ordering; use std::sync::{Arc, Condvar, Mutex, RwLock}; -use std::thread; use std::time::Duration; use std::time::Instant; @@ -548,11 +547,7 @@ impl ComputeNode { pub fn check_safekeepers_synced(&self, compute_state: &ComputeState) -> Result> { let start_time = Utc::now(); - // Run actual work with new tokio runtime - let rt = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .expect("failed to create rt"); + let rt = tokio::runtime::Handle::current(); let result = rt.block_on(self.check_safekeepers_synced_async(compute_state)); // Record runtime @@ -599,9 +594,9 @@ impl ComputeNode { SYNC_SAFEKEEPERS_PID.store(0, Ordering::SeqCst); // Process has exited, so we can join the logs thread. 
- let _ = logs_handle - .join() - .map_err(|e| tracing::error!("log thread panicked: {:?}", e)); + let _ = tokio::runtime::Handle::current() + .block_on(logs_handle) + .map_err(|e| tracing::error!("log task panicked: {:?}", e)); if !sync_output.status.success() { anyhow::bail!( @@ -786,7 +781,7 @@ impl ComputeNode { pub fn start_postgres( &self, storage_auth_token: Option, - ) -> Result<(std::process::Child, std::thread::JoinHandle<()>)> { + ) -> Result<(std::process::Child, tokio::task::JoinHandle>)> { let pgdata_path = Path::new(&self.pgdata); // Run postgres as a child process. @@ -802,7 +797,7 @@ impl ComputeNode { .expect("cannot start postgres process"); PG_PID.store(pg.id(), Ordering::SeqCst); - // Start a thread to collect logs from stderr. + // Start a task to collect logs from stderr. let stderr = pg.stderr.take().expect("stderr should be captured"); let logs_handle = handle_postgres_logs(stderr); @@ -811,20 +806,28 @@ impl ComputeNode { Ok((pg, logs_handle)) } - /// Do post configuration of the already started Postgres. This function spawns a background thread to + /// Do post configuration of the already started Postgres. This function spawns a background task to /// configure the database after applying the compute spec. Currently, it upgrades the neon extension /// version. In the future, it may upgrade all 3rd-party extensions. #[instrument(skip_all)] pub fn post_apply_config(&self) -> Result<()> { - let conf = self.get_conn_conf(Some("compute_ctl:post_apply_config")); - thread::spawn(move || { - let func = || { - let mut client = conf.connect(NoTls)?; + let conf = self.get_tokio_conn_conf(Some("compute_ctl:post_apply_config")); + tokio::spawn(async move { + let res = async { + let (mut client, connection) = conf.connect(NoTls).await?; + tokio::spawn(async move { + if let Err(e) = connection.await { + eprintln!("connection error: {}", e); + } + }); + handle_neon_extension_upgrade(&mut client) + .await .context("handle_neon_extension_upgrade")?; Ok::<_, anyhow::Error>(()) - }; - if let Err(err) = func() { + } + .await; + if let Err(err) = res { error!("error while post_apply_config: {err:#}"); } }); @@ -921,13 +924,10 @@ impl ComputeNode { conf: Arc, concurrency: usize, ) -> Result<()> { - let rt = tokio::runtime::Builder::new_multi_thread() - .enable_all() - .build()?; - info!("Applying config with max {} concurrency", concurrency); debug!("Config: {:?}", spec); + let rt = tokio::runtime::Handle::current(); rt.block_on(async { // Proceed with post-startup configuration. Note, that order of operations is important. 
let client = Self::get_maintenance_client(&conf).await?; @@ -1321,14 +1321,18 @@ impl ComputeNode { } // Run migrations separately to not hold up cold starts - thread::spawn(move || { - let conf = conf.as_ref().clone(); - let mut conf = postgres::config::Config::from(conf); + tokio::spawn(async move { + let mut conf = conf.as_ref().clone(); conf.application_name("compute_ctl:migrations"); - match conf.connect(NoTls) { - Ok(mut client) => { - if let Err(e) = handle_migrations(&mut client) { + match conf.connect(NoTls).await { + Ok((mut client, connection)) => { + tokio::spawn(async move { + if let Err(e) = connection.await { + eprintln!("connection error: {}", e); + } + }); + if let Err(e) = handle_migrations(&mut client).await { error!("Failed to run migrations: {}", e); } } @@ -1365,16 +1369,11 @@ impl ComputeNode { if let Some(ref pgbouncer_settings) = spec.pgbouncer_settings { info!("tuning pgbouncer"); - let rt = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .expect("failed to create rt"); - - // Spawn a thread to do the tuning, + // Spawn a background task to do the tuning, // so that we don't block the main thread that starts Postgres. let pgbouncer_settings = pgbouncer_settings.clone(); - let _handle = thread::spawn(move || { - let res = rt.block_on(tune_pgbouncer(pgbouncer_settings)); + tokio::spawn(async move { + let res = tune_pgbouncer(pgbouncer_settings).await; if let Err(err) = res { error!("error while tuning pgbouncer: {err:?}"); } @@ -1384,14 +1383,14 @@ impl ComputeNode { if let Some(ref local_proxy) = spec.local_proxy_config { info!("configuring local_proxy"); - // Spawn a thread to do the configuration, + // Spawn a background task to do the configuration, // so that we don't block the main thread that starts Postgres. let local_proxy = local_proxy.clone(); - let _handle = Some(thread::spawn(move || { + tokio::spawn(async move { if let Err(err) = local_proxy::configure(&local_proxy) { error!("error while configuring local_proxy: {err:?}"); } - })); + }); } // Write new config @@ -1433,7 +1432,9 @@ impl ComputeNode { } #[instrument(skip_all)] - pub fn start_compute(&self) -> Result<(std::process::Child, std::thread::JoinHandle<()>)> { + pub fn start_compute( + &self, + ) -> Result<(std::process::Child, tokio::task::JoinHandle>)> { let compute_state = self.state.lock().unwrap().clone(); let pspec = compute_state.pspec.as_ref().expect("spec must be set"); info!( @@ -1448,16 +1449,11 @@ impl ComputeNode { if let Some(pgbouncer_settings) = &pspec.spec.pgbouncer_settings { info!("tuning pgbouncer"); - let rt = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .expect("failed to create rt"); - - // Spawn a thread to do the tuning, + // Spawn a background task to do the tuning, // so that we don't block the main thread that starts Postgres. let pgbouncer_settings = pgbouncer_settings.clone(); - let _handle = thread::spawn(move || { - let res = rt.block_on(tune_pgbouncer(pgbouncer_settings)); + let _handle = tokio::spawn(async move { + let res = tune_pgbouncer(pgbouncer_settings).await; if let Err(err) = res { error!("error while tuning pgbouncer: {err:?}"); } @@ -1467,10 +1463,10 @@ impl ComputeNode { if let Some(local_proxy) = &pspec.spec.local_proxy_config { info!("configuring local_proxy"); - // Spawn a thread to do the configuration, + // Spawn a background task to do the configuration, // so that we don't block the main thread that starts Postgres. 
let local_proxy = local_proxy.clone(); - let _handle = thread::spawn(move || { + let _handle = tokio::spawn(async move { if let Err(err) = local_proxy::configure(&local_proxy) { error!("error while configuring local_proxy: {err:?}"); } @@ -1489,7 +1485,8 @@ impl ComputeNode { extension_server::create_control_files(remote_extensions, &self.pgbin); let library_load_start_time = Utc::now(); - let remote_ext_metrics = self.prepare_preload_libraries(&pspec.spec)?; + let rt = tokio::runtime::Handle::current(); + let remote_ext_metrics = rt.block_on(self.prepare_preload_libraries(&pspec.spec))?; let library_load_time = Utc::now() .signed_duration_since(library_load_start_time) @@ -1544,7 +1541,7 @@ impl ComputeNode { self.post_apply_config()?; let conf = self.get_conn_conf(None); - thread::spawn(move || { + tokio::task::spawn_blocking(|| { let res = get_installed_extensions(conf); match res { Ok(extensions) => { @@ -1893,7 +1890,6 @@ LIMIT 100", Ok(ext_version) } - #[tokio::main] pub async fn prepare_preload_libraries( &self, spec: &ComputeSpec, diff --git a/compute_tools/src/configurator.rs b/compute_tools/src/configurator.rs index a2043529a1..d88f26ca20 100644 --- a/compute_tools/src/configurator.rs +++ b/compute_tools/src/configurator.rs @@ -51,9 +51,12 @@ fn configurator_main_loop(compute: &Arc) { pub fn launch_configurator(compute: &Arc) -> thread::JoinHandle<()> { let compute = Arc::clone(compute); + let runtime = tokio::runtime::Handle::current(); + thread::Builder::new() .name("compute-configurator".into()) .spawn(move || { + let _rt_guard = runtime.enter(); configurator_main_loop(&compute); info!("configurator thread is exited"); }) diff --git a/compute_tools/src/http/server.rs b/compute_tools/src/http/server.rs index e41ed9df2d..19dded5172 100644 --- a/compute_tools/src/http/server.rs +++ b/compute_tools/src/http/server.rs @@ -1,11 +1,9 @@ use std::{ net::{IpAddr, Ipv6Addr, SocketAddr}, sync::Arc, - thread, time::Duration, }; -use anyhow::Result; use axum::{ extract::Request, middleware::{self, Next}, @@ -46,7 +44,6 @@ async fn maybe_add_request_id_header(mut request: Request, next: Next) -> Respon } /// Run the HTTP server and wait on it forever. -#[tokio::main] async fn serve(port: u16, compute: Arc) { let mut app = Router::new() .route("/check_writability", post(check_writability::is_writable)) @@ -139,11 +136,9 @@ async fn serve(port: u16, compute: Arc) { } } -/// Launch a separate HTTP server thread and return its `JoinHandle`. -pub fn launch_http_server(port: u16, state: &Arc) -> Result> { +/// Launch HTTP server in a new task and return its `JoinHandle`. +pub fn launch_http_server(port: u16, state: &Arc) -> tokio::task::JoinHandle<()> { let state = Arc::clone(state); - Ok(thread::Builder::new() - .name("http-server".into()) - .spawn(move || serve(port, state))?) + tokio::spawn(serve(port, state)) } diff --git a/compute_tools/src/logger.rs b/compute_tools/src/logger.rs index 00be5c13f9..3749dfc844 100644 --- a/compute_tools/src/logger.rs +++ b/compute_tools/src/logger.rs @@ -11,7 +11,7 @@ use tracing_subscriber::prelude::*; /// set `OTEL_EXPORTER_OTLP_ENDPOINT=http://jaeger:4318`. See /// `tracing-utils` package description. 
/// -pub fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> { +pub async fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> { // Initialize Logging let env_filter = tracing_subscriber::EnvFilter::try_from_default_env() .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(default_log_level)); @@ -22,7 +22,7 @@ pub fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> { .with_writer(std::io::stderr); // Initialize OpenTelemetry - let otlp_layer = tracing_utils::init_tracing_without_runtime("compute_ctl"); + let otlp_layer = tracing_utils::init_tracing("compute_ctl").await; // Put it all together tracing_subscriber::registry() diff --git a/compute_tools/src/migration.rs b/compute_tools/src/migration.rs index 7b7b042d84..c5e05822c0 100644 --- a/compute_tools/src/migration.rs +++ b/compute_tools/src/migration.rs @@ -1,6 +1,6 @@ use anyhow::{Context, Result}; use fail::fail_point; -use postgres::{Client, Transaction}; +use tokio_postgres::{Client, Transaction}; use tracing::{error, info}; use crate::metrics::DB_MIGRATION_FAILED; @@ -21,10 +21,11 @@ impl<'m> MigrationRunner<'m> { } /// Get the current value neon_migration.migration_id - fn get_migration_id(&mut self) -> Result { + async fn get_migration_id(&mut self) -> Result { let row = self .client - .query_one("SELECT id FROM neon_migration.migration_id", &[])?; + .query_one("SELECT id FROM neon_migration.migration_id", &[]) + .await?; Ok(row.get::<&str, i64>("id")) } @@ -34,7 +35,7 @@ impl<'m> MigrationRunner<'m> { /// This function has a fail point called compute-migration, which can be /// used if you would like to fail the application of a series of migrations /// at some point. - fn update_migration_id(txn: &mut Transaction, migration_id: i64) -> Result<()> { + async fn update_migration_id(txn: &mut Transaction<'_>, migration_id: i64) -> Result<()> { // We use this fail point in order to check that failing in the // middle of applying a series of migrations fails in an expected // manner @@ -59,31 +60,38 @@ impl<'m> MigrationRunner<'m> { "UPDATE neon_migration.migration_id SET id = $1", &[&migration_id], ) + .await .with_context(|| format!("update neon_migration.migration_id to {migration_id}"))?; Ok(()) } /// Prepare the migrations the target database for handling migrations - fn prepare_database(&mut self) -> Result<()> { + async fn prepare_database(&mut self) -> Result<()> { self.client - .simple_query("CREATE SCHEMA IF NOT EXISTS neon_migration")?; - self.client.simple_query("CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)")?; - self.client.simple_query( - "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING", - )?; + .simple_query("CREATE SCHEMA IF NOT EXISTS neon_migration") + .await?; + self.client.simple_query("CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)").await?; self.client - .simple_query("ALTER SCHEMA neon_migration OWNER TO cloud_admin")?; + .simple_query( + "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING", + ) + .await?; self.client - .simple_query("REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC")?; + .simple_query("ALTER SCHEMA neon_migration OWNER TO cloud_admin") + .await?; + self.client + .simple_query("REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC") + .await?; Ok(()) } /// Run an individual migration in a separate transaction block. 
- fn run_migration(client: &mut Client, migration_id: i64, migration: &str) -> Result<()> { + async fn run_migration(client: &mut Client, migration_id: i64, migration: &str) -> Result<()> { let mut txn = client .transaction() + .await .with_context(|| format!("begin transaction for migration {migration_id}"))?; if migration.starts_with("-- SKIP") { @@ -92,35 +100,38 @@ impl<'m> MigrationRunner<'m> { // Even though we are skipping the migration, updating the // migration ID should help keep logic easy to understand when // trying to understand the state of a cluster. - Self::update_migration_id(&mut txn, migration_id)?; + Self::update_migration_id(&mut txn, migration_id).await?; } else { info!("Running migration id={}:\n{}\n", migration_id, migration); txn.simple_query(migration) + .await .with_context(|| format!("apply migration {migration_id}"))?; - Self::update_migration_id(&mut txn, migration_id)?; + Self::update_migration_id(&mut txn, migration_id).await?; } txn.commit() + .await .with_context(|| format!("commit transaction for migration {migration_id}"))?; Ok(()) } /// Run the configured set of migrations - pub fn run_migrations(mut self) -> Result<()> { + pub async fn run_migrations(mut self) -> Result<()> { self.prepare_database() + .await .context("prepare database to handle migrations")?; - let mut current_migration = self.get_migration_id()? as usize; + let mut current_migration = self.get_migration_id().await? as usize; while current_migration < self.migrations.len() { // The index lags the migration ID by 1, so the current migration // ID is also the next index let migration_id = (current_migration + 1) as i64; let migration = self.migrations[current_migration]; - match Self::run_migration(self.client, migration_id, migration) { + match Self::run_migration(self.client, migration_id, migration).await { Ok(_) => { info!("Finished migration id={}", migration_id); } diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index e03b410699..86fcf99085 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -7,7 +7,6 @@ use std::os::unix::fs::PermissionsExt; use std::path::Path; use std::process::Child; use std::str::FromStr; -use std::thread::JoinHandle; use std::time::{Duration, Instant}; use anyhow::{bail, Result}; @@ -16,6 +15,7 @@ use ini::Ini; use notify::{RecursiveMode, Watcher}; use postgres::config::Config; use tokio::io::AsyncBufReadExt; +use tokio::task::JoinHandle; use tokio::time::timeout; use tokio_postgres; use tokio_postgres::NoTls; @@ -477,23 +477,13 @@ pub async fn tune_pgbouncer(pgbouncer_config: HashMap) -> Result Ok(()) } -/// Spawn a thread that will read Postgres logs from `stderr`, join multiline logs +/// Spawn a task that will read Postgres logs from `stderr`, join multiline logs /// and send them to the logger. In the future we may also want to add context to /// these logs. 
-pub fn handle_postgres_logs(stderr: std::process::ChildStderr) -> JoinHandle<()> { - std::thread::spawn(move || { - let runtime = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .expect("failed to build tokio runtime"); - - let res = runtime.block_on(async move { - let stderr = tokio::process::ChildStderr::from_std(stderr)?; - handle_postgres_logs_async(stderr).await - }); - if let Err(e) = res { - tracing::error!("error while processing postgres logs: {}", e); - } +pub fn handle_postgres_logs(stderr: std::process::ChildStderr) -> JoinHandle> { + tokio::spawn(async move { + let stderr = tokio::process::ChildStderr::from_std(stderr)?; + handle_postgres_logs_async(stderr).await }) } diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 37d5d3a1a6..73950cd95a 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -1,8 +1,8 @@ use anyhow::{anyhow, bail, Result}; -use postgres::Client; use reqwest::StatusCode; use std::fs::File; use std::path::Path; +use tokio_postgres::Client; use tracing::{error, info, instrument, warn}; use crate::config; @@ -166,17 +166,17 @@ pub fn add_standby_signal(pgdata_path: &Path) -> Result<()> { } #[instrument(skip_all)] -pub fn handle_neon_extension_upgrade(client: &mut Client) -> Result<()> { +pub async fn handle_neon_extension_upgrade(client: &mut Client) -> Result<()> { info!("handle neon extension upgrade"); let query = "ALTER EXTENSION neon UPDATE"; info!("update neon extension version with query: {}", query); - client.simple_query(query)?; + client.simple_query(query).await?; Ok(()) } #[instrument(skip_all)] -pub fn handle_migrations(client: &mut Client) -> Result<()> { +pub async fn handle_migrations(client: &mut Client) -> Result<()> { info!("handle migrations"); // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! @@ -206,7 +206,9 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> { ), ]; - MigrationRunner::new(client, &migrations).run_migrations()?; + MigrationRunner::new(client, &migrations) + .run_migrations() + .await?; Ok(()) } @@ -214,7 +216,7 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> { /// Connect to the database as superuser and pre-create anon extension /// if it is present in shared_preload_libraries #[instrument(skip_all)] -pub fn handle_extension_anon( +pub async fn handle_extension_anon( spec: &ComputeSpec, db_owner: &str, db_client: &mut Client, @@ -227,7 +229,7 @@ pub fn handle_extension_anon( if !grants_only { // check if extension is already initialized using anon.is_initialized() let query = "SELECT anon.is_initialized()"; - match db_client.query(query, &[]) { + match db_client.query(query, &[]).await { Ok(rows) => { if !rows.is_empty() { let is_initialized: bool = rows[0].get(0); @@ -249,7 +251,7 @@ pub fn handle_extension_anon( // Users cannot create it themselves, because superuser is required. 
let mut query = "CREATE EXTENSION IF NOT EXISTS anon CASCADE"; info!("creating anon extension with query: {}", query); - match db_client.query(query, &[]) { + match db_client.query(query, &[]).await { Ok(_) => {} Err(e) => { error!("anon extension creation failed with error: {}", e); @@ -259,7 +261,7 @@ pub fn handle_extension_anon( // check that extension is installed query = "SELECT extname FROM pg_extension WHERE extname = 'anon'"; - let rows = db_client.query(query, &[])?; + let rows = db_client.query(query, &[]).await?; if rows.is_empty() { error!("anon extension is not installed"); return Ok(()); @@ -268,7 +270,7 @@ pub fn handle_extension_anon( // Initialize anon extension // This also requires superuser privileges, so users cannot do it themselves. query = "SELECT anon.init()"; - match db_client.query(query, &[]) { + match db_client.query(query, &[]).await { Ok(_) => {} Err(e) => { error!("anon.init() failed with error: {}", e); @@ -279,7 +281,7 @@ pub fn handle_extension_anon( // check that extension is installed, if not bail early let query = "SELECT extname FROM pg_extension WHERE extname = 'anon'"; - match db_client.query(query, &[]) { + match db_client.query(query, &[]).await { Ok(rows) => { if rows.is_empty() { error!("anon extension is not installed"); @@ -294,12 +296,12 @@ pub fn handle_extension_anon( let query = format!("GRANT ALL ON SCHEMA anon TO {}", db_owner); info!("granting anon extension permissions with query: {}", query); - db_client.simple_query(&query)?; + db_client.simple_query(&query).await?; // Grant permissions to db_owner to use anon extension functions let query = format!("GRANT ALL ON ALL FUNCTIONS IN SCHEMA anon TO {}", db_owner); info!("granting anon extension permissions with query: {}", query); - db_client.simple_query(&query)?; + db_client.simple_query(&query).await?; // This is needed, because some functions are defined as SECURITY DEFINER. // In Postgres SECURITY DEFINER functions are executed with the privileges @@ -314,16 +316,16 @@ pub fn handle_extension_anon( where nsp.nspname = 'anon';", db_owner); info!("change anon extension functions owner to db owner"); - db_client.simple_query(&query)?; + db_client.simple_query(&query).await?; // affects views as well let query = format!("GRANT ALL ON ALL TABLES IN SCHEMA anon TO {}", db_owner); info!("granting anon extension permissions with query: {}", query); - db_client.simple_query(&query)?; + db_client.simple_query(&query).await?; let query = format!("GRANT ALL ON ALL SEQUENCES IN SCHEMA anon TO {}", db_owner); info!("granting anon extension permissions with query: {}", query); - db_client.simple_query(&query)?; + db_client.simple_query(&query).await?; } } From 4ab18444ecd88e1d18c01b54779e933a4428c3be Mon Sep 17 00:00:00 2001 From: Andrew Rudenko Date: Tue, 11 Feb 2025 08:02:13 +0100 Subject: [PATCH 428/583] compute_ctl: database_schema should keep process::Child as part of returned value (#10273) ## Problem /database_schema endpoint returns incomplete output from `pg_dump` ## Summary of changes The Tokio process was not used properly. The returned stream does not include `process::Child`, and the process is scheduled to be killed immediately after the `get_database_schema` call when `cmd` goes out of scope. The solution in this PR is to return a special Stream implementation that retains `process::Child`. 
--- compute_tools/src/catalog.rs | 31 ++++++++++++++++++++- test_runner/regress/test_compute_catalog.py | 2 +- 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/compute_tools/src/catalog.rs b/compute_tools/src/catalog.rs index 4a297cfacf..28b10ce21c 100644 --- a/compute_tools/src/catalog.rs +++ b/compute_tools/src/catalog.rs @@ -140,5 +140,34 @@ pub async fn get_database_schema( warn!("pg_dump stderr: {}", line) } }); - Ok(initial_stream.chain(stdout_reader.map(|res| res.map(|b| b.freeze())))) + + #[allow(dead_code)] + struct SchemaStream { + // We keep a reference to the child process to ensure it stays alive + // while the stream is being consumed. When SchemaStream is dropped, + // cmd will be dropped, which triggers kill_on_drop and terminates pg_dump + cmd: tokio::process::Child, + stream: S, + } + + impl Stream for SchemaStream + where + S: Stream> + Unpin, + { + type Item = Result; + + fn poll_next( + mut self: std::pin::Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + Stream::poll_next(std::pin::Pin::new(&mut self.stream), cx) + } + } + + let schema_stream = SchemaStream { + cmd, + stream: initial_stream.chain(stdout_reader.map(|res| res.map(|b| b.freeze()))), + }; + + Ok(schema_stream) } diff --git a/test_runner/regress/test_compute_catalog.py b/test_runner/regress/test_compute_catalog.py index 50a922a616..3a08671bbf 100644 --- a/test_runner/regress/test_compute_catalog.py +++ b/test_runner/regress/test_compute_catalog.py @@ -82,7 +82,7 @@ def test_compute_catalog(neon_simple_env: NeonEnv): ddl = client.database_schema(database=test_db["name"]) # Check that it looks like a valid PostgreSQL dump - assert "-- PostgreSQL database dump" in ddl + assert "-- PostgreSQL database dump complete" in ddl # Check that it doesn't contain health_check and migration traces. # They are only created in system `postgres` database, so by checking From c26131c2b3a948ec3767f9daf2cb418827e36cab Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 11 Feb 2025 09:48:54 +0200 Subject: [PATCH 429/583] Link pgbouncer dynamically (#10749) I don't see the point of static linking, postgres itself and many of the extensions are already built dynamically. One reason for the change is that I'm working on bigger changes to start using systemd in the compute, and as part of that I wanted to add the --with-systemd configure option to pgbouncer, and there doesn't seem to be a static version of libsystemd (at least not on Debian). 
--- compute/compute-node.Dockerfile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index a251f0c549..4357256093 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1615,7 +1615,7 @@ RUN set -e \ && git clone --recurse-submodules --depth 1 --branch ${PGBOUNCER_TAG} https://github.com/pgbouncer/pgbouncer.git pgbouncer \ && cd pgbouncer \ && ./autogen.sh \ - && LDFLAGS=-static ./configure --prefix=/usr/local/pgbouncer --without-openssl \ + && ./configure --prefix=/usr/local/pgbouncer --without-openssl \ && make -j $(nproc) dist_man_MANS= \ && make install dist_man_MANS= @@ -1824,6 +1824,7 @@ RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/loca # libzstd1 for zstd # libboost* for rdkit # ca-certificates for communicating with s3 by compute_ctl +# libevent for pgbouncer RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \ echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc @@ -1862,6 +1863,7 @@ RUN apt update && \ libxslt1.1 \ libzstd1 \ libcurl4 \ + libevent-2.1-7 \ locales \ procps \ ca-certificates \ From a4ea1e53ae6fd4c529237e36a6a441bef8d5cb0e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 11 Feb 2025 10:40:22 +0100 Subject: [PATCH 430/583] Apply Azure SDK patch to periodically load workload identity file (#10415) The SDK bug https://github.com/Azure/azure-sdk-for-rust/issues/1739 was originally worked around via #10378, but now upstream has provided a fix in [this](https://github.com/Azure/azure-sdk-for-rust/pull/1997) PR, which we've been asked to test. So this is what this PR is doing: revert #10378 (to make sure we fail if the bug isn't fixed by the SDK PR), and apply the SDK PR to our fork. Currently pointing to my local branch to check CI. I'd like to merge the [SDK fork PR](https://github.com/neondatabase/azure-sdk-for-rust/pull/2) before merging this to main. 
--- Cargo.lock | 10 ++--- proxy/src/context/parquet.rs | 73 +++++++++++++----------------------- 2 files changed, 32 insertions(+), 51 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e73f1f9cdb..3f06a74c5e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -786,7 +786,7 @@ dependencies = [ [[package]] name = "azure_core" version = "0.21.0" -source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#66e77bdd87bf87e773acf3b0c84b532c1124367d" +source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#985db729824be324ed11527e45de722250028d9e" dependencies = [ "async-trait", "base64 0.22.1", @@ -815,7 +815,7 @@ dependencies = [ [[package]] name = "azure_identity" version = "0.21.0" -source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#66e77bdd87bf87e773acf3b0c84b532c1124367d" +source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#985db729824be324ed11527e45de722250028d9e" dependencies = [ "async-lock", "async-trait", @@ -834,7 +834,7 @@ dependencies = [ [[package]] name = "azure_storage" version = "0.21.0" -source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#66e77bdd87bf87e773acf3b0c84b532c1124367d" +source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#985db729824be324ed11527e45de722250028d9e" dependencies = [ "RustyXML", "async-lock", @@ -852,7 +852,7 @@ dependencies = [ [[package]] name = "azure_storage_blobs" version = "0.21.0" -source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#66e77bdd87bf87e773acf3b0c84b532c1124367d" +source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#985db729824be324ed11527e45de722250028d9e" dependencies = [ "RustyXML", "azure_core", @@ -872,7 +872,7 @@ dependencies = [ [[package]] name = "azure_svc_blobstorage" version = "0.21.0" -source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#66e77bdd87bf87e773acf3b0c84b532c1124367d" +source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#985db729824be324ed11527e45de722250028d9e" dependencies = [ "azure_core", "bytes", diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index 4f1dd39d92..0537ae6a62 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -187,6 +187,10 @@ pub async fn worker( let rx = futures::stream::poll_fn(move |cx| rx.poll_recv(cx)); let rx = rx.map(RequestData::from); + let storage = GenericRemoteStorage::from_config(&remote_storage_config) + .await + .context("remote storage init")?; + let properties = WriterProperties::builder() .set_data_page_size_limit(config.parquet_upload_page_size) .set_compression(config.parquet_upload_compression); @@ -220,18 +224,18 @@ pub async fn worker( let rx_disconnect = futures::stream::poll_fn(move |cx| rx_disconnect.poll_recv(cx)); let rx_disconnect = rx_disconnect.map(RequestData::from); + let storage_disconnect = + GenericRemoteStorage::from_config(&disconnect_events_storage_config) + .await + .context("remote storage for disconnect events init")?; let parquet_config_disconnect = parquet_config.clone(); tokio::try_join!( - worker_inner(remote_storage_config, rx, parquet_config), - worker_inner( - disconnect_events_storage_config, - rx_disconnect, - parquet_config_disconnect - ) + worker_inner(storage, rx, parquet_config), + worker_inner(storage_disconnect, rx_disconnect, parquet_config_disconnect) ) .map(|_| ()) } else { - worker_inner(remote_storage_config, rx, 
parquet_config).await + worker_inner(storage, rx, parquet_config).await } } @@ -247,32 +251,18 @@ struct ParquetConfig { test_remote_failures: u64, } -impl ParquetConfig { - async fn storage( - &self, - storage_config: &RemoteStorageConfig, - ) -> anyhow::Result { - let storage = GenericRemoteStorage::from_config(storage_config) - .await - .context("remote storage init")?; - - #[cfg(any(test, feature = "testing"))] - if self.test_remote_failures > 0 { - return Ok(GenericRemoteStorage::unreliable_wrapper( - storage, - self.test_remote_failures, - )); - } - - Ok(storage) - } -} - async fn worker_inner( - storage_config: RemoteStorageConfig, + storage: GenericRemoteStorage, rx: impl Stream, config: ParquetConfig, ) -> anyhow::Result<()> { + #[cfg(any(test, feature = "testing"))] + let storage = if config.test_remote_failures > 0 { + GenericRemoteStorage::unreliable_wrapper(storage, config.test_remote_failures) + } else { + storage + }; + let mut rx = std::pin::pin!(rx); let mut rows = Vec::with_capacity(config.rows_per_group); @@ -295,7 +285,7 @@ async fn worker_inner( } if len > config.file_size || force { last_upload = time::Instant::now(); - let file = upload_parquet(w, len, &storage_config, &config).await?; + let file = upload_parquet(w, len, &storage).await?; w = SerializedFileWriter::new(file, schema.clone(), config.propeties.clone())?; len = 0; } @@ -308,7 +298,7 @@ async fn worker_inner( } if !w.flushed_row_groups().is_empty() { - let _rtchk: Writer = upload_parquet(w, len, &storage_config, &config).await?; + let _rtchk: Writer = upload_parquet(w, len, &storage).await?; } Ok(()) @@ -350,8 +340,7 @@ where async fn upload_parquet( mut w: SerializedFileWriter>, len: i64, - storage_config: &RemoteStorageConfig, - config: &ParquetConfig, + storage: &GenericRemoteStorage, ) -> anyhow::Result> { let len_uncompressed = w .flushed_row_groups() @@ -388,15 +377,6 @@ async fn upload_parquet( size, compression, "uploading request parquet file" ); - // A bug in azure-sdk means that the identity-token-file that expires after - // 1 hour is not refreshed. This identity-token is used to fetch the actual azure storage - // tokens that last for 24 hours. After this 24 hour period, azure-sdk tries to refresh - // the storage token, but the identity token has now expired. - // - // - // To work around this, we recreate the storage every time. 
- let storage = config.storage(storage_config).await?; - let year = now.year(); let month = now.month(); let day = now.day(); @@ -451,8 +431,8 @@ mod tests { use rand::rngs::StdRng; use rand::{Rng, SeedableRng}; use remote_storage::{ - RemoteStorageConfig, RemoteStorageKind, S3Config, DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, - DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT, + GenericRemoteStorage, RemoteStorageConfig, RemoteStorageKind, S3Config, + DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT, }; use tokio::sync::mpsc; use tokio::time; @@ -579,11 +559,12 @@ mod tests { timeout: std::time::Duration::from_secs(120), small_timeout: std::time::Duration::from_secs(30), }; - - worker_inner(remote_storage_config, rx, config) + let storage = GenericRemoteStorage::from_config(&remote_storage_config) .await .unwrap(); + worker_inner(storage, rx, config).await.unwrap(); + let mut files = WalkDir::new(tmpdir.as_std_path()) .into_iter() .filter_map(|entry| entry.ok()) From fcedd102262577d1beabc60785b923cbfdad7410 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 11 Feb 2025 12:37:09 +0000 Subject: [PATCH 431/583] tests: temporarily permit a log error (#10752) ## Problem These tests can encounter a bug in the pageserver read path (#9185) which occurs under the very specific circumstances that the tests create, but is very unlikely to happen in the field. We will fix the bug, but in the meantime let's un-flake the tests. Related: https://github.com/neondatabase/neon/issues/10720 ## Summary of changes - Permit "could not find data for key" errors in tests affected by #9185 --- test_runner/regress/test_sharding.py | 8 ++++++++ test_runner/regress/test_storage_scrubber.py | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 86a6b7428b..6f8070e2ba 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -1810,3 +1810,11 @@ def test_sharding_gc( shard_gc_cutoff_lsn = Lsn(shard_index["metadata_bytes"]["latest_gc_cutoff_lsn"]) log.info(f"Shard {shard_number} cutoff LSN: {shard_gc_cutoff_lsn}") assert shard_gc_cutoff_lsn == shard_0_gc_cutoff_lsn + + for ps in env.pageservers: + # This is not okay, but it's not a scrubber bug: it's a pageserver issue that is exposed by + # the specific pattern of aggressive checkpointing+image layer generation + GC that this test does. + # TODO: remove when https://github.com/neondatabase/neon/issues/10720 is fixed + ps.allowed_errors.append( + ".*could not find data for key 020000000000000000000000000000000000.*" + ) diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index 0f4e5688a9..46038ccbbb 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -312,6 +312,14 @@ def test_scrubber_physical_gc_ancestors(neon_env_builder: NeonEnvBuilder, shard_ drop_local_state(env, tenant_id) workload.validate() + for ps in env.pageservers: + # This is not okay, but it's not a scrubber bug: it's a pageserver issue that is exposed by + # the specific pattern of aggressive checkpointing+image layer generation + GC that this test does. 
+ # TODO: remove when https://github.com/neondatabase/neon/issues/10720 is fixed + ps.allowed_errors.append( + ".*could not find data for key 020000000000000000000000000000000000.*" + ) + def test_scrubber_physical_gc_timeline_deletion(neon_env_builder: NeonEnvBuilder): """ From 9247331c673e13d9dbabc2e5ae7664ef1b51aa97 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 11 Feb 2025 15:05:59 +0100 Subject: [PATCH 432/583] fix(page_service / batching): smgr op latency metric of dropped responses include flush time (#10756) # Problem Say we have a batch of 10 responses to send out. Then, even with - #10728 we've still only called observe_execution_end_flush_start for the first 3 responses. The remaining 7 response timers are still ticking. When compute now closes the connection, the waiting flush fails with an error and we `drop()` the remaining 7 responses' smgr op timers. The `impl Drop for SmgrOpTimer` will observe an execution time that includes the flush time. In practice, this is supsected to produce the `+Inf` observations in the smgr op latency histogram we've seen since the introduction of pipelining, even after shipping #10728. refs: - fixup of https://github.com/neondatabase/neon/pull/10042 - fixup of https://github.com/neondatabase/neon/pull/10728 - fixes https://github.com/neondatabase/neon/issues/10754 --- pageserver/src/metrics.rs | 16 ++++------ pageserver/src/page_service.rs | 55 +++++++++++++++++++--------------- 2 files changed, 37 insertions(+), 34 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 3b8612a3fa..983a3079e4 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1366,10 +1366,7 @@ impl SmgrOpTimer { /// The first callers receives Some, subsequent ones None. /// /// See [`SmgrOpTimerState`] for more context. - pub(crate) fn observe_execution_end_flush_start( - &mut self, - at: Instant, - ) -> Option { + pub(crate) fn observe_execution_end(&mut self, at: Instant) -> Option { // NB: unlike the other observe_* methods, this one take()s. #[allow(clippy::question_mark)] // maintain similar code pattern. let Some(mut inner) = self.0.take() else { @@ -1403,7 +1400,6 @@ impl SmgrOpTimer { .. } = inner; Some(SmgrOpFlushInProgress { - flush_started_at: at, global_micros: global_flush_in_progress_micros, per_timeline_micros: per_timeline_flush_in_progress_micros, }) @@ -1419,7 +1415,6 @@ impl SmgrOpTimer { /// add another `observe_*` method to [`SmgrOpTimer`], follow the existing pattern there, /// and remove this struct from the code base. 
pub(crate) struct SmgrOpFlushInProgress { - flush_started_at: Instant, global_micros: IntCounter, per_timeline_micros: IntCounter, } @@ -1438,12 +1433,13 @@ impl Drop for SmgrOpTimer { self.observe_throttle_start(now); self.observe_throttle_done(ThrottleResult::NotThrottled { end: now }); self.observe_execution_start(now); - self.observe_execution_end_flush_start(now); + let maybe_flush_timer = self.observe_execution_end(now); + drop(maybe_flush_timer); } } impl SmgrOpFlushInProgress { - pub(crate) async fn measure(mut self, mut fut: Fut) -> O + pub(crate) async fn measure(self, mut started_at: Instant, mut fut: Fut) -> O where Fut: std::future::Future, { @@ -1455,12 +1451,12 @@ impl SmgrOpFlushInProgress { let mut observe_guard = scopeguard::guard( || { let now = Instant::now(); - let elapsed = now - self.flush_started_at; + let elapsed = now - started_at; self.global_micros .inc_by(u64::try_from(elapsed.as_micros()).unwrap()); self.per_timeline_micros .inc_by(u64::try_from(elapsed.as_micros()).unwrap()); - self.flush_started_at = now; + started_at = now; }, |mut observe| { observe(); diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 69f1f1c051..972dad34d4 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1074,7 +1074,7 @@ impl PageServerHandler { }; // invoke handler function - let (handler_results, span): ( + let (mut handler_results, span): ( Vec>, _, ) = match batch { @@ -1201,7 +1201,7 @@ impl PageServerHandler { } }; - // We purposefully don't count flush time into the smgr operaiton timer. + // We purposefully don't count flush time into the smgr operation timer. // // The reason is that current compute client will not perform protocol processing // if the postgres backend process is doing things other than `->smgr_read()`. @@ -1218,17 +1218,32 @@ impl PageServerHandler { // call, which (all unmeasured) adds syscall overhead but reduces time to first byte // and avoids building up a "giant" contiguous userspace buffer to hold the entire response. // TODO: vectored socket IO would be great, but pgb_writer doesn't support that. - // - // Since we're flushing multiple times in the loop, but only have access to the per-op - // timers inside the loop, we capture the flush start time here and reuse it to finish - // each op timer. - let flushing_start_time = Instant::now(); + let flush_timers = { + let flushing_start_time = Instant::now(); + let mut flush_timers = Vec::with_capacity(handler_results.len()); + for handler_result in &mut handler_results { + let flush_timer = match handler_result { + Ok((_, timer)) => Some( + timer + .observe_execution_end(flushing_start_time) + .expect("we are the first caller"), + ), + Err(_) => { + // TODO: measure errors + None + } + }; + flush_timers.push(flush_timer); + } + assert_eq!(flush_timers.len(), handler_results.len()); + flush_timers + }; // Map handler result to protocol behavior. // Some handler errors cause exit from pagestream protocol. // Other handler errors are sent back as an error message and we stay in pagestream protocol. 
- for handler_result in handler_results { - let (response_msg, timer) = match handler_result { + for (handler_result, flushing_timer) in handler_results.into_iter().zip(flush_timers) { + let response_msg = match handler_result { Err(e) => match &e.err { PageStreamError::Shutdown => { // If we fail to fulfil a request during shutdown, which may be _because_ of @@ -1252,16 +1267,14 @@ impl PageServerHandler { span.in_scope(|| { error!("error reading relation or page version: {full:#}") }); - ( - PagestreamBeMessage::Error(PagestreamErrorResponse { - req: e.req, - message: e.err.to_string(), - }), - None, // TODO: measure errors - ) + + PagestreamBeMessage::Error(PagestreamErrorResponse { + req: e.req, + message: e.err.to_string(), + }) } }, - Ok((response_msg, timer)) => (response_msg, Some(timer)), + Ok((response_msg, _op_timer_already_observed)) => response_msg, }; // @@ -1272,18 +1285,12 @@ impl PageServerHandler { &response_msg.serialize(protocol_version), ))?; - let flushing_timer = timer.map(|mut timer| { - timer - .observe_execution_end_flush_start(flushing_start_time) - .expect("we are the first caller") - }); - // what we want to do let flush_fut = pgb_writer.flush(); // metric for how long flushing takes let flush_fut = match flushing_timer { Some(flushing_timer) => { - futures::future::Either::Left(flushing_timer.measure(flush_fut)) + futures::future::Either::Left(flushing_timer.measure(Instant::now(), flush_fut)) } None => futures::future::Either::Right(flush_fut), }; From be447ba4f8fdf7d10a690b14236c2e862b4d6806 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 11 Feb 2025 17:36:54 +0100 Subject: [PATCH 433/583] Change timeline_offloading setting default to true (#10760) This changes the default value of the `timeline_offloading` pageserver and tenant configs to true, now that offloading has been rolled out without problems. There is also a small fix in the tenant config merge function, where we applied the `lazy_slru_download` value instead of `timeline_offloading`. 
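To make the copy-paste nature of that merge bug easier to see, here is the logic reduced to the two fields involved (a sketch only; these are not the real tenant config definitions):

```rust
// Sketch only: trimmed-down stand-ins for the real tenant config types.
struct TenantConf {
    timeline_offloading: bool,
}

#[allow(dead_code)]
struct TenantConfOpt {
    lazy_slru_download: Option<bool>,
    timeline_offloading: Option<bool>,
}

impl TenantConfOpt {
    fn merged_timeline_offloading(&self, global: &TenantConf) -> bool {
        // The bug: this previously read `self.lazy_slru_download`, so a
        // per-tenant timeline_offloading override was silently ignored.
        self.timeline_offloading
            .unwrap_or(global.timeline_offloading)
    }
}
```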
Related issue: https://github.com/neondatabase/cloud/issues/21353 --- libs/pageserver_api/src/config.rs | 4 ++-- pageserver/src/tenant/config.rs | 2 +- test_runner/regress/test_attach_tenant_config.py | 2 +- test_runner/regress/test_timeline_archive.py | 10 +++------- 4 files changed, 7 insertions(+), 11 deletions(-) diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index ae3e0385cf..a00d7838fd 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -493,7 +493,7 @@ impl Default for ConfigToml { NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(), )), image_compression: (DEFAULT_IMAGE_COMPRESSION), - timeline_offloading: false, + timeline_offloading: true, ephemeral_bytes_per_memory_kb: (DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), l0_flush: None, virtual_file_io_mode: None, @@ -624,7 +624,7 @@ impl Default for TenantConfigToml { image_creation_preempt_threshold: DEFAULT_IMAGE_CREATION_PREEMPT_THRESHOLD, lsn_lease_length: LsnLease::DEFAULT_LENGTH, lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS, - timeline_offloading: false, + timeline_offloading: true, wal_receiver_protocol_override: None, rel_size_v2_enabled: None, gc_compaction_enabled: DEFAULT_GC_COMPACTION_ENABLED, diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 972837dc44..ad13e9e8e4 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -466,7 +466,7 @@ impl TenantConfOpt { .lsn_lease_length_for_ts .unwrap_or(global_conf.lsn_lease_length_for_ts), timeline_offloading: self - .lazy_slru_download + .timeline_offloading .unwrap_or(global_conf.timeline_offloading), wal_receiver_protocol_override: self .wal_receiver_protocol_override diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index a4b9eabf8e..7acc64377e 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -175,7 +175,7 @@ def test_fully_custom_config(positive_env: NeonEnv): "image_layer_creation_check_threshold": 1, "lsn_lease_length": "1m", "lsn_lease_length_for_ts": "5s", - "timeline_offloading": True, + "timeline_offloading": False, "wal_receiver_protocol_override": { "type": "interpreted", "args": {"format": "bincode", "compression": {"zstd": {"level": 1}}}, diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index 50f674f539..2706ddf2f0 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -139,9 +139,9 @@ def test_timeline_archive(neon_env_builder: NeonEnvBuilder, shard_count: int): @pytest.mark.parametrize("manual_offload", [False, True]) def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: bool): - if not manual_offload: - # (automatic) timeline offloading defaults to false for now - neon_env_builder.pageserver_config_override = "timeline_offloading = true" + if manual_offload: + # (automatic) timeline offloading defaults to true + neon_env_builder.pageserver_config_override = "timeline_offloading = false" env = neon_env_builder.init_start() ps_http = env.pageserver.http_client() @@ -396,8 +396,6 @@ def test_timeline_archival_chaos(neon_env_builder: NeonEnvBuilder): with tenant migrations and timeline deletions. 
""" - # Offloading is off by default at time of writing: remove this line when it's on by default - neon_env_builder.pageserver_config_override = "timeline_offloading = true" neon_env_builder.storage_controller_config = {"heartbeat_interval": "100msec"} neon_env_builder.enable_pageserver_remote_storage(s3_storage()) @@ -994,8 +992,6 @@ def test_timeline_offload_race_unarchive( Ensure that unarchive and timeline offload don't race each other """ # Regression test for issue https://github.com/neondatabase/neon/issues/10220 - # (automatic) timeline offloading defaults to false for now - neon_env_builder.pageserver_config_override = "timeline_offloading = true" failpoint = "before-timeline-auto-offload" From f7b2293317d0012783a384e458947082eda4e89f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 11 Feb 2025 17:58:34 +0100 Subject: [PATCH 434/583] Hardlink resident layers during detach ancestor (#10729) After a detach ancestor operation, we don't want to on-demand download layers that are already resident. This has shown to impede performance, sometimes quite a lot (50 seconds: https://github.com/neondatabase/neon/issues/8828#issuecomment-2643735644) Fixes #8828. --- pageserver/src/tenant/storage_layer/layer.rs | 2 - .../src/tenant/timeline/detach_ancestor.rs | 90 ++++++++++++------- 2 files changed, 59 insertions(+), 33 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 92313afba7..40282defd4 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -353,7 +353,6 @@ impl Layer { /// while the guard exists. /// /// Returns None if the layer is currently evicted or becoming evicted. - #[cfg(test)] pub(crate) async fn keep_resident(&self) -> Option { let downloaded = self.0.inner.get().and_then(|rowe| rowe.get())?; @@ -530,7 +529,6 @@ impl ResidentOrWantedEvicted { /// This is not used on the read path (anything that calls /// [`LayerInner::get_or_maybe_download`]) because it was decided that reads always win /// evictions, and part of that winning is using [`ResidentOrWantedEvicted::get_and_upgrade`]. 
- #[cfg(test)] fn get(&self) -> Option> { match self { ResidentOrWantedEvicted::Resident(strong) => Some(strong.clone()), diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index f8bc4352e2..b6347d1219 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -6,7 +6,9 @@ use crate::{ task_mgr::TaskKind, tenant::{ remote_timeline_client::index::GcBlockingReason::DetachAncestor, - storage_layer::{AsLayerDesc as _, DeltaLayerWriter, Layer, ResidentLayer}, + storage_layer::{ + layer::local_layer_path, AsLayerDesc as _, DeltaLayerWriter, Layer, ResidentLayer, + }, Tenant, }, virtual_file::{MaybeFatalIo, VirtualFile}, @@ -351,18 +353,7 @@ pub(super) async fn prepare( // FIXME: the fsync should be mandatory, after both rewrites and copies if wrote_any { - let timeline_dir = VirtualFile::open( - &detached - .conf - .timeline_path(&detached.tenant_shard_id, &detached.timeline_id), - ctx, - ) - .await - .fatal_err("VirtualFile::open for timeline dir fsync"); - timeline_dir - .sync_all() - .await - .fatal_err("VirtualFile::sync_all timeline dir"); + fsync_timeline_dir(detached, ctx).await; } } @@ -376,7 +367,7 @@ pub(super) async fn prepare( tasks.spawn( async move { let _permit = limiter.acquire().await; - let owned = remote_copy( + let (owned, did_hardlink) = remote_copy( &adopted, &timeline, timeline.generation, @@ -384,16 +375,20 @@ pub(super) async fn prepare( &timeline.cancel, ) .await?; - tracing::info!(layer=%owned, "remote copied"); - Ok(owned) + tracing::info!(layer=%owned, did_hard_link=%did_hardlink, "remote copied"); + Ok((owned, did_hardlink)) } .in_current_span(), ); } + let mut should_fsync = false; while let Some(res) = tasks.join_next().await { match res { - Ok(Ok(owned)) => { + Ok(Ok((owned, did_hardlink))) => { + if did_hardlink { + should_fsync = true; + } new_layers.push(owned); } Ok(Err(failed)) => { @@ -403,7 +398,10 @@ pub(super) async fn prepare( } } - // TODO: fsync directory again if we hardlinked something + // fsync directory again if we hardlinked something + if should_fsync { + fsync_timeline_dir(detached, ctx).await; + } let prepared = PreparedTimelineDetach { layers: new_layers }; @@ -629,35 +627,52 @@ async fn copy_lsn_prefix( } } -/// Creates a new Layer instance for the adopted layer, and ensures it is found from the remote -/// storage on successful return without the adopted layer being added to `index_part.json`. +/// Creates a new Layer instance for the adopted layer, and ensures it is found in the remote +/// storage on successful return. without the adopted layer being added to `index_part.json`. 
+/// Returns (Layer, did hardlink) async fn remote_copy( adopted: &Layer, adoptee: &Arc, generation: Generation, shard_identity: ShardIdentity, cancel: &CancellationToken, -) -> Result { - // depending if Layer::keep_resident we could hardlink - +) -> Result<(Layer, bool), Error> { let mut metadata = adopted.metadata(); debug_assert!(metadata.generation <= generation); metadata.generation = generation; metadata.shard = shard_identity.shard_index(); - let owned = crate::tenant::storage_layer::Layer::for_evicted( - adoptee.conf, - adoptee, - adopted.layer_desc().layer_name(), - metadata, - ); + let conf = adoptee.conf; + let file_name = adopted.layer_desc().layer_name(); - adoptee + // depending if Layer::keep_resident, do a hardlink + let did_hardlink; + let owned = if let Some(adopted_resident) = adopted.keep_resident().await { + let adopted_path = adopted_resident.local_path(); + let adoptee_path = local_layer_path( + conf, + &adoptee.tenant_shard_id, + &adoptee.timeline_id, + &file_name, + &metadata.generation, + ); + std::fs::hard_link(adopted_path, &adoptee_path) + .map_err(|e| Error::launder(e.into(), Error::Prepare))?; + did_hardlink = true; + Layer::for_resident(conf, adoptee, adoptee_path, file_name, metadata).drop_eviction_guard() + } else { + did_hardlink = false; + Layer::for_evicted(conf, adoptee, file_name, metadata) + }; + + let layer = adoptee .remote_client .copy_timeline_layer(adopted, &owned, cancel) .await .map(move |()| owned) - .map_err(|e| Error::launder(e, Error::Prepare)) + .map_err(|e| Error::launder(e, Error::Prepare))?; + + Ok((layer, did_hardlink)) } pub(crate) enum DetachingAndReparenting { @@ -1001,3 +1016,16 @@ fn check_no_archived_children_of_ancestor( } Ok(()) } + +async fn fsync_timeline_dir(timeline: &Timeline, ctx: &RequestContext) { + let path = &timeline + .conf + .timeline_path(&timeline.tenant_shard_id, &timeline.timeline_id); + let timeline_dir = VirtualFile::open(&path, ctx) + .await + .fatal_err("VirtualFile::open for timeline dir fsync"); + timeline_dir + .sync_all() + .await + .fatal_err("VirtualFile::sync_all timeline dir"); +} From da9c101939eded0063e78f32ea53f0a7e6a44aa1 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 11 Feb 2025 12:02:22 -0600 Subject: [PATCH 435/583] Implement a second HTTP server within compute_ctl (#10574) The compute_ctl HTTP server has the following purposes: - Allow management via the control plane - Provide an endpoint for scaping metrics - Provide APIs for compute internal clients - Neon Postgres extension for installing remote extensions - local_proxy for installing extensions and adding grants The first two purposes require the HTTP server to be available outside the compute. The Neon threat model is a bad actor within our internal network. We need to reduce the surface area of attack. By exposing unnecessary unauthenticated HTTP endpoints to the internal network, we increase the surface area of attack. For endpoints described in the third bullet point, we can just run an extra HTTP server, which is only bound to the loopback interface since all consumers of those endpoints are within the compute. 
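A minimal sketch of that split (illustrative only; the routes, ports, and binding addresses here are placeholders rather than the real compute_ctl router): the internal server binds to the loopback interface for on-compute clients, while the external server stays reachable by the control plane and the metrics scraper.

```rust
// Illustrative two-server layout; the real routers, shared state, and
// middleware live in compute_tools/src/http/server.rs.
use std::future::IntoFuture;

use axum::{routing::get, Router};
use tokio::net::TcpListener;

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    // Internal API: only reachable from processes running on the compute.
    let internal = Router::new().route("/extensions", get(|| async { "internal" }));
    let internal_listener = TcpListener::bind("127.0.0.1:3081").await?;
    tokio::spawn(axum::serve(internal_listener, internal).into_future());

    // External API: exposed to the control plane and the metrics scraper.
    let external = Router::new().route("/metrics", get(|| async { "external" }));
    let external_listener = TcpListener::bind("0.0.0.0:3080").await?;
    axum::serve(external_listener, external).await?;
    Ok(())
}
```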
--- compute/patches/pg_hint_plan_v16.patch | 8 +- compute/patches/pg_hint_plan_v17.patch | 8 +- compute_tools/src/bin/compute_ctl.rs | 31 ++- compute_tools/src/compute.rs | 10 +- compute_tools/src/http/mod.rs | 4 +- compute_tools/src/http/server.rs | 195 ++++++++++++------ control_plane/src/bin/neon_local.rs | 7 +- control_plane/src/endpoint.rs | 130 +++++++----- proxy/src/bin/local_proxy.rs | 4 +- test_runner/fixtures/endpoint/http.py | 18 +- test_runner/fixtures/neon_cli.py | 9 +- test_runner/fixtures/neon_fixtures.py | 18 +- test_runner/performance/test_lazy_startup.py | 4 +- test_runner/performance/test_startup.py | 4 +- test_runner/regress/test_neon_local_cli.py | 24 ++- .../regress/test_wal_acceptor_async.py | 3 +- 16 files changed, 310 insertions(+), 167 deletions(-) diff --git a/compute/patches/pg_hint_plan_v16.patch b/compute/patches/pg_hint_plan_v16.patch index 4039a036df..1fc3ffa609 100644 --- a/compute/patches/pg_hint_plan_v16.patch +++ b/compute/patches/pg_hint_plan_v16.patch @@ -6,16 +6,16 @@ index da723b8..5328114 100644 ---- -- No.A-1-1-3 CREATE EXTENSION pg_hint_plan; -+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan ++LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan -- No.A-1-2-3 DROP EXTENSION pg_hint_plan; -- No.A-1-1-4 CREATE SCHEMA other_schema; CREATE EXTENSION pg_hint_plan SCHEMA other_schema; -+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan ++LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan ERROR: extension "pg_hint_plan" must be installed in schema "hint_plan" CREATE EXTENSION pg_hint_plan; -+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan ++LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan DROP SCHEMA other_schema; ---- ---- No. 
A-5-1 comment pattern @@ -35,7 +35,7 @@ index d372459..6282afe 100644 SET client_min_messages TO LOG; SET pg_hint_plan.enable_hint TO on; CREATE EXTENSION file_fdw; -+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/file_fdw ++LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/file_fdw CREATE SERVER file_server FOREIGN DATA WRAPPER file_fdw; CREATE USER MAPPING FOR PUBLIC SERVER file_server; CREATE FOREIGN TABLE ft1 (id int, val int) SERVER file_server OPTIONS (format 'csv', filename :'filename'); diff --git a/compute/patches/pg_hint_plan_v17.patch b/compute/patches/pg_hint_plan_v17.patch index dbf4e470ea..3442a094eb 100644 --- a/compute/patches/pg_hint_plan_v17.patch +++ b/compute/patches/pg_hint_plan_v17.patch @@ -6,16 +6,16 @@ index e7d68a1..65a056c 100644 ---- -- No.A-1-1-3 CREATE EXTENSION pg_hint_plan; -+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan ++LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan -- No.A-1-2-3 DROP EXTENSION pg_hint_plan; -- No.A-1-1-4 CREATE SCHEMA other_schema; CREATE EXTENSION pg_hint_plan SCHEMA other_schema; -+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan ++LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan ERROR: extension "pg_hint_plan" must be installed in schema "hint_plan" CREATE EXTENSION pg_hint_plan; -+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan ++LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan DROP SCHEMA other_schema; ---- ---- No. A-5-1 comment pattern @@ -168,7 +168,7 @@ index 017fa4b..98d989b 100644 SET client_min_messages TO LOG; SET pg_hint_plan.enable_hint TO on; CREATE EXTENSION file_fdw; -+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/file_fdw ++LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/file_fdw CREATE SERVER file_server FOREIGN DATA WRAPPER file_fdw; CREATE USER MAPPING FOR PUBLIC SERVER file_server; CREATE FOREIGN TABLE ft1 (id int, val int) SERVER file_server OPTIONS (format 'csv', filename :'filename'); diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 275f345897..df47adda6c 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -48,6 +48,7 @@ use anyhow::{Context, Result}; use chrono::Utc; use clap::Parser; use compute_tools::disk_quota::set_disk_quota; +use compute_tools::http::server::Server; use compute_tools::lsn_lease::launch_lsn_lease_bg_task_for_static; use signal_hook::consts::{SIGQUIT, SIGTERM}; use signal_hook::{consts::SIGINT, iterator::Signals}; @@ -62,7 +63,6 @@ use compute_tools::compute::{ }; use compute_tools::configurator::launch_configurator; use compute_tools::extension_server::get_pg_version_string; -use compute_tools::http::launch_http_server; use compute_tools::logger::*; use compute_tools::monitor::launch_monitor; use compute_tools::params::*; @@ -108,8 +108,20 @@ struct Cli { #[arg(short = 'r', long, value_parser = parse_remote_ext_config)] pub remote_ext_config: Option, - #[arg(long, default_value_t = 3080)] - pub http_port: u16, + /// The port to bind the external listening HTTP server to. Clients running + /// outside the compute will talk to the compute through this port. 
Keep + /// the previous name for this argument around for a smoother release + /// with the control plane. + /// + /// TODO: Remove the alias after the control plane release which teaches the + /// control plane about the renamed argument. + #[arg(long, alias = "http-port", default_value_t = 3080)] + pub external_http_port: u16, + + /// The port to bind the internal listening HTTP server to. Clients like + /// the neon extension (for installing remote extensions) and local_proxy. + #[arg(long)] + pub internal_http_port: Option, #[arg(short = 'D', long, value_name = "DATADIR")] pub pgdata: String, @@ -340,7 +352,8 @@ fn wait_spec( pgdata: cli.pgdata.clone(), pgbin: cli.pgbin.clone(), pgversion: get_pg_version_string(&cli.pgbin), - http_port: cli.http_port, + external_http_port: cli.external_http_port, + internal_http_port: cli.internal_http_port.unwrap_or(cli.external_http_port + 1), live_config_allowed, state: Mutex::new(new_state), state_changed: Condvar::new(), @@ -358,9 +371,13 @@ fn wait_spec( compute.prewarm_postgres()?; } - // Launch http service first, so that we can serve control-plane requests - // while configuration is still in progress. - let _http_handle = launch_http_server(cli.http_port, &compute); + // Launch the external HTTP server first, so that we can serve control plane + // requests while configuration is still in progress. + Server::External(cli.external_http_port).launch(&compute); + + // The internal HTTP server could be launched later, but there isn't much + // sense in waiting. + Server::Internal(cli.internal_http_port.unwrap_or(cli.external_http_port + 1)).launch(&compute); if !spec_set { // No spec provided, hang waiting for it. diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 7fc54bb490..cadc6f84d1 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -82,8 +82,10 @@ pub struct ComputeNode { /// - we push spec and it does configuration /// - but then it is restarted without any spec again pub live_config_allowed: bool, - /// The port that the compute's HTTP server listens on - pub http_port: u16, + /// The port that the compute's external HTTP server listens on + pub external_http_port: u16, + /// The port that the compute's internal HTTP server listens on + pub internal_http_port: u16, /// Volatile part of the `ComputeNode`, which should be used under `Mutex`. 
/// To allow HTTP API server to serving status requests, while configuration /// is in progress, lock should be held only for short periods of time to do @@ -631,7 +633,7 @@ impl ComputeNode { config::write_postgres_conf( &pgdata_path.join("postgresql.conf"), &pspec.spec, - self.http_port, + self.internal_http_port, )?; // Syncing safekeepers is only safe with primary nodes: if a primary @@ -1396,7 +1398,7 @@ impl ComputeNode { // Write new config let pgdata_path = Path::new(&self.pgdata); let postgresql_conf_path = pgdata_path.join("postgresql.conf"); - config::write_postgres_conf(&postgresql_conf_path, &spec, self.http_port)?; + config::write_postgres_conf(&postgresql_conf_path, &spec, self.internal_http_port)?; let max_concurrent_connections = spec.reconfigure_concurrency; diff --git a/compute_tools/src/http/mod.rs b/compute_tools/src/http/mod.rs index a596bea504..93eb6ef5b7 100644 --- a/compute_tools/src/http/mod.rs +++ b/compute_tools/src/http/mod.rs @@ -4,11 +4,9 @@ use http::{header::CONTENT_TYPE, StatusCode}; use serde::Serialize; use tracing::error; -pub use server::launch_http_server; - mod extract; mod routes; -mod server; +pub mod server; /// Convenience response builder for JSON responses struct JsonResponse; diff --git a/compute_tools/src/http/server.rs b/compute_tools/src/http/server.rs index 19dded5172..a523ecd96f 100644 --- a/compute_tools/src/http/server.rs +++ b/compute_tools/src/http/server.rs @@ -1,9 +1,11 @@ use std::{ + fmt::Display, net::{IpAddr, Ipv6Addr, SocketAddr}, sync::Arc, time::Duration, }; +use anyhow::Result; use axum::{ extract::Request, middleware::{self, Next}, @@ -24,45 +26,65 @@ use super::routes::{ }; use crate::compute::ComputeNode; -async fn handle_404() -> Response { - StatusCode::NOT_FOUND.into_response() -} - const X_REQUEST_ID: &str = "x-request-id"; -/// This middleware function allows compute_ctl to generate its own request ID -/// if one isn't supplied. The control plane will always send one as a UUID. The -/// neon Postgres extension on the other hand does not send one. -async fn maybe_add_request_id_header(mut request: Request, next: Next) -> Response { - let headers = request.headers_mut(); - - if headers.get(X_REQUEST_ID).is_none() { - headers.append(X_REQUEST_ID, Uuid::new_v4().to_string().parse().unwrap()); - } - - next.run(request).await +/// `compute_ctl` has two servers: internal and external. The internal server +/// binds to the loopback interface and handles communication from clients on +/// the compute. The external server is what receives communication from the +/// control plane, the metrics scraper, etc. We make the distinction because +/// certain routes in `compute_ctl` only need to be exposed to local processes +/// like Postgres via the neon extension and local_proxy. +#[derive(Clone, Copy, Debug)] +pub enum Server { + Internal(u16), + External(u16), } -/// Run the HTTP server and wait on it forever. 
-async fn serve(port: u16, compute: Arc) { - let mut app = Router::new() - .route("/check_writability", post(check_writability::is_writable)) - .route("/configure", post(configure::configure)) - .route("/database_schema", get(database_schema::get_schema_dump)) - .route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects)) - .route( - "/extension_server/{*filename}", - post(extension_server::download_extension), - ) - .route("/extensions", post(extensions::install_extension)) - .route("/grants", post(grants::add_grant)) - .route("/insights", get(insights::get_insights)) - .route("/metrics", get(metrics::get_metrics)) - .route("/metrics.json", get(metrics_json::get_metrics)) - .route("/status", get(status::get_status)) - .route("/terminate", post(terminate::terminate)) - .fallback(handle_404) - .layer( +impl Display for Server { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Server::Internal(_) => f.write_str("internal"), + Server::External(_) => f.write_str("external"), + } + } +} + +impl From for Router> { + fn from(server: Server) -> Self { + let mut router = Router::>::new(); + + router = match server { + Server::Internal(_) => { + router = router + .route( + "/extension_server/{*filename}", + post(extension_server::download_extension), + ) + .route("/extensions", post(extensions::install_extension)) + .route("/grants", post(grants::add_grant)); + + // Add in any testing support + if cfg!(feature = "testing") { + use super::routes::failpoints; + + router = router.route("/failpoints", post(failpoints::configure_failpoints)); + } + + router + } + Server::External(_) => router + .route("/check_writability", post(check_writability::is_writable)) + .route("/configure", post(configure::configure)) + .route("/database_schema", get(database_schema::get_schema_dump)) + .route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects)) + .route("/insights", get(insights::get_insights)) + .route("/metrics", get(metrics::get_metrics)) + .route("/metrics.json", get(metrics_json::get_metrics)) + .route("/status", get(status::get_status)) + .route("/terminate", post(terminate::terminate)), + }; + + router.fallback(Server::handle_404).method_not_allowed_fallback(Server::handle_405).layer( ServiceBuilder::new() // Add this middleware since we assume the request ID exists .layer(middleware::from_fn(maybe_add_request_id_header)) @@ -102,43 +124,88 @@ async fn serve(port: u16, compute: Arc) { ) .layer(PropagateRequestIdLayer::x_request_id()), ) - .with_state(compute); + } +} - // Add in any testing support - if cfg!(feature = "testing") { - use super::routes::failpoints; - - app = app.route("/failpoints", post(failpoints::configure_failpoints)) +impl Server { + async fn handle_404() -> impl IntoResponse { + StatusCode::NOT_FOUND } - // This usually binds to both IPv4 and IPv6 on Linux, see - // https://github.com/rust-lang/rust/pull/34440 for more information - let addr = SocketAddr::new(IpAddr::from(Ipv6Addr::UNSPECIFIED), port); - let listener = match TcpListener::bind(&addr).await { - Ok(listener) => listener, - Err(e) => { - error!( - "failed to bind the compute_ctl HTTP server to port {}: {}", - port, e - ); - return; + async fn handle_405() -> impl IntoResponse { + StatusCode::METHOD_NOT_ALLOWED + } + + async fn listener(&self) -> Result { + let addr = SocketAddr::new(self.ip(), self.port()); + let listener = TcpListener::bind(&addr).await?; + + Ok(listener) + } + + fn ip(&self) -> IpAddr { + match self { + // TODO: Change this to Ipv6Addr::LOCALHOST when 
the GitHub runners + // allow binding to localhost + Server::Internal(_) => IpAddr::from(Ipv6Addr::UNSPECIFIED), + Server::External(_) => IpAddr::from(Ipv6Addr::UNSPECIFIED), } - }; - - if let Ok(local_addr) = listener.local_addr() { - info!("compute_ctl HTTP server listening on {}", local_addr); - } else { - info!("compute_ctl HTTP server listening on port {}", port); } - if let Err(e) = axum::serve(listener, app).await { - error!("compute_ctl HTTP server error: {}", e); + fn port(self) -> u16 { + match self { + Server::Internal(port) => port, + Server::External(port) => port, + } + } + + async fn serve(self, compute: Arc) { + let listener = self.listener().await.unwrap_or_else(|e| { + // If we can't bind, the compute cannot operate correctly + panic!( + "failed to bind the compute_ctl {} HTTP server to {}: {}", + self, + SocketAddr::new(self.ip(), self.port()), + e + ); + }); + + if tracing::enabled!(tracing::Level::INFO) { + let local_addr = match listener.local_addr() { + Ok(local_addr) => local_addr, + Err(_) => SocketAddr::new(self.ip(), self.port()), + }; + + info!( + "compute_ctl {} HTTP server listening at {}", + self, local_addr + ); + } + + let router = Router::from(self).with_state(compute); + + if let Err(e) = axum::serve(listener, router).await { + error!("compute_ctl {} HTTP server error: {}", self, e); + } + } + + pub fn launch(self, compute: &Arc) { + let state = Arc::clone(compute); + + info!("Launching the {} server", self); + + tokio::spawn(self.serve(state)); } } -/// Launch HTTP server in a new task and return its `JoinHandle`. -pub fn launch_http_server(port: u16, state: &Arc) -> tokio::task::JoinHandle<()> { - let state = Arc::clone(state); +/// This middleware function allows compute_ctl to generate its own request ID +/// if one isn't supplied. The control plane will always send one as a UUID. The +/// neon Postgres extension on the other hand does not send one. +async fn maybe_add_request_id_header(mut request: Request, next: Next) -> Response { + let headers = request.headers_mut(); + if headers.get(X_REQUEST_ID).is_none() { + headers.append(X_REQUEST_ID, Uuid::new_v4().to_string().parse().unwrap()); + } - tokio::spawn(serve(port, state)) + next.run(request).await } diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index ba67ffa2dd..02d793400a 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -552,8 +552,10 @@ struct EndpointCreateCmdArgs { lsn: Option, #[clap(long)] pg_port: Option, + #[clap(long, alias = "http-port")] + external_http_port: Option, #[clap(long)] - http_port: Option, + internal_http_port: Option, #[clap(long = "pageserver-id")] endpoint_pageserver_id: Option, @@ -1353,7 +1355,8 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res tenant_id, timeline_id, args.pg_port, - args.http_port, + args.external_http_port, + args.internal_http_port, args.pg_version, mode, !args.update_catalog, diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 6ee6f8f1ec..3b2634204c 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -37,6 +37,8 @@ //! ``` //! 
use std::collections::BTreeMap; +use std::net::IpAddr; +use std::net::Ipv4Addr; use std::net::SocketAddr; use std::net::TcpStream; use std::path::PathBuf; @@ -73,7 +75,8 @@ pub struct EndpointConf { timeline_id: TimelineId, mode: ComputeMode, pg_port: u16, - http_port: u16, + external_http_port: u16, + internal_http_port: u16, pg_version: u32, skip_pg_catalog_updates: bool, drop_subscriptions_before_start: bool, @@ -128,7 +131,7 @@ impl ComputeControlPlane { 1 + self .endpoints .values() - .map(|ep| std::cmp::max(ep.pg_address.port(), ep.http_address.port())) + .map(|ep| std::cmp::max(ep.pg_address.port(), ep.external_http_address.port())) .max() .unwrap_or(self.base_port) } @@ -140,18 +143,27 @@ impl ComputeControlPlane { tenant_id: TenantId, timeline_id: TimelineId, pg_port: Option, - http_port: Option, + external_http_port: Option, + internal_http_port: Option, pg_version: u32, mode: ComputeMode, skip_pg_catalog_updates: bool, drop_subscriptions_before_start: bool, ) -> Result> { let pg_port = pg_port.unwrap_or_else(|| self.get_port()); - let http_port = http_port.unwrap_or_else(|| self.get_port() + 1); + let external_http_port = external_http_port.unwrap_or_else(|| self.get_port() + 1); + let internal_http_port = internal_http_port.unwrap_or_else(|| external_http_port + 1); let ep = Arc::new(Endpoint { endpoint_id: endpoint_id.to_owned(), - pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), pg_port), - http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), http_port), + pg_address: SocketAddr::new(IpAddr::from(Ipv4Addr::LOCALHOST), pg_port), + external_http_address: SocketAddr::new( + IpAddr::from(Ipv4Addr::UNSPECIFIED), + external_http_port, + ), + internal_http_address: SocketAddr::new( + IpAddr::from(Ipv4Addr::LOCALHOST), + internal_http_port, + ), env: self.env.clone(), timeline_id, mode, @@ -176,7 +188,8 @@ impl ComputeControlPlane { tenant_id, timeline_id, mode, - http_port, + external_http_port, + internal_http_port, pg_port, pg_version, skip_pg_catalog_updates, @@ -230,9 +243,10 @@ pub struct Endpoint { pub timeline_id: TimelineId, pub mode: ComputeMode, - // port and address of the Postgres server and `compute_ctl`'s HTTP API + // port and address of the Postgres server and `compute_ctl`'s HTTP APIs pub pg_address: SocketAddr, - pub http_address: SocketAddr, + pub external_http_address: SocketAddr, + pub internal_http_address: SocketAddr, // postgres major version in the format: 14, 15, etc. 
pg_version: u32, @@ -287,8 +301,15 @@ impl Endpoint { serde_json::from_slice(&std::fs::read(entry.path().join("endpoint.json"))?)?; Ok(Endpoint { - pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.pg_port), - http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.http_port), + pg_address: SocketAddr::new(IpAddr::from(Ipv4Addr::LOCALHOST), conf.pg_port), + external_http_address: SocketAddr::new( + IpAddr::from(Ipv4Addr::UNSPECIFIED), + conf.external_http_port, + ), + internal_http_address: SocketAddr::new( + IpAddr::from(Ipv4Addr::LOCALHOST), + conf.internal_http_port, + ), endpoint_id, env: env.clone(), timeline_id: conf.timeline_id, @@ -650,40 +671,51 @@ impl Endpoint { println!("Also at '{}'", conn_str); } let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl")); - cmd.args(["--http-port", &self.http_address.port().to_string()]) - .args(["--pgdata", self.pgdata().to_str().unwrap()]) - .args(["--connstr", &conn_str]) - .args([ - "--spec-path", - self.endpoint_path().join("spec.json").to_str().unwrap(), - ]) - .args([ - "--pgbin", - self.env - .pg_bin_dir(self.pg_version)? - .join("postgres") - .to_str() - .unwrap(), - ]) - // TODO: It would be nice if we generated compute IDs with the same - // algorithm as the real control plane. - // - // TODO: Add this back when - // https://github.com/neondatabase/neon/pull/10747 is merged. - // - //.args([ - // "--compute-id", - // &format!( - // "compute-{}", - // SystemTime::now() - // .duration_since(UNIX_EPOCH) - // .unwrap() - // .as_secs() - // ), - //]) - .stdin(std::process::Stdio::null()) - .stderr(logfile.try_clone()?) - .stdout(logfile); + //cmd.args([ + // "--external-http-port", + // &self.external_http_address.port().to_string(), + //]) + //.args([ + // "--internal-http-port", + // &self.internal_http_address.port().to_string(), + //]) + cmd.args([ + "--http-port", + &self.external_http_address.port().to_string(), + ]) + .args(["--pgdata", self.pgdata().to_str().unwrap()]) + .args(["--connstr", &conn_str]) + .args([ + "--spec-path", + self.endpoint_path().join("spec.json").to_str().unwrap(), + ]) + .args([ + "--pgbin", + self.env + .pg_bin_dir(self.pg_version)? + .join("postgres") + .to_str() + .unwrap(), + ]) + // TODO: It would be nice if we generated compute IDs with the same + // algorithm as the real control plane. + // + // TODO: Add this back when + // https://github.com/neondatabase/neon/pull/10747 is merged. + // + //.args([ + // "--compute-id", + // &format!( + // "compute-{}", + // SystemTime::now() + // .duration_since(UNIX_EPOCH) + // .unwrap() + // .as_secs() + // ), + //]) + .stdin(std::process::Stdio::null()) + .stderr(logfile.try_clone()?) 
+ .stdout(logfile); if let Some(remote_ext_config) = remote_ext_config { cmd.args(["--remote-ext-config", remote_ext_config]); @@ -770,8 +802,8 @@ impl Endpoint { reqwest::Method::GET, format!( "http://{}:{}/status", - self.http_address.ip(), - self.http_address.port() + self.external_http_address.ip(), + self.external_http_address.port() ), ) .send() @@ -844,8 +876,8 @@ impl Endpoint { let response = client .post(format!( "http://{}:{}/configure", - self.http_address.ip(), - self.http_address.port() + self.external_http_address.ip(), + self.external_http_address.port() )) .header(CONTENT_TYPE.as_str(), "application/json") .body(format!( diff --git a/proxy/src/bin/local_proxy.rs b/proxy/src/bin/local_proxy.rs index 7a855bf54b..8d8a4c124a 100644 --- a/proxy/src/bin/local_proxy.rs +++ b/proxy/src/bin/local_proxy.rs @@ -85,8 +85,8 @@ struct LocalProxyCliArgs { /// Address of the postgres server #[clap(long, default_value = "127.0.0.1:5432")] postgres: SocketAddr, - /// Address of the compute-ctl api service - #[clap(long, default_value = "http://127.0.0.1:3080/")] + /// Address of the internal compute-ctl api service + #[clap(long, default_value = "http://127.0.0.1:3081/")] compute_ctl: ApiUrl, /// Path of the local proxy config file #[clap(long, default_value = "./local_proxy.json")] diff --git a/test_runner/fixtures/endpoint/http.py b/test_runner/fixtures/endpoint/http.py index 6e8210e978..cdc162fca2 100644 --- a/test_runner/fixtures/endpoint/http.py +++ b/test_runner/fixtures/endpoint/http.py @@ -9,21 +9,23 @@ from requests.adapters import HTTPAdapter class EndpointHttpClient(requests.Session): def __init__( self, - port: int, + external_port: int, + internal_port: int, ): super().__init__() - self.port = port + self.external_port: int = external_port + self.internal_port: int = internal_port self.mount("http://", HTTPAdapter()) def dbs_and_roles(self): - res = self.get(f"http://localhost:{self.port}/dbs_and_roles") + res = self.get(f"http://localhost:{self.external_port}/dbs_and_roles") res.raise_for_status() return res.json() def database_schema(self, database: str): res = self.get( - f"http://localhost:{self.port}/database_schema?database={urllib.parse.quote(database, safe='')}" + f"http://localhost:{self.external_port}/database_schema?database={urllib.parse.quote(database, safe='')}" ) res.raise_for_status() return res.text @@ -34,20 +36,20 @@ class EndpointHttpClient(requests.Session): "version": version, "database": database, } - res = self.post(f"http://localhost:{self.port}/extensions", json=body) + res = self.post(f"http://localhost:{self.internal_port}/extensions", json=body) res.raise_for_status() return res.json() def set_role_grants(self, database: str, role: str, schema: str, privileges: list[str]): res = self.post( - f"http://localhost:{self.port}/grants", + f"http://localhost:{self.internal_port}/grants", json={"database": database, "schema": schema, "role": role, "privileges": privileges}, ) res.raise_for_status() return res.json() def metrics(self) -> str: - res = self.get(f"http://localhost:{self.port}/metrics") + res = self.get(f"http://localhost:{self.external_port}/metrics") res.raise_for_status() return res.text @@ -62,5 +64,5 @@ class EndpointHttpClient(requests.Session): } ) - res = self.post(f"http://localhost:{self.port}/failpoints", json=body) + res = self.post(f"http://localhost:{self.internal_port}/failpoints", json=body) res.raise_for_status() diff --git a/test_runner/fixtures/neon_cli.py b/test_runner/fixtures/neon_cli.py index 33d422c590..6a016d2621 100644 
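The fixture changes above also show how the API surface is split between the two listeners: per the test client, `/status`, `/configure`, `/metrics`, `/dbs_and_roles` and `/database_schema` stay on the external port, while `/extensions`, `/grants` and `/failpoints` move to the internal, loopback-only port (hence the new `local_proxy` default of `127.0.0.1:3081`). A rough sketch of that split, assuming axum; the `todo` handler is a placeholder, not compute_ctl's real handlers:

use axum::routing::{get, post};
use axum::Router;

// Placeholder handler standing in for the real compute_ctl handlers.
async fn todo() -> &'static str {
    "not implemented in this sketch"
}

// Served on 0.0.0.0:<external_http_port>.
fn external_router() -> Router {
    Router::new()
        .route("/status", get(todo))
        .route("/configure", post(todo))
        .route("/metrics", get(todo))
        .route("/dbs_and_roles", get(todo))
        .route("/database_schema", get(todo))
}

// Served on 127.0.0.1:<internal_http_port> only.
fn internal_router() -> Router {
    Router::new()
        .route("/extensions", post(todo))
        .route("/grants", post(todo))
        .route("/failpoints", post(todo))
}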
--- a/test_runner/fixtures/neon_cli.py +++ b/test_runner/fixtures/neon_cli.py @@ -478,7 +478,8 @@ class NeonLocalCli(AbstractNeonCli): self, branch_name: str, pg_port: int, - http_port: int, + external_http_port: int, + internal_http_port: int, tenant_id: TenantId, pg_version: PgVersion, endpoint_id: str | None = None, @@ -501,8 +502,10 @@ class NeonLocalCli(AbstractNeonCli): args.extend(["--lsn", str(lsn)]) if pg_port is not None: args.extend(["--pg-port", str(pg_port)]) - if http_port is not None: - args.extend(["--http-port", str(http_port)]) + if external_http_port is not None: + args.extend(["--external-http-port", str(external_http_port)]) + if internal_http_port is not None: + args.extend(["--internal-http-port", str(internal_http_port)]) if endpoint_id is not None: args.append(endpoint_id) if hot_standby: diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 3d3a445b97..41e9952b8a 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3807,7 +3807,8 @@ class Endpoint(PgProtocol, LogUtils): env: NeonEnv, tenant_id: TenantId, pg_port: int, - http_port: int, + external_http_port: int, + internal_http_port: int, check_stop_result: bool = True, ): super().__init__(host="localhost", port=pg_port, user="cloud_admin", dbname="postgres") @@ -3817,7 +3818,8 @@ class Endpoint(PgProtocol, LogUtils): self.pgdata_dir: Path | None = None # Path to computenode PGDATA self.tenant_id = tenant_id self.pg_port = pg_port - self.http_port = http_port + self.external_http_port = external_http_port + self.internal_http_port = internal_http_port self.check_stop_result = check_stop_result # passed to endpoint create and endpoint reconfigure self.active_safekeepers: list[int] = list(map(lambda sk: sk.id, env.safekeepers)) @@ -3834,7 +3836,8 @@ class Endpoint(PgProtocol, LogUtils): self, auth_token: str | None = None, retries: Retry | None = None ) -> EndpointHttpClient: return EndpointHttpClient( - port=self.http_port, + external_port=self.external_http_port, + internal_port=self.internal_http_port, ) def create( @@ -3866,7 +3869,8 @@ class Endpoint(PgProtocol, LogUtils): lsn=lsn, hot_standby=hot_standby, pg_port=self.pg_port, - http_port=self.http_port, + external_http_port=self.external_http_port, + internal_http_port=self.internal_http_port, pg_version=self.env.pg_version, pageserver_id=pageserver_id, allow_multiple=allow_multiple, @@ -4258,7 +4262,8 @@ class EndpointFactory: self.env, tenant_id=tenant_id or self.env.initial_tenant, pg_port=self.env.port_distributor.get_port(), - http_port=self.env.port_distributor.get_port(), + external_http_port=self.env.port_distributor.get_port(), + internal_http_port=self.env.port_distributor.get_port(), ) self.num_instances += 1 self.endpoints.append(ep) @@ -4288,7 +4293,8 @@ class EndpointFactory: self.env, tenant_id=tenant_id or self.env.initial_tenant, pg_port=self.env.port_distributor.get_port(), - http_port=self.env.port_distributor.get_port(), + external_http_port=self.env.port_distributor.get_port(), + internal_http_port=self.env.port_distributor.get_port(), ) endpoint_id = endpoint_id or self.env.generate_endpoint_id() diff --git a/test_runner/performance/test_lazy_startup.py b/test_runner/performance/test_lazy_startup.py index 704073fe3b..3bf3ef890f 100644 --- a/test_runner/performance/test_lazy_startup.py +++ b/test_runner/performance/test_lazy_startup.py @@ -79,7 +79,9 @@ def test_lazy_startup(slru: str, neon_env_builder: NeonEnvBuilder, zenbenchmark: assert sum == 
1000000 # Get metrics - metrics = requests.get(f"http://localhost:{endpoint.http_port}/metrics.json").json() + metrics = requests.get( + f"http://localhost:{endpoint.external_http_port}/metrics.json" + ).json() durations = { "wait_for_spec_ms": f"{slru}_{i}_wait_for_spec", "sync_safekeepers_ms": f"{slru}_{i}_sync_safekeepers", diff --git a/test_runner/performance/test_startup.py b/test_runner/performance/test_startup.py index d051717e92..60d8b5be30 100644 --- a/test_runner/performance/test_startup.py +++ b/test_runner/performance/test_startup.py @@ -56,7 +56,9 @@ def test_startup_simple(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenc endpoint.safe_psql("select 1;") # Get metrics - metrics = requests.get(f"http://localhost:{endpoint.http_port}/metrics.json").json() + metrics = requests.get( + f"http://localhost:{endpoint.external_http_port}/metrics.json" + ).json() durations = { "wait_for_spec_ms": f"{i}_wait_for_spec", "sync_safekeepers_ms": f"{i}_sync_safekeepers", diff --git a/test_runner/regress/test_neon_local_cli.py b/test_runner/regress/test_neon_local_cli.py index 80e26d9432..8d9aab6848 100644 --- a/test_runner/regress/test_neon_local_cli.py +++ b/test_runner/regress/test_neon_local_cli.py @@ -17,11 +17,13 @@ def test_neon_cli_basics(neon_env_builder: NeonEnvBuilder, port_distributor: Por main_branch_name = "main" pg_port = port_distributor.get_port() - http_port = port_distributor.get_port() + external_http_port = port_distributor.get_port() + internal_http_port = port_distributor.get_port() env.neon_cli.endpoint_create( main_branch_name, pg_port, - http_port, + external_http_port, + internal_http_port, endpoint_id="ep-basic-main", tenant_id=env.initial_tenant, pg_version=env.pg_version, @@ -35,11 +37,13 @@ def test_neon_cli_basics(neon_env_builder: NeonEnvBuilder, port_distributor: Por new_branch_name=branch_name, ) pg_port = port_distributor.get_port() - http_port = port_distributor.get_port() + external_http_port = port_distributor.get_port() + internal_http_port = port_distributor.get_port() env.neon_cli.endpoint_create( branch_name, pg_port, - http_port, + external_http_port, + internal_http_port, endpoint_id=f"ep-{branch_name}", tenant_id=env.initial_tenant, pg_version=env.pg_version, @@ -59,23 +63,27 @@ def test_neon_two_primary_endpoints_fail( branch_name = "main" pg_port = port_distributor.get_port() - http_port = port_distributor.get_port() + external_http_port = port_distributor.get_port() + internal_http_port = port_distributor.get_port() env.neon_cli.endpoint_create( branch_name, pg_port, - http_port, + external_http_port, + internal_http_port, endpoint_id="ep1", tenant_id=env.initial_tenant, pg_version=env.pg_version, ) pg_port = port_distributor.get_port() - http_port = port_distributor.get_port() + external_http_port = port_distributor.get_port() + internal_http_port = port_distributor.get_port() # ep1 is not running so create will succeed env.neon_cli.endpoint_create( branch_name, pg_port, - http_port, + external_http_port, + internal_http_port, endpoint_id="ep2", tenant_id=env.initial_tenant, pg_version=env.pg_version, diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index b32b028fa1..936c774657 100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -268,7 +268,8 @@ def endpoint_create_start( env, tenant_id=env.initial_tenant, pg_port=env.port_distributor.get_port(), - http_port=env.port_distributor.get_port(), + 
external_http_port=env.port_distributor.get_port(), + internal_http_port=env.port_distributor.get_port(), # In these tests compute has high probability of terminating on its own # before our stop() due to lost consensus leadership. check_stop_result=False, From f62bc28086ef6649bb9f58ef4431ed53f7f48f2a Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Tue, 11 Feb 2025 20:46:23 +0100 Subject: [PATCH 436/583] proxy: Move binaries into the lib (#10758) * This way all clippy lints defined in the lib also cover the binary code. * It's much easier to detect unused code. * Fix all discovered lints. --- proxy/src/bin/local_proxy.rs | 411 +---------- proxy/src/bin/pg_sni_router.rs | 301 +------- proxy/src/bin/proxy.rs | 826 +-------------------- proxy/src/binary/local_proxy.rs | 410 +++++++++++ proxy/src/binary/mod.rs | 7 + proxy/src/binary/pg_sni_router.rs | 304 ++++++++ proxy/src/binary/proxy.rs | 827 ++++++++++++++++++++++ proxy/src/compute_ctl/mod.rs | 12 +- proxy/src/config.rs | 1 - proxy/src/control_plane/client/mod.rs | 6 +- proxy/src/control_plane/messages.rs | 3 +- proxy/src/lib.rs | 62 +- proxy/src/metrics.rs | 3 +- proxy/src/redis/cancellation_publisher.rs | 36 - proxy/src/serverless/sql_over_http.rs | 1 - 15 files changed, 1601 insertions(+), 1609 deletions(-) create mode 100644 proxy/src/binary/local_proxy.rs create mode 100644 proxy/src/binary/mod.rs create mode 100644 proxy/src/binary/pg_sni_router.rs create mode 100644 proxy/src/binary/proxy.rs diff --git a/proxy/src/bin/local_proxy.rs b/proxy/src/bin/local_proxy.rs index 8d8a4c124a..8f225dc1e0 100644 --- a/proxy/src/bin/local_proxy.rs +++ b/proxy/src/bin/local_proxy.rs @@ -1,416 +1,7 @@ -use std::net::SocketAddr; -use std::pin::pin; -use std::str::FromStr; -use std::sync::Arc; -use std::time::Duration; - -use anyhow::{bail, ensure, Context}; -use camino::{Utf8Path, Utf8PathBuf}; -use compute_api::spec::LocalProxySpec; -use futures::future::Either; -use proxy::auth::backend::jwt::JwkCache; -use proxy::auth::backend::local::{LocalBackend, JWKS_ROLE_MAP}; -use proxy::auth::{self}; -use proxy::cancellation::CancellationHandler; -use proxy::config::{ - self, AuthenticationConfig, ComputeConfig, HttpConfig, ProxyConfig, RetryConfig, -}; -use proxy::control_plane::locks::ApiLocks; -use proxy::control_plane::messages::{EndpointJwksResponse, JwksSettings}; -use proxy::http::health_server::AppMetrics; -use proxy::intern::RoleNameInt; -use proxy::metrics::{Metrics, ThreadPoolMetrics}; -use proxy::rate_limiter::{ - BucketRateLimiter, EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo, -}; -use proxy::scram::threadpool::ThreadPool; -use proxy::serverless::cancel_set::CancelSet; -use proxy::serverless::{self, GlobalConnPoolOptions}; -use proxy::tls::client_config::compute_client_config_with_root_certs; -use proxy::types::RoleName; -use proxy::url::ApiUrl; - -project_git_version!(GIT_VERSION); -project_build_tag!(BUILD_TAG); - -use clap::Parser; -use thiserror::Error; -use tokio::net::TcpListener; -use tokio::sync::Notify; -use tokio::task::JoinSet; -use tokio_util::sync::CancellationToken; -use tracing::{debug, error, info, warn}; -use utils::sentry_init::init_sentry; -use utils::{pid_file, project_build_tag, project_git_version}; - #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; -/// Neon proxy/router -#[derive(Parser)] -#[command(version = GIT_VERSION, about)] -struct LocalProxyCliArgs { - /// listen for incoming metrics connections on ip:port - #[clap(long, default_value = "127.0.0.1:7001")] - 
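The shape of this patch is the same for all three proxy binaries: the entire body of each `src/bin/*.rs` moves into a `binary` module of the library (e.g. `proxy::binary::local_proxy`), and the binary keeps only the jemalloc allocator plus a one-line `main` that delegates to it. A compiling schematic of that split, with the library side modeled as an inline module and its body reduced to a placeholder:

// In the real patch this module lives in the library crate
// (proxy/src/binary/local_proxy.rs); it is inlined here only to keep the
// sketch self-contained.
mod binary {
    pub mod local_proxy {
        pub async fn run() -> anyhow::Result<()> {
            // CLI parsing, config building and task spawning move here, where
            // the library's clippy lints and dead-code detection now apply.
            Ok(())
        }
    }
}

#[global_allocator]
static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    binary::local_proxy::run().await
}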
metrics: String, - /// listen for incoming http connections on ip:port - #[clap(long)] - http: String, - /// timeout for the TLS handshake - #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] - handshake_timeout: tokio::time::Duration, - /// lock for `connect_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable). - #[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK)] - connect_compute_lock: String, - #[clap(flatten)] - sql_over_http: SqlOverHttpArgs, - /// User rate limiter max number of requests per second. - /// - /// Provided in the form `@`. - /// Can be given multiple times for different bucket sizes. - #[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)] - user_rps_limit: Vec, - /// Whether the auth rate limiter actually takes effect (for testing) - #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] - auth_rate_limit_enabled: bool, - /// Authentication rate limiter max number of hashes per second. - #[clap(long, default_values_t = RateBucketInfo::DEFAULT_AUTH_SET)] - auth_rate_limit: Vec, - /// The IP subnet to use when considering whether two IP addresses are considered the same. - #[clap(long, default_value_t = 64)] - auth_rate_limit_ip_subnet: u8, - /// Whether to retry the connection to the compute node - #[clap(long, default_value = config::RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES)] - connect_to_compute_retry: String, - /// Address of the postgres server - #[clap(long, default_value = "127.0.0.1:5432")] - postgres: SocketAddr, - /// Address of the internal compute-ctl api service - #[clap(long, default_value = "http://127.0.0.1:3081/")] - compute_ctl: ApiUrl, - /// Path of the local proxy config file - #[clap(long, default_value = "./local_proxy.json")] - config_path: Utf8PathBuf, - /// Path of the local proxy PID file - #[clap(long, default_value = "./local_proxy.pid")] - pid_path: Utf8PathBuf, -} - -#[derive(clap::Args, Clone, Copy, Debug)] -struct SqlOverHttpArgs { - /// How many connections to pool for each endpoint. 
Excess connections are discarded - #[clap(long, default_value_t = 200)] - sql_over_http_pool_max_total_conns: usize, - - /// How long pooled connections should remain idle for before closing - #[clap(long, default_value = "5m", value_parser = humantime::parse_duration)] - sql_over_http_idle_timeout: tokio::time::Duration, - - #[clap(long, default_value_t = 100)] - sql_over_http_client_conn_threshold: u64, - - #[clap(long, default_value_t = 16)] - sql_over_http_cancel_set_shards: usize, - - #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB - sql_over_http_max_request_size_bytes: usize, - - #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB - sql_over_http_max_response_size_bytes: usize, -} - #[tokio::main] async fn main() -> anyhow::Result<()> { - let _logging_guard = proxy::logging::init_local_proxy()?; - let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook(); - let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); - - Metrics::install(Arc::new(ThreadPoolMetrics::new(0))); - - // TODO: refactor these to use labels - debug!("Version: {GIT_VERSION}"); - debug!("Build_tag: {BUILD_TAG}"); - let neon_metrics = ::metrics::NeonMetrics::new(::metrics::BuildInfo { - revision: GIT_VERSION, - build_tag: BUILD_TAG, - }); - - let jemalloc = match proxy::jemalloc::MetricRecorder::new() { - Ok(t) => Some(t), - Err(e) => { - tracing::error!(error = ?e, "could not start jemalloc metrics loop"); - None - } - }; - - let args = LocalProxyCliArgs::parse(); - let config = build_config(&args)?; - let auth_backend = build_auth_backend(&args)?; - - // before we bind to any ports, write the process ID to a file - // so that compute-ctl can find our process later - // in order to trigger the appropriate SIGHUP on config change. - // - // This also claims a "lock" that makes sure only one instance - // of local_proxy runs at a time. - let _process_guard = loop { - match pid_file::claim_for_current_process(&args.pid_path) { - Ok(guard) => break guard, - Err(e) => { - // compute-ctl might have tried to read the pid-file to let us - // know about some config change. We should try again. - error!(path=?args.pid_path, "could not claim PID file guard: {e:?}"); - tokio::time::sleep(Duration::from_secs(1)).await; - } - } - }; - - let metrics_listener = TcpListener::bind(args.metrics).await?.into_std()?; - let http_listener = TcpListener::bind(args.http).await?; - let shutdown = CancellationToken::new(); - - // todo: should scale with CU - let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards( - LeakyBucketConfig { - rps: 10.0, - max: 100.0, - }, - 16, - )); - - let mut maintenance_tasks = JoinSet::new(); - - let refresh_config_notify = Arc::new(Notify::new()); - maintenance_tasks.spawn(proxy::signals::handle(shutdown.clone(), { - let refresh_config_notify = Arc::clone(&refresh_config_notify); - move || { - refresh_config_notify.notify_one(); - } - })); - - // trigger the first config load **after** setting up the signal hook - // to avoid the race condition where: - // 1. No config file registered when local_proxy starts up - // 2. The config file is written but the signal hook is not yet received - // 3. local_proxy completes startup but has no config loaded, despite there being a registerd config. 
- refresh_config_notify.notify_one(); - tokio::spawn(refresh_config_loop(args.config_path, refresh_config_notify)); - - maintenance_tasks.spawn(proxy::http::health_server::task_main( - metrics_listener, - AppMetrics { - jemalloc, - neon_metrics, - proxy: proxy::metrics::Metrics::get(), - }, - )); - - let task = serverless::task_main( - config, - auth_backend, - http_listener, - shutdown.clone(), - Arc::new(CancellationHandler::new(&config.connect_to_compute, None)), - endpoint_rate_limiter, - ); - - match futures::future::select(pin!(maintenance_tasks.join_next()), pin!(task)).await { - // exit immediately on maintenance task completion - Either::Left((Some(res), _)) => match proxy::error::flatten_err(res)? {}, - // exit with error immediately if all maintenance tasks have ceased (should be caught by branch above) - Either::Left((None, _)) => bail!("no maintenance tasks running. invalid state"), - // exit immediately on client task error - Either::Right((res, _)) => res?, - } - - Ok(()) -} - -/// ProxyConfig is created at proxy startup, and lives forever. -fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { - let config::ConcurrencyLockOptions { - shards, - limiter, - epoch, - timeout, - } = args.connect_compute_lock.parse()?; - info!( - ?limiter, - shards, - ?epoch, - "Using NodeLocks (connect_compute)" - ); - let connect_compute_locks = ApiLocks::new( - "connect_compute_lock", - limiter, - shards, - timeout, - epoch, - &Metrics::get().proxy.connect_compute_lock, - )?; - - let http_config = HttpConfig { - accept_websockets: false, - pool_options: GlobalConnPoolOptions { - gc_epoch: Duration::from_secs(60), - pool_shards: 2, - idle_timeout: args.sql_over_http.sql_over_http_idle_timeout, - opt_in: false, - - max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_total_conns, - max_total_conns: args.sql_over_http.sql_over_http_pool_max_total_conns, - }, - cancel_set: CancelSet::new(args.sql_over_http.sql_over_http_cancel_set_shards), - client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold, - max_request_size_bytes: args.sql_over_http.sql_over_http_max_request_size_bytes, - max_response_size_bytes: args.sql_over_http.sql_over_http_max_response_size_bytes, - }; - - let compute_config = ComputeConfig { - retry: RetryConfig::parse(RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES)?, - tls: Arc::new(compute_client_config_with_root_certs()?), - timeout: Duration::from_secs(2), - }; - - Ok(Box::leak(Box::new(ProxyConfig { - tls_config: None, - metric_collection: None, - http_config, - authentication_config: AuthenticationConfig { - jwks_cache: JwkCache::default(), - thread_pool: ThreadPool::new(0), - scram_protocol_timeout: Duration::from_secs(10), - rate_limiter_enabled: false, - rate_limiter: BucketRateLimiter::new(vec![]), - rate_limit_ip_subnet: 64, - ip_allowlist_check_enabled: true, - is_vpc_acccess_proxy: false, - is_auth_broker: false, - accept_jwts: true, - console_redirect_confirmation_timeout: Duration::ZERO, - }, - proxy_protocol_v2: config::ProxyProtocolV2::Rejected, - handshake_timeout: Duration::from_secs(10), - region: "local".into(), - wake_compute_retry_config: RetryConfig::parse(RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)?, - connect_compute_locks, - connect_to_compute: compute_config, - }))) -} - -/// auth::Backend is created at proxy startup, and lives forever. 
-fn build_auth_backend( - args: &LocalProxyCliArgs, -) -> anyhow::Result<&'static auth::Backend<'static, ()>> { - let auth_backend = proxy::auth::Backend::Local(proxy::auth::backend::MaybeOwned::Owned( - LocalBackend::new(args.postgres, args.compute_ctl.clone()), - )); - - Ok(Box::leak(Box::new(auth_backend))) -} - -#[derive(Error, Debug)] -enum RefreshConfigError { - #[error(transparent)] - Read(#[from] std::io::Error), - #[error(transparent)] - Parse(#[from] serde_json::Error), - #[error(transparent)] - Validate(anyhow::Error), -} - -async fn refresh_config_loop(path: Utf8PathBuf, rx: Arc) { - let mut init = true; - loop { - rx.notified().await; - - match refresh_config_inner(&path).await { - Ok(()) => {} - // don't log for file not found errors if this is the first time we are checking - // for computes that don't use local_proxy, this is not an error. - Err(RefreshConfigError::Read(e)) - if init && e.kind() == std::io::ErrorKind::NotFound => - { - debug!(error=?e, ?path, "could not read config file"); - } - Err(e) => { - error!(error=?e, ?path, "could not read config file"); - } - } - - init = false; - } -} - -async fn refresh_config_inner(path: &Utf8Path) -> Result<(), RefreshConfigError> { - let bytes = tokio::fs::read(&path).await?; - let data: LocalProxySpec = serde_json::from_slice(&bytes)?; - - let mut jwks_set = vec![]; - - fn parse_jwks_settings(jwks: compute_api::spec::JwksSettings) -> anyhow::Result { - let mut jwks_url = url::Url::from_str(&jwks.jwks_url).context("parsing JWKS url")?; - - ensure!( - jwks_url.has_authority() - && (jwks_url.scheme() == "http" || jwks_url.scheme() == "https"), - "Invalid JWKS url. Must be HTTP", - ); - - ensure!( - jwks_url.host().is_some_and(|h| h != url::Host::Domain("")), - "Invalid JWKS url. No domain listed", - ); - - // clear username, password and ports - jwks_url - .set_username("") - .expect("url can be a base and has a valid host and is not a file. should not error"); - jwks_url - .set_password(None) - .expect("url can be a base and has a valid host and is not a file. should not error"); - // local testing is hard if we need to have a specific restricted port - if cfg!(not(feature = "testing")) { - jwks_url.set_port(None).expect( - "url can be a base and has a valid host and is not a file. should not error", - ); - } - - // clear query params - jwks_url.set_fragment(None); - jwks_url.query_pairs_mut().clear().finish(); - - if jwks_url.scheme() != "https" { - // local testing is hard if we need to set up https support. 
- if cfg!(not(feature = "testing")) { - jwks_url - .set_scheme("https") - .expect("should not error to set the scheme to https if it was http"); - } else { - warn!(scheme = jwks_url.scheme(), "JWKS url is not HTTPS"); - } - } - - Ok(JwksSettings { - id: jwks.id, - jwks_url, - provider_name: jwks.provider_name, - jwt_audience: jwks.jwt_audience, - role_names: jwks - .role_names - .into_iter() - .map(RoleName::from) - .map(|s| RoleNameInt::from(&s)) - .collect(), - }) - } - - for jwks in data.jwks.into_iter().flatten() { - jwks_set.push(parse_jwks_settings(jwks).map_err(RefreshConfigError::Validate)?); - } - - info!("successfully loaded new config"); - JWKS_ROLE_MAP.store(Some(Arc::new(EndpointJwksResponse { jwks: jwks_set }))); - - Ok(()) + proxy::binary::local_proxy::run().await } diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index 97d870a83a..0c3326af85 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -1,299 +1,10 @@ -/// A stand-alone program that routes connections, e.g. from -/// `aaa--bbb--1234.external.domain` to `aaa.bbb.internal.domain:1234`. -/// -/// This allows connecting to pods/services running in the same Kubernetes cluster from -/// the outside. Similar to an ingress controller for HTTPS. -use std::{net::SocketAddr, sync::Arc}; - -use anyhow::{anyhow, bail, ensure, Context}; -use clap::Arg; -use futures::future::Either; -use futures::TryFutureExt; -use itertools::Itertools; -use proxy::context::RequestContext; -use proxy::metrics::{Metrics, ThreadPoolMetrics}; -use proxy::protocol2::ConnectionInfo; -use proxy::proxy::{copy_bidirectional_client_compute, run_until_cancelled, ErrorSource}; -use proxy::stream::{PqStream, Stream}; -use proxy::tls::TlsServerEndPoint; -use rustls::crypto::ring; -use rustls::pki_types::PrivateKeyDer; -use tokio::io::{AsyncRead, AsyncWrite}; -use tokio::net::TcpListener; -use tokio_util::sync::CancellationToken; -use tracing::{error, info, Instrument}; -use utils::project_git_version; -use utils::sentry_init::init_sentry; - -project_git_version!(GIT_VERSION); - -fn cli() -> clap::Command { - clap::Command::new("Neon proxy/router") - .version(GIT_VERSION) - .arg( - Arg::new("listen") - .short('l') - .long("listen") - .help("listen for incoming client connections on ip:port") - .default_value("127.0.0.1:4432"), - ) - .arg( - Arg::new("tls-key") - .short('k') - .long("tls-key") - .help("path to TLS key for client postgres connections") - .required(true), - ) - .arg( - Arg::new("tls-cert") - .short('c') - .long("tls-cert") - .help("path to TLS cert for client postgres connections") - .required(true), - ) - .arg( - Arg::new("dest") - .short('d') - .long("destination") - .help("append this domain zone to the SNI hostname to get the destination address") - .required(true), - ) -} +//! A stand-alone program that routes connections, e.g. from +//! `aaa--bbb--1234.external.domain` to `aaa.bbb.internal.domain:1234`. +//! +//! This allows connecting to pods/services running in the same Kubernetes cluster from +//! the outside. Similar to an ingress controller for HTTPS. 
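The routing scheme described in the doc comment above is purely name-based: the first DNS label of the SNI hostname is split on `--` into service, namespace and port, and the configured destination suffix is appended, mirroring the parsing in the removed `handle_client` further down. A small stand-alone sketch with error handling reduced to `Option`; the function name is illustrative:

// Map an SNI hostname like "aaa--bbb--1234.external.domain" to the in-cluster
// destination "aaa.bbb.<dest_suffix>:1234".
fn sni_to_destination(sni: &str, dest_suffix: &str) -> Option<String> {
    // Only the first label carries routing information.
    let (first_label, _rest) = sni.split_once('.')?;
    let mut parts = first_label.splitn(3, "--");
    let service = parts.next()?;
    let namespace = parts.next()?;
    let port: u16 = parts.next()?.parse().ok()?;
    Some(format!("{service}.{namespace}.{dest_suffix}:{port}"))
}

#[test]
fn maps_the_doc_comment_example() {
    assert_eq!(
        sni_to_destination("aaa--bbb--1234.external.domain", "internal.domain").as_deref(),
        Some("aaa.bbb.internal.domain:1234"),
    );
}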
#[tokio::main] async fn main() -> anyhow::Result<()> { - let _logging_guard = proxy::logging::init().await?; - let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook(); - let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); - - Metrics::install(Arc::new(ThreadPoolMetrics::new(0))); - - let args = cli().get_matches(); - let destination: String = args.get_one::("dest").unwrap().parse()?; - - // Configure TLS - let (tls_config, tls_server_end_point): (Arc, TlsServerEndPoint) = match ( - args.get_one::("tls-key"), - args.get_one::("tls-cert"), - ) { - (Some(key_path), Some(cert_path)) => { - let key = { - let key_bytes = std::fs::read(key_path).context("TLS key file")?; - - let mut keys = - rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]).collect_vec(); - - ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len()); - PrivateKeyDer::Pkcs8( - keys.pop() - .unwrap() - .context(format!("Failed to read TLS keys at '{key_path}'"))?, - ) - }; - - let cert_chain_bytes = std::fs::read(cert_path) - .context(format!("Failed to read TLS cert file at '{cert_path}.'"))?; - - let cert_chain: Vec<_> = { - rustls_pemfile::certs(&mut &cert_chain_bytes[..]) - .try_collect() - .with_context(|| { - format!("Failed to read TLS certificate chain from bytes from file at '{cert_path}'.") - })? - }; - - // needed for channel bindings - let first_cert = cert_chain.first().context("missing certificate")?; - let tls_server_end_point = TlsServerEndPoint::new(first_cert)?; - - let tls_config = - rustls::ServerConfig::builder_with_provider(Arc::new(ring::default_provider())) - .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12]) - .context("ring should support TLS1.2 and TLS1.3")? - .with_no_client_auth() - .with_single_cert(cert_chain, key)? - .into(); - - (tls_config, tls_server_end_point) - } - _ => bail!("tls-key and tls-cert must be specified"), - }; - - // Start listening for incoming client connections - let proxy_address: SocketAddr = args.get_one::("listen").unwrap().parse()?; - info!("Starting sni router on {proxy_address}"); - let proxy_listener = TcpListener::bind(proxy_address).await?; - - let cancellation_token = CancellationToken::new(); - - let main = tokio::spawn(task_main( - Arc::new(destination), - tls_config, - tls_server_end_point, - proxy_listener, - cancellation_token.clone(), - )); - let signals_task = tokio::spawn(proxy::signals::handle(cancellation_token, || {})); - - // the signal task cant ever succeed. - // the main task can error, or can succeed on cancellation. - // we want to immediately exit on either of these cases - let signal = match futures::future::select(signals_task, main).await { - Either::Left((res, _)) => proxy::error::flatten_err(res)?, - Either::Right((res, _)) => return proxy::error::flatten_err(res), - }; - - // maintenance tasks return `Infallible` success values, this is an impossible value - // so this match statically ensures that there are no possibilities for that value - match signal {} -} - -async fn task_main( - dest_suffix: Arc, - tls_config: Arc, - tls_server_end_point: TlsServerEndPoint, - listener: tokio::net::TcpListener, - cancellation_token: CancellationToken, -) -> anyhow::Result<()> { - // When set for the server socket, the keepalive setting - // will be inherited by all accepted client sockets. 
- socket2::SockRef::from(&listener).set_keepalive(true)?; - - let connections = tokio_util::task::task_tracker::TaskTracker::new(); - - while let Some(accept_result) = - run_until_cancelled(listener.accept(), &cancellation_token).await - { - let (socket, peer_addr) = accept_result?; - - let session_id = uuid::Uuid::new_v4(); - let tls_config = Arc::clone(&tls_config); - let dest_suffix = Arc::clone(&dest_suffix); - - connections.spawn( - async move { - socket - .set_nodelay(true) - .context("failed to set socket option")?; - - info!(%peer_addr, "serving"); - let ctx = RequestContext::new( - session_id, - ConnectionInfo { - addr: peer_addr, - extra: None, - }, - proxy::metrics::Protocol::SniRouter, - "sni", - ); - handle_client(ctx, dest_suffix, tls_config, tls_server_end_point, socket).await - } - .unwrap_or_else(|e| { - // Acknowledge that the task has finished with an error. - error!("per-client task finished with an error: {e:#}"); - }) - .instrument(tracing::info_span!("handle_client", ?session_id)), - ); - } - - connections.close(); - drop(listener); - - connections.wait().await; - - info!("all client connections have finished"); - Ok(()) -} - -const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; - -async fn ssl_handshake( - ctx: &RequestContext, - raw_stream: S, - tls_config: Arc, - tls_server_end_point: TlsServerEndPoint, -) -> anyhow::Result> { - let mut stream = PqStream::new(Stream::from_raw(raw_stream)); - - let msg = stream.read_startup_packet().await?; - use pq_proto::FeStartupPacket::*; - - match msg { - SslRequest { direct: false } => { - stream - .write_message(&pq_proto::BeMessage::EncryptionResponse(true)) - .await?; - - // Upgrade raw stream into a secure TLS-backed stream. - // NOTE: We've consumed `tls`; this fact will be used later. - - let (raw, read_buf) = stream.into_inner(); - // TODO: Normally, client doesn't send any data before - // server says TLS handshake is ok and read_buf is empty. - // However, you could imagine pipelining of postgres - // SSLRequest + TLS ClientHello in one hunk similar to - // pipelining in our node js driver. We should probably - // support that by chaining read_buf with the stream. - if !read_buf.is_empty() { - bail!("data is sent before server replied with EncryptionResponse"); - } - - Ok(Stream::Tls { - tls: Box::new( - raw.upgrade(tls_config, !ctx.has_private_peer_addr()) - .await?, - ), - tls_server_end_point, - }) - } - unexpected => { - info!( - ?unexpected, - "unexpected startup packet, rejecting connection" - ); - stream - .throw_error_str(ERR_INSECURE_CONNECTION, proxy::error::ErrorKind::User) - .await? - } - } -} - -async fn handle_client( - ctx: RequestContext, - dest_suffix: Arc, - tls_config: Arc, - tls_server_end_point: TlsServerEndPoint, - stream: impl AsyncRead + AsyncWrite + Unpin, -) -> anyhow::Result<()> { - let mut tls_stream = ssl_handshake(&ctx, stream, tls_config, tls_server_end_point).await?; - - // Cut off first part of the SNI domain - // We receive required destination details in the format of - // `{k8s_service_name}--{k8s_namespace}--{port}.non-sni-domain` - let sni = tls_stream.sni_hostname().ok_or(anyhow!("SNI missing"))?; - let dest: Vec<&str> = sni - .split_once('.') - .context("invalid SNI")? 
- .0 - .splitn(3, "--") - .collect(); - let port = dest[2].parse::().context("invalid port")?; - let destination = format!("{}.{}.{}:{}", dest[0], dest[1], dest_suffix, port); - - info!("destination: {}", destination); - - let mut client = tokio::net::TcpStream::connect(destination).await?; - - // doesn't yet matter as pg-sni-router doesn't report analytics logs - ctx.set_success(); - ctx.log_connect(); - - // Starting from here we only proxy the client's traffic. - info!("performing the proxy pass..."); - - match copy_bidirectional_client_compute(&mut tls_stream, &mut client).await { - Ok(_) => Ok(()), - Err(ErrorSource::Client(err)) => Err(err).context("client"), - Err(ErrorSource::Compute(err)) => Err(err).context("compute"), - } + proxy::binary::pg_sni_router::run().await } diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index de685a82c6..7d4b44841d 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -1,831 +1,7 @@ -use std::net::SocketAddr; -use std::pin::pin; -use std::sync::Arc; -use std::time::Duration; - -use anyhow::bail; -use futures::future::Either; -use proxy::auth::backend::jwt::JwkCache; -use proxy::auth::backend::{AuthRateLimiter, ConsoleRedirectBackend, MaybeOwned}; -use proxy::cancellation::{handle_cancel_messages, CancellationHandler}; -use proxy::config::{ - self, remote_storage_from_toml, AuthenticationConfig, CacheOptions, ComputeConfig, HttpConfig, - ProjectInfoCacheOptions, ProxyConfig, ProxyProtocolV2, -}; -use proxy::context::parquet::ParquetUploadArgs; -use proxy::http::health_server::AppMetrics; -use proxy::metrics::Metrics; -use proxy::rate_limiter::{ - EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo, WakeComputeRateLimiter, -}; -use proxy::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider; -use proxy::redis::kv_ops::RedisKVClient; -use proxy::redis::{elasticache, notifications}; -use proxy::scram::threadpool::ThreadPool; -use proxy::serverless::cancel_set::CancelSet; -use proxy::serverless::GlobalConnPoolOptions; -use proxy::tls::client_config::compute_client_config_with_root_certs; -use proxy::{auth, control_plane, http, serverless, usage_metrics}; -use remote_storage::RemoteStorageConfig; -use tokio::net::TcpListener; -use tokio::task::JoinSet; -use tokio_util::sync::CancellationToken; -use tracing::{info, warn, Instrument}; -use utils::sentry_init::init_sentry; -use utils::{project_build_tag, project_git_version}; - -project_git_version!(GIT_VERSION); -project_build_tag!(BUILD_TAG); - -use clap::{Parser, ValueEnum}; - #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; -#[derive(Clone, Debug, ValueEnum)] -enum AuthBackendType { - #[value(name("cplane-v1"), alias("control-plane"))] - ControlPlaneV1, - - #[value(name("link"), alias("control-redirect"))] - ConsoleRedirect, - - #[cfg(feature = "testing")] - Postgres, -} - -/// Neon proxy/router -#[derive(Parser)] -#[command(version = GIT_VERSION, about)] -struct ProxyCliArgs { - /// Name of the region this proxy is deployed in - #[clap(long, default_value_t = String::new())] - region: String, - /// listen for incoming client connections on ip:port - #[clap(short, long, default_value = "127.0.0.1:4432")] - proxy: String, - #[clap(value_enum, long, default_value_t = AuthBackendType::ConsoleRedirect)] - auth_backend: AuthBackendType, - /// listen for management callback connection on ip:port - #[clap(short, long, default_value = "127.0.0.1:7000")] - mgmt: String, - /// listen for incoming http connections 
(metrics, etc) on ip:port - #[clap(long, default_value = "127.0.0.1:7001")] - http: String, - /// listen for incoming wss connections on ip:port - #[clap(long)] - wss: Option, - /// redirect unauthenticated users to the given uri in case of console redirect auth - #[clap(short, long, default_value = "http://localhost:3000/psql_session/")] - uri: String, - /// cloud API endpoint for authenticating users - #[clap( - short, - long, - default_value = "http://localhost:3000/authenticate_proxy_request/" - )] - auth_endpoint: String, - /// JWT used to connect to control plane. - #[clap( - long, - value_name = "JWT", - default_value = "", - env = "NEON_PROXY_TO_CONTROLPLANE_TOKEN" - )] - control_plane_token: Arc, - /// if this is not local proxy, this toggles whether we accept jwt or passwords for http - #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] - is_auth_broker: bool, - /// path to TLS key for client postgres connections - /// - /// tls-key and tls-cert are for backwards compatibility, we can put all certs in one dir - #[clap(short = 'k', long, alias = "ssl-key")] - tls_key: Option, - /// path to TLS cert for client postgres connections - /// - /// tls-key and tls-cert are for backwards compatibility, we can put all certs in one dir - #[clap(short = 'c', long, alias = "ssl-cert")] - tls_cert: Option, - /// Allow writing TLS session keys to the given file pointed to by the environment variable `SSLKEYLOGFILE`. - #[clap(long, alias = "allow-ssl-keylogfile")] - allow_tls_keylogfile: bool, - /// path to directory with TLS certificates for client postgres connections - #[clap(long)] - certs_dir: Option, - /// timeout for the TLS handshake - #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] - handshake_timeout: tokio::time::Duration, - /// http endpoint to receive periodic metric updates - #[clap(long)] - metric_collection_endpoint: Option, - /// how often metrics should be sent to a collection endpoint - #[clap(long)] - metric_collection_interval: Option, - /// cache for `wake_compute` api method (use `size=0` to disable) - #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] - wake_compute_cache: String, - /// lock for `wake_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable). - #[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK)] - wake_compute_lock: String, - /// lock for `connect_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable). - #[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK)] - connect_compute_lock: String, - #[clap(flatten)] - sql_over_http: SqlOverHttpArgs, - /// timeout for scram authentication protocol - #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] - scram_protocol_timeout: tokio::time::Duration, - /// size of the threadpool for password hashing - #[clap(long, default_value_t = 4)] - scram_thread_pool_size: u8, - /// Endpoint rate limiter max number of requests per second. - /// - /// Provided in the form `@`. - /// Can be given multiple times for different bucket sizes. - #[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)] - endpoint_rps_limit: Vec, - /// Wake compute rate limiter max number of requests per second. 
- #[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)] - wake_compute_limit: Vec, - /// Whether the auth rate limiter actually takes effect (for testing) - #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] - auth_rate_limit_enabled: bool, - /// Authentication rate limiter max number of hashes per second. - #[clap(long, default_values_t = RateBucketInfo::DEFAULT_AUTH_SET)] - auth_rate_limit: Vec, - /// The IP subnet to use when considering whether two IP addresses are considered the same. - #[clap(long, default_value_t = 64)] - auth_rate_limit_ip_subnet: u8, - /// Redis rate limiter max number of requests per second. - #[clap(long, default_values_t = RateBucketInfo::DEFAULT_REDIS_SET)] - redis_rps_limit: Vec, - /// Cancellation channel size (max queue size for redis kv client) - #[clap(long, default_value = "1024")] - cancellation_ch_size: usize, - /// cache for `allowed_ips` (use `size=0` to disable) - #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] - allowed_ips_cache: String, - /// cache for `role_secret` (use `size=0` to disable) - #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] - role_secret_cache: String, - /// redis url for notifications (if empty, redis_host:port will be used for both notifications and streaming connections) - #[clap(long)] - redis_notifications: Option, - /// what from the available authentications type to use for the regional redis we have. Supported are "irsa" and "plain". - #[clap(long, default_value = "irsa")] - redis_auth_type: String, - /// redis host for streaming connections (might be different from the notifications host) - #[clap(long)] - redis_host: Option, - /// redis port for streaming connections (might be different from the notifications host) - #[clap(long)] - redis_port: Option, - /// redis cluster name, used in aws elasticache - #[clap(long)] - redis_cluster_name: Option, - /// redis user_id, used in aws elasticache - #[clap(long)] - redis_user_id: Option, - /// aws region to retrieve credentials - #[clap(long, default_value_t = String::new())] - aws_region: String, - /// cache for `project_info` (use `size=0` to disable) - #[clap(long, default_value = config::ProjectInfoCacheOptions::CACHE_DEFAULT_OPTIONS)] - project_info_cache: String, - /// cache for all valid endpoints - #[clap(long, default_value = config::EndpointCacheConfig::CACHE_DEFAULT_OPTIONS)] - endpoint_cache_config: String, - #[clap(flatten)] - parquet_upload: ParquetUploadArgs, - - /// interval for backup metric collection - #[clap(long, default_value = "10m", value_parser = humantime::parse_duration)] - metric_backup_collection_interval: std::time::Duration, - /// remote storage configuration for backup metric collection - /// Encoded as toml (same format as pageservers), eg - /// `{bucket_name='the-bucket',bucket_region='us-east-1',prefix_in_bucket='proxy',endpoint='http://minio:9000'}` - #[clap(long, value_parser = remote_storage_from_toml)] - metric_backup_collection_remote_storage: Option, - /// chunk size for backup metric collection - /// Size of each event is no more than 400 bytes, so 2**22 is about 200MB before the compression. 
- #[clap(long, default_value = "4194304")] - metric_backup_collection_chunk_size: usize, - /// Whether to retry the connection to the compute node - #[clap(long, default_value = config::RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES)] - connect_to_compute_retry: String, - /// Whether to retry the wake_compute request - #[clap(long, default_value = config::RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)] - wake_compute_retry: String, - - /// Configure if this is a private access proxy for the POC: In that case the proxy will ignore the IP allowlist - #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] - is_private_access_proxy: bool, - - /// Configure whether all incoming requests have a Proxy Protocol V2 packet. - // TODO(conradludgate): switch default to rejected or required once we've updated all deployments - #[clap(value_enum, long, default_value_t = ProxyProtocolV2::Supported)] - proxy_protocol_v2: ProxyProtocolV2, - - /// Time the proxy waits for the webauth session to be confirmed by the control plane. - // TODO: rename to `console_redirect_confirmation_timeout`. - #[clap(long, default_value = "2m", value_parser = humantime::parse_duration)] - webauth_confirmation_timeout: std::time::Duration, -} - -#[derive(clap::Args, Clone, Copy, Debug)] -struct SqlOverHttpArgs { - /// timeout for http connection requests - #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] - sql_over_http_timeout: tokio::time::Duration, - - /// Whether the SQL over http pool is opt-in - #[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] - sql_over_http_pool_opt_in: bool, - - /// How many connections to pool for each endpoint. Excess connections are discarded - #[clap(long, default_value_t = 20)] - sql_over_http_pool_max_conns_per_endpoint: usize, - - /// How many connections to pool for each endpoint. Excess connections are discarded - #[clap(long, default_value_t = 20000)] - sql_over_http_pool_max_total_conns: usize, - - /// How long pooled connections should remain idle for before closing - #[clap(long, default_value = "5m", value_parser = humantime::parse_duration)] - sql_over_http_idle_timeout: tokio::time::Duration, - - /// Duration each shard will wait on average before a GC sweep. - /// A longer time will causes sweeps to take longer but will interfere less frequently. - #[clap(long, default_value = "10m", value_parser = humantime::parse_duration)] - sql_over_http_pool_gc_epoch: tokio::time::Duration, - - /// How many shards should the global pool have. Must be a power of two. 
- /// More shards will introduce less contention for pool operations, but can - /// increase memory used by the pool - #[clap(long, default_value_t = 128)] - sql_over_http_pool_shards: usize, - - #[clap(long, default_value_t = 10000)] - sql_over_http_client_conn_threshold: u64, - - #[clap(long, default_value_t = 64)] - sql_over_http_cancel_set_shards: usize, - - #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB - sql_over_http_max_request_size_bytes: usize, - - #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB - sql_over_http_max_response_size_bytes: usize, -} - #[tokio::main] async fn main() -> anyhow::Result<()> { - let _logging_guard = proxy::logging::init().await?; - let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook(); - let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); - - // TODO: refactor these to use labels - info!("Version: {GIT_VERSION}"); - info!("Build_tag: {BUILD_TAG}"); - let neon_metrics = ::metrics::NeonMetrics::new(::metrics::BuildInfo { - revision: GIT_VERSION, - build_tag: BUILD_TAG, - }); - - let jemalloc = match proxy::jemalloc::MetricRecorder::new() { - Ok(t) => Some(t), - Err(e) => { - tracing::error!(error = ?e, "could not start jemalloc metrics loop"); - None - } - }; - - let args = ProxyCliArgs::parse(); - let config = build_config(&args)?; - let auth_backend = build_auth_backend(&args)?; - - match auth_backend { - Either::Left(auth_backend) => info!("Authentication backend: {auth_backend}"), - Either::Right(auth_backend) => info!("Authentication backend: {auth_backend:?}"), - }; - info!("Using region: {}", args.aws_region); - - // TODO: untangle the config args - let regional_redis_client = match (args.redis_auth_type.as_str(), &args.redis_notifications) { - ("plain", redis_url) => match redis_url { - None => { - bail!("plain auth requires redis_notifications to be set"); - } - Some(url) => Some( - ConnectionWithCredentialsProvider::new_with_static_credentials(url.to_string()), - ), - }, - ("irsa", _) => match (&args.redis_host, args.redis_port) { - (Some(host), Some(port)) => Some( - ConnectionWithCredentialsProvider::new_with_credentials_provider( - host.to_string(), - port, - elasticache::CredentialsProvider::new( - args.aws_region, - args.redis_cluster_name, - args.redis_user_id, - ) - .await, - ), - ), - (None, None) => { - warn!("irsa auth requires redis-host and redis-port to be set, continuing without regional_redis_client"); - None - } - _ => { - bail!("redis-host and redis-port must be specified together"); - } - }, - _ => { - bail!("unknown auth type given"); - } - }; - - let redis_notifications_client = if let Some(url) = args.redis_notifications { - Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url.to_string())) - } else { - regional_redis_client.clone() - }; - - // Check that we can bind to address before further initialization - let http_address: SocketAddr = args.http.parse()?; - info!("Starting http on {http_address}"); - let http_listener = TcpListener::bind(http_address).await?.into_std()?; - - let mgmt_address: SocketAddr = args.mgmt.parse()?; - info!("Starting mgmt on {mgmt_address}"); - let mgmt_listener = TcpListener::bind(mgmt_address).await?; - - let proxy_listener = if !args.is_auth_broker { - let proxy_address: SocketAddr = args.proxy.parse()?; - info!("Starting proxy on {proxy_address}"); - - Some(TcpListener::bind(proxy_address).await?) - } else { - None - }; - - // TODO: rename the argument to something like serverless. 
- // It now covers more than just websockets, it also covers SQL over HTTP. - let serverless_listener = if let Some(serverless_address) = args.wss { - let serverless_address: SocketAddr = serverless_address.parse()?; - info!("Starting wss on {serverless_address}"); - Some(TcpListener::bind(serverless_address).await?) - } else if args.is_auth_broker { - bail!("wss arg must be present for auth-broker") - } else { - None - }; - - let cancellation_token = CancellationToken::new(); - - let redis_rps_limit = Vec::leak(args.redis_rps_limit.clone()); - RateBucketInfo::validate(redis_rps_limit)?; - - let redis_kv_client = regional_redis_client - .as_ref() - .map(|redis_publisher| RedisKVClient::new(redis_publisher.clone(), redis_rps_limit)); - - // channel size should be higher than redis client limit to avoid blocking - let cancel_ch_size = args.cancellation_ch_size; - let (tx_cancel, rx_cancel) = tokio::sync::mpsc::channel(cancel_ch_size); - let cancellation_handler = Arc::new(CancellationHandler::new( - &config.connect_to_compute, - Some(tx_cancel), - )); - - // bit of a hack - find the min rps and max rps supported and turn it into - // leaky bucket config instead - let max = args - .endpoint_rps_limit - .iter() - .map(|x| x.rps()) - .max_by(f64::total_cmp) - .unwrap_or(EndpointRateLimiter::DEFAULT.max); - let rps = args - .endpoint_rps_limit - .iter() - .map(|x| x.rps()) - .min_by(f64::total_cmp) - .unwrap_or(EndpointRateLimiter::DEFAULT.rps); - let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards( - LeakyBucketConfig { rps, max }, - 64, - )); - - // client facing tasks. these will exit on error or on cancellation - // cancellation returns Ok(()) - let mut client_tasks = JoinSet::new(); - match auth_backend { - Either::Left(auth_backend) => { - if let Some(proxy_listener) = proxy_listener { - client_tasks.spawn(proxy::proxy::task_main( - config, - auth_backend, - proxy_listener, - cancellation_token.clone(), - cancellation_handler.clone(), - endpoint_rate_limiter.clone(), - )); - } - - if let Some(serverless_listener) = serverless_listener { - client_tasks.spawn(serverless::task_main( - config, - auth_backend, - serverless_listener, - cancellation_token.clone(), - cancellation_handler.clone(), - endpoint_rate_limiter.clone(), - )); - } - } - Either::Right(auth_backend) => { - if let Some(proxy_listener) = proxy_listener { - client_tasks.spawn(proxy::console_redirect_proxy::task_main( - config, - auth_backend, - proxy_listener, - cancellation_token.clone(), - cancellation_handler.clone(), - )); - } - } - } - - client_tasks.spawn(proxy::context::parquet::worker( - cancellation_token.clone(), - args.parquet_upload, - )); - - // maintenance tasks. these never return unless there's an error - let mut maintenance_tasks = JoinSet::new(); - maintenance_tasks.spawn(proxy::signals::handle(cancellation_token.clone(), || {})); - maintenance_tasks.spawn(http::health_server::task_main( - http_listener, - AppMetrics { - jemalloc, - neon_metrics, - proxy: proxy::metrics::Metrics::get(), - }, - )); - maintenance_tasks.spawn(control_plane::mgmt::task_main(mgmt_listener)); - - if let Some(metrics_config) = &config.metric_collection { - // TODO: Add gc regardles of the metric collection being enabled. 
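The "bit of a hack" noted above collapses the configured per-endpoint rate buckets into a single leaky-bucket config by taking the smallest rps as the sustained rate and the largest as the burst capacity. The same derivation in isolation, with plain `f64` rates standing in for `RateBucketInfo` and caller-supplied fallbacks instead of `EndpointRateLimiter::DEFAULT`:

// Derive (rps, max) leaky-bucket parameters from a list of configured rates.
// e.g. leaky_bucket_params(&[100.0, 300.0, 50.0], 600.0, 1500.0) == (50.0, 300.0)
fn leaky_bucket_params(rates: &[f64], fallback_rps: f64, fallback_max: f64) -> (f64, f64) {
    let rps = rates.iter().copied().min_by(f64::total_cmp).unwrap_or(fallback_rps);
    let max = rates.iter().copied().max_by(f64::total_cmp).unwrap_or(fallback_max);
    (rps, max)
}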
- maintenance_tasks.spawn(usage_metrics::task_main(metrics_config)); - } - - if let Either::Left(auth::Backend::ControlPlane(api, _)) = &auth_backend { - if let proxy::control_plane::client::ControlPlaneClient::ProxyV1(api) = &**api { - match (redis_notifications_client, regional_redis_client.clone()) { - (None, None) => {} - (client1, client2) => { - let cache = api.caches.project_info.clone(); - if let Some(client) = client1 { - maintenance_tasks.spawn(notifications::task_main( - client, - cache.clone(), - args.region.clone(), - )); - } - if let Some(client) = client2 { - maintenance_tasks.spawn(notifications::task_main( - client, - cache.clone(), - args.region.clone(), - )); - } - maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); - } - } - - if let Some(mut redis_kv_client) = redis_kv_client { - maintenance_tasks.spawn(async move { - redis_kv_client.try_connect().await?; - handle_cancel_messages(&mut redis_kv_client, rx_cancel).await - }); - } - - if let Some(regional_redis_client) = regional_redis_client { - let cache = api.caches.endpoints_cache.clone(); - let con = regional_redis_client; - let span = tracing::info_span!("endpoints_cache"); - maintenance_tasks.spawn( - async move { cache.do_read(con, cancellation_token.clone()).await } - .instrument(span), - ); - } - } - } - - let maintenance = loop { - // get one complete task - match futures::future::select( - pin!(maintenance_tasks.join_next()), - pin!(client_tasks.join_next()), - ) - .await - { - // exit immediately on maintenance task completion - Either::Left((Some(res), _)) => break proxy::error::flatten_err(res)?, - // exit with error immediately if all maintenance tasks have ceased (should be caught by branch above) - Either::Left((None, _)) => bail!("no maintenance tasks running. invalid state"), - // exit immediately on client task error - Either::Right((Some(res), _)) => proxy::error::flatten_err(res)?, - // exit if all our client tasks have shutdown gracefully - Either::Right((None, _)) => return Ok(()), - } - }; - - // maintenance tasks return Infallible success values, this is an impossible value - // so this match statically ensures that there are no possibilities for that value - match maintenance {} -} - -/// ProxyConfig is created at proxy startup, and lives forever. 
-fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { - let thread_pool = ThreadPool::new(args.scram_thread_pool_size); - Metrics::install(thread_pool.metrics.clone()); - - let tls_config = match (&args.tls_key, &args.tls_cert) { - (Some(key_path), Some(cert_path)) => Some(config::configure_tls( - key_path, - cert_path, - args.certs_dir.as_ref(), - args.allow_tls_keylogfile, - )?), - (None, None) => None, - _ => bail!("either both or neither tls-key and tls-cert must be specified"), - }; - - let backup_metric_collection_config = config::MetricBackupCollectionConfig { - interval: args.metric_backup_collection_interval, - remote_storage_config: args.metric_backup_collection_remote_storage.clone(), - chunk_size: args.metric_backup_collection_chunk_size, - }; - - let metric_collection = match ( - &args.metric_collection_endpoint, - &args.metric_collection_interval, - ) { - (Some(endpoint), Some(interval)) => Some(config::MetricCollectionConfig { - endpoint: endpoint.parse()?, - interval: humantime::parse_duration(interval)?, - backup_metric_collection_config, - }), - (None, None) => None, - _ => bail!( - "either both or neither metric-collection-endpoint \ - and metric-collection-interval must be specified" - ), - }; - - let config::ConcurrencyLockOptions { - shards, - limiter, - epoch, - timeout, - } = args.connect_compute_lock.parse()?; - info!( - ?limiter, - shards, - ?epoch, - "Using NodeLocks (connect_compute)" - ); - let connect_compute_locks = control_plane::locks::ApiLocks::new( - "connect_compute_lock", - limiter, - shards, - timeout, - epoch, - &Metrics::get().proxy.connect_compute_lock, - )?; - - let http_config = HttpConfig { - accept_websockets: !args.is_auth_broker, - pool_options: GlobalConnPoolOptions { - max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_conns_per_endpoint, - gc_epoch: args.sql_over_http.sql_over_http_pool_gc_epoch, - pool_shards: args.sql_over_http.sql_over_http_pool_shards, - idle_timeout: args.sql_over_http.sql_over_http_idle_timeout, - opt_in: args.sql_over_http.sql_over_http_pool_opt_in, - max_total_conns: args.sql_over_http.sql_over_http_pool_max_total_conns, - }, - cancel_set: CancelSet::new(args.sql_over_http.sql_over_http_cancel_set_shards), - client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold, - max_request_size_bytes: args.sql_over_http.sql_over_http_max_request_size_bytes, - max_response_size_bytes: args.sql_over_http.sql_over_http_max_response_size_bytes, - }; - let authentication_config = AuthenticationConfig { - jwks_cache: JwkCache::default(), - thread_pool, - scram_protocol_timeout: args.scram_protocol_timeout, - rate_limiter_enabled: args.auth_rate_limit_enabled, - rate_limiter: AuthRateLimiter::new(args.auth_rate_limit.clone()), - rate_limit_ip_subnet: args.auth_rate_limit_ip_subnet, - ip_allowlist_check_enabled: !args.is_private_access_proxy, - is_vpc_acccess_proxy: args.is_private_access_proxy, - is_auth_broker: args.is_auth_broker, - accept_jwts: args.is_auth_broker, - console_redirect_confirmation_timeout: args.webauth_confirmation_timeout, - }; - - let compute_config = ComputeConfig { - retry: config::RetryConfig::parse(&args.connect_to_compute_retry)?, - tls: Arc::new(compute_client_config_with_root_certs()?), - timeout: Duration::from_secs(2), - }; - - let config = ProxyConfig { - tls_config, - metric_collection, - http_config, - authentication_config, - proxy_protocol_v2: args.proxy_protocol_v2, - handshake_timeout: args.handshake_timeout, - region: 
args.region.clone(), - wake_compute_retry_config: config::RetryConfig::parse(&args.wake_compute_retry)?, - connect_compute_locks, - connect_to_compute: compute_config, - }; - - let config = Box::leak(Box::new(config)); - - tokio::spawn(config.connect_compute_locks.garbage_collect_worker()); - - Ok(config) -} - -/// auth::Backend is created at proxy startup, and lives forever. -fn build_auth_backend( - args: &ProxyCliArgs, -) -> anyhow::Result, &'static ConsoleRedirectBackend>> { - match &args.auth_backend { - AuthBackendType::ControlPlaneV1 => { - let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; - let project_info_cache_config: ProjectInfoCacheOptions = - args.project_info_cache.parse()?; - let endpoint_cache_config: config::EndpointCacheConfig = - args.endpoint_cache_config.parse()?; - - info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}"); - info!( - "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}" - ); - info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}"); - let caches = Box::leak(Box::new(control_plane::caches::ApiCaches::new( - wake_compute_cache_config, - project_info_cache_config, - endpoint_cache_config, - ))); - - let config::ConcurrencyLockOptions { - shards, - limiter, - epoch, - timeout, - } = args.wake_compute_lock.parse()?; - info!(?limiter, shards, ?epoch, "Using NodeLocks (wake_compute)"); - let locks = Box::leak(Box::new(control_plane::locks::ApiLocks::new( - "wake_compute_lock", - limiter, - shards, - timeout, - epoch, - &Metrics::get().wake_compute_lock, - )?)); - tokio::spawn(locks.garbage_collect_worker()); - - let url: proxy::url::ApiUrl = args.auth_endpoint.parse()?; - - let endpoint = http::Endpoint::new(url, http::new_client()); - - let mut wake_compute_rps_limit = args.wake_compute_limit.clone(); - RateBucketInfo::validate(&mut wake_compute_rps_limit)?; - let wake_compute_endpoint_rate_limiter = - Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit)); - - let api = control_plane::client::cplane_proxy_v1::NeonControlPlaneClient::new( - endpoint, - args.control_plane_token.clone(), - caches, - locks, - wake_compute_endpoint_rate_limiter, - ); - - let api = control_plane::client::ControlPlaneClient::ProxyV1(api); - let auth_backend = auth::Backend::ControlPlane(MaybeOwned::Owned(api), ()); - let config = Box::leak(Box::new(auth_backend)); - - Ok(Either::Left(config)) - } - - #[cfg(feature = "testing")] - AuthBackendType::Postgres => { - let url = args.auth_endpoint.parse()?; - let api = control_plane::client::mock::MockControlPlane::new( - url, - !args.is_private_access_proxy, - ); - let api = control_plane::client::ControlPlaneClient::PostgresMock(api); - - let auth_backend = auth::Backend::ControlPlane(MaybeOwned::Owned(api), ()); - - let config = Box::leak(Box::new(auth_backend)); - - Ok(Either::Left(config)) - } - - AuthBackendType::ConsoleRedirect => { - let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; - let project_info_cache_config: ProjectInfoCacheOptions = - args.project_info_cache.parse()?; - let endpoint_cache_config: config::EndpointCacheConfig = - args.endpoint_cache_config.parse()?; - - info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}"); - info!( - "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}" - ); - info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}"); - let caches = 
Box::leak(Box::new(control_plane::caches::ApiCaches::new( - wake_compute_cache_config, - project_info_cache_config, - endpoint_cache_config, - ))); - - let config::ConcurrencyLockOptions { - shards, - limiter, - epoch, - timeout, - } = args.wake_compute_lock.parse()?; - info!(?limiter, shards, ?epoch, "Using NodeLocks (wake_compute)"); - let locks = Box::leak(Box::new(control_plane::locks::ApiLocks::new( - "wake_compute_lock", - limiter, - shards, - timeout, - epoch, - &Metrics::get().wake_compute_lock, - )?)); - - let url = args.uri.clone().parse()?; - let ep_url: proxy::url::ApiUrl = args.auth_endpoint.parse()?; - let endpoint = http::Endpoint::new(ep_url, http::new_client()); - let mut wake_compute_rps_limit = args.wake_compute_limit.clone(); - RateBucketInfo::validate(&mut wake_compute_rps_limit)?; - let wake_compute_endpoint_rate_limiter = - Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit)); - - // Since we use only get_allowed_ips_and_secret() wake_compute_endpoint_rate_limiter - // and locks are not used in ConsoleRedirectBackend, - // but they are required by the NeonControlPlaneClient - let api = control_plane::client::cplane_proxy_v1::NeonControlPlaneClient::new( - endpoint, - args.control_plane_token.clone(), - caches, - locks, - wake_compute_endpoint_rate_limiter, - ); - - let backend = ConsoleRedirectBackend::new(url, api); - let config = Box::leak(Box::new(backend)); - - Ok(Either::Right(config)) - } - } -} - -#[cfg(test)] -mod tests { - use std::time::Duration; - - use clap::Parser; - use proxy::rate_limiter::RateBucketInfo; - - #[test] - fn parse_endpoint_rps_limit() { - let config = super::ProxyCliArgs::parse_from([ - "proxy", - "--endpoint-rps-limit", - "100@1s", - "--endpoint-rps-limit", - "20@30s", - ]); - - assert_eq!( - config.endpoint_rps_limit, - vec![ - RateBucketInfo::new(100, Duration::from_secs(1)), - RateBucketInfo::new(20, Duration::from_secs(30)), - ] - ); - } + proxy::binary::proxy::run().await } diff --git a/proxy/src/binary/local_proxy.rs b/proxy/src/binary/local_proxy.rs new file mode 100644 index 0000000000..e0d8515375 --- /dev/null +++ b/proxy/src/binary/local_proxy.rs @@ -0,0 +1,410 @@ +use std::net::SocketAddr; +use std::pin::pin; +use std::str::FromStr; +use std::sync::Arc; +use std::time::Duration; + +use crate::auth::backend::jwt::JwkCache; +use crate::auth::backend::local::{LocalBackend, JWKS_ROLE_MAP}; +use crate::auth::{self}; +use crate::cancellation::CancellationHandler; +use crate::config::{ + self, AuthenticationConfig, ComputeConfig, HttpConfig, ProxyConfig, RetryConfig, +}; +use crate::control_plane::locks::ApiLocks; +use crate::control_plane::messages::{EndpointJwksResponse, JwksSettings}; +use crate::http::health_server::AppMetrics; +use crate::intern::RoleNameInt; +use crate::metrics::{Metrics, ThreadPoolMetrics}; +use crate::rate_limiter::{ + BucketRateLimiter, EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo, +}; +use crate::scram::threadpool::ThreadPool; +use crate::serverless::cancel_set::CancelSet; +use crate::serverless::{self, GlobalConnPoolOptions}; +use crate::tls::client_config::compute_client_config_with_root_certs; +use crate::types::RoleName; +use crate::url::ApiUrl; +use anyhow::{bail, ensure, Context}; +use camino::{Utf8Path, Utf8PathBuf}; +use compute_api::spec::LocalProxySpec; +use futures::future::Either; + +project_git_version!(GIT_VERSION); +project_build_tag!(BUILD_TAG); + +use clap::Parser; +use thiserror::Error; +use tokio::net::TcpListener; +use tokio::sync::Notify; +use tokio::task::JoinSet; 
+use tokio_util::sync::CancellationToken; +use tracing::{debug, error, info, warn}; +use utils::sentry_init::init_sentry; +use utils::{pid_file, project_build_tag, project_git_version}; + +/// Neon proxy/router +#[derive(Parser)] +#[command(version = GIT_VERSION, about)] +struct LocalProxyCliArgs { + /// listen for incoming metrics connections on ip:port + #[clap(long, default_value = "127.0.0.1:7001")] + metrics: String, + /// listen for incoming http connections on ip:port + #[clap(long)] + http: String, + /// timeout for the TLS handshake + #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] + handshake_timeout: tokio::time::Duration, + /// lock for `connect_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable). + #[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK)] + connect_compute_lock: String, + #[clap(flatten)] + sql_over_http: SqlOverHttpArgs, + /// User rate limiter max number of requests per second. + /// + /// Provided in the form `@`. + /// Can be given multiple times for different bucket sizes. + #[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)] + user_rps_limit: Vec, + /// Whether the auth rate limiter actually takes effect (for testing) + #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] + auth_rate_limit_enabled: bool, + /// Authentication rate limiter max number of hashes per second. + #[clap(long, default_values_t = RateBucketInfo::DEFAULT_AUTH_SET)] + auth_rate_limit: Vec, + /// The IP subnet to use when considering whether two IP addresses are considered the same. + #[clap(long, default_value_t = 64)] + auth_rate_limit_ip_subnet: u8, + /// Whether to retry the connection to the compute node + #[clap(long, default_value = config::RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES)] + connect_to_compute_retry: String, + /// Address of the postgres server + #[clap(long, default_value = "127.0.0.1:5432")] + postgres: SocketAddr, + /// Address of the internal compute-ctl api service + #[clap(long, default_value = "http://127.0.0.1:3081/")] + compute_ctl: ApiUrl, + /// Path of the local proxy config file + #[clap(long, default_value = "./local_proxy.json")] + config_path: Utf8PathBuf, + /// Path of the local proxy PID file + #[clap(long, default_value = "./local_proxy.pid")] + pid_path: Utf8PathBuf, +} + +#[derive(clap::Args, Clone, Copy, Debug)] +struct SqlOverHttpArgs { + /// How many connections to pool for each endpoint. 
Excess connections are discarded + #[clap(long, default_value_t = 200)] + sql_over_http_pool_max_total_conns: usize, + + /// How long pooled connections should remain idle for before closing + #[clap(long, default_value = "5m", value_parser = humantime::parse_duration)] + sql_over_http_idle_timeout: tokio::time::Duration, + + #[clap(long, default_value_t = 100)] + sql_over_http_client_conn_threshold: u64, + + #[clap(long, default_value_t = 16)] + sql_over_http_cancel_set_shards: usize, + + #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB + sql_over_http_max_request_size_bytes: usize, + + #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB + sql_over_http_max_response_size_bytes: usize, +} + +pub async fn run() -> anyhow::Result<()> { + let _logging_guard = crate::logging::init_local_proxy()?; + let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook(); + let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); + + Metrics::install(Arc::new(ThreadPoolMetrics::new(0))); + + // TODO: refactor these to use labels + debug!("Version: {GIT_VERSION}"); + debug!("Build_tag: {BUILD_TAG}"); + let neon_metrics = ::metrics::NeonMetrics::new(::metrics::BuildInfo { + revision: GIT_VERSION, + build_tag: BUILD_TAG, + }); + + let jemalloc = match crate::jemalloc::MetricRecorder::new() { + Ok(t) => Some(t), + Err(e) => { + tracing::error!(error = ?e, "could not start jemalloc metrics loop"); + None + } + }; + + let args = LocalProxyCliArgs::parse(); + let config = build_config(&args)?; + let auth_backend = build_auth_backend(&args); + + // before we bind to any ports, write the process ID to a file + // so that compute-ctl can find our process later + // in order to trigger the appropriate SIGHUP on config change. + // + // This also claims a "lock" that makes sure only one instance + // of local_proxy runs at a time. + let _process_guard = loop { + match pid_file::claim_for_current_process(&args.pid_path) { + Ok(guard) => break guard, + Err(e) => { + // compute-ctl might have tried to read the pid-file to let us + // know about some config change. We should try again. + error!(path=?args.pid_path, "could not claim PID file guard: {e:?}"); + tokio::time::sleep(Duration::from_secs(1)).await; + } + } + }; + + let metrics_listener = TcpListener::bind(args.metrics).await?.into_std()?; + let http_listener = TcpListener::bind(args.http).await?; + let shutdown = CancellationToken::new(); + + // todo: should scale with CU + let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards( + LeakyBucketConfig { + rps: 10.0, + max: 100.0, + }, + 16, + )); + + let mut maintenance_tasks = JoinSet::new(); + + let refresh_config_notify = Arc::new(Notify::new()); + maintenance_tasks.spawn(crate::signals::handle(shutdown.clone(), { + let refresh_config_notify = Arc::clone(&refresh_config_notify); + move || { + refresh_config_notify.notify_one(); + } + })); + + // trigger the first config load **after** setting up the signal hook + // to avoid the race condition where: + // 1. No config file registered when local_proxy starts up + // 2. The config file is written but the signal hook is not yet received + // 3. local_proxy completes startup but has no config loaded, despite there being a registerd config. 
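The three-step race described in the comment above is why the first config load is triggered only after the signal hook is installed (the `notify_one()` call that follows). Purely as an illustration of that pattern, and not the project's actual code, here is a minimal tokio sketch where a shared `Notify` stands in for the SIGHUP handler and the reload loop; because `Notify` keeps one pending permit, a trigger fired before the loop is waiting is not lost:

```rust
use std::sync::Arc;
use std::time::Duration;
use tokio::sync::Notify;

// Stand-in for refresh_config_loop: wait for a trigger, then re-read the config.
async fn refresh_loop(trigger: Arc<Notify>) {
    loop {
        trigger.notified().await;
        println!("reloading config"); // the real loop re-reads local_proxy.json here
    }
}

#[tokio::main]
async fn main() {
    let trigger = Arc::new(Notify::new());
    tokio::spawn(refresh_loop(Arc::clone(&trigger)));
    // Trigger the first load only after the "handler" exists; Notify stores one
    // pending permit, so this is not lost even if the loop is not yet waiting.
    trigger.notify_one();
    tokio::time::sleep(Duration::from_millis(50)).await;
}
```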
+ refresh_config_notify.notify_one(); + tokio::spawn(refresh_config_loop(args.config_path, refresh_config_notify)); + + maintenance_tasks.spawn(crate::http::health_server::task_main( + metrics_listener, + AppMetrics { + jemalloc, + neon_metrics, + proxy: crate::metrics::Metrics::get(), + }, + )); + + let task = serverless::task_main( + config, + auth_backend, + http_listener, + shutdown.clone(), + Arc::new(CancellationHandler::new(&config.connect_to_compute, None)), + endpoint_rate_limiter, + ); + + match futures::future::select(pin!(maintenance_tasks.join_next()), pin!(task)).await { + // exit immediately on maintenance task completion + Either::Left((Some(res), _)) => match crate::error::flatten_err(res)? {}, + // exit with error immediately if all maintenance tasks have ceased (should be caught by branch above) + Either::Left((None, _)) => bail!("no maintenance tasks running. invalid state"), + // exit immediately on client task error + Either::Right((res, _)) => res?, + } + + Ok(()) +} + +/// ProxyConfig is created at proxy startup, and lives forever. +fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { + let config::ConcurrencyLockOptions { + shards, + limiter, + epoch, + timeout, + } = args.connect_compute_lock.parse()?; + info!( + ?limiter, + shards, + ?epoch, + "Using NodeLocks (connect_compute)" + ); + let connect_compute_locks = ApiLocks::new( + "connect_compute_lock", + limiter, + shards, + timeout, + epoch, + &Metrics::get().proxy.connect_compute_lock, + ); + + let http_config = HttpConfig { + accept_websockets: false, + pool_options: GlobalConnPoolOptions { + gc_epoch: Duration::from_secs(60), + pool_shards: 2, + idle_timeout: args.sql_over_http.sql_over_http_idle_timeout, + opt_in: false, + + max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_total_conns, + max_total_conns: args.sql_over_http.sql_over_http_pool_max_total_conns, + }, + cancel_set: CancelSet::new(args.sql_over_http.sql_over_http_cancel_set_shards), + client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold, + max_request_size_bytes: args.sql_over_http.sql_over_http_max_request_size_bytes, + max_response_size_bytes: args.sql_over_http.sql_over_http_max_response_size_bytes, + }; + + let compute_config = ComputeConfig { + retry: RetryConfig::parse(RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES)?, + tls: Arc::new(compute_client_config_with_root_certs()?), + timeout: Duration::from_secs(2), + }; + + Ok(Box::leak(Box::new(ProxyConfig { + tls_config: None, + metric_collection: None, + http_config, + authentication_config: AuthenticationConfig { + jwks_cache: JwkCache::default(), + thread_pool: ThreadPool::new(0), + scram_protocol_timeout: Duration::from_secs(10), + rate_limiter_enabled: false, + rate_limiter: BucketRateLimiter::new(vec![]), + rate_limit_ip_subnet: 64, + ip_allowlist_check_enabled: true, + is_vpc_acccess_proxy: false, + is_auth_broker: false, + accept_jwts: true, + console_redirect_confirmation_timeout: Duration::ZERO, + }, + proxy_protocol_v2: config::ProxyProtocolV2::Rejected, + handshake_timeout: Duration::from_secs(10), + region: "local".into(), + wake_compute_retry_config: RetryConfig::parse(RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)?, + connect_compute_locks, + connect_to_compute: compute_config, + }))) +} + +/// auth::Backend is created at proxy startup, and lives forever. 
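For the JWKS handling in `refresh_config_inner` further below, the interesting part is how each configured URL is sanitised before use: credentials, port, query and fragment are stripped so only scheme, host and path remain, and plain HTTP is only tolerated in testing builds. The following standalone sketch uses the `url` crate directly to show the same sequence of calls; the helper name and example URL are invented for illustration:

```rust
use url::Url;

// Hypothetical helper mirroring the sanitisation steps below; not the proxy's API.
// Cargo.toml: url = "2"
fn sanitize_jwks_url(raw: &str) -> Result<Url, url::ParseError> {
    let mut u = Url::parse(raw)?;
    // Clear username, password and port.
    let _ = u.set_username("");
    let _ = u.set_password(None);
    let _ = u.set_port(None);
    // Clear fragment and query params.
    u.set_fragment(None);
    u.query_pairs_mut().clear().finish();
    Ok(u)
}

fn main() -> Result<(), url::ParseError> {
    let u = sanitize_jwks_url("https://user:pw@auth.example.com:8443/keys?tenant=1#frag")?;
    assert_eq!(u.scheme(), "https");
    assert_eq!(u.host_str(), Some("auth.example.com"));
    assert_eq!(u.port(), None);
    assert_eq!(u.username(), "");
    assert!(u.query().map_or(true, |q| q.is_empty()));
    Ok(())
}
```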
+fn build_auth_backend(args: &LocalProxyCliArgs) -> &'static auth::Backend<'static, ()> { + let auth_backend = crate::auth::Backend::Local(crate::auth::backend::MaybeOwned::Owned( + LocalBackend::new(args.postgres, args.compute_ctl.clone()), + )); + + Box::leak(Box::new(auth_backend)) +} + +#[derive(Error, Debug)] +enum RefreshConfigError { + #[error(transparent)] + Read(#[from] std::io::Error), + #[error(transparent)] + Parse(#[from] serde_json::Error), + #[error(transparent)] + Validate(anyhow::Error), +} + +async fn refresh_config_loop(path: Utf8PathBuf, rx: Arc) { + let mut init = true; + loop { + rx.notified().await; + + match refresh_config_inner(&path).await { + Ok(()) => {} + // don't log for file not found errors if this is the first time we are checking + // for computes that don't use local_proxy, this is not an error. + Err(RefreshConfigError::Read(e)) + if init && e.kind() == std::io::ErrorKind::NotFound => + { + debug!(error=?e, ?path, "could not read config file"); + } + Err(e) => { + error!(error=?e, ?path, "could not read config file"); + } + } + + init = false; + } +} + +async fn refresh_config_inner(path: &Utf8Path) -> Result<(), RefreshConfigError> { + let bytes = tokio::fs::read(&path).await?; + let data: LocalProxySpec = serde_json::from_slice(&bytes)?; + + let mut jwks_set = vec![]; + + fn parse_jwks_settings(jwks: compute_api::spec::JwksSettings) -> anyhow::Result { + let mut jwks_url = url::Url::from_str(&jwks.jwks_url).context("parsing JWKS url")?; + + ensure!( + jwks_url.has_authority() + && (jwks_url.scheme() == "http" || jwks_url.scheme() == "https"), + "Invalid JWKS url. Must be HTTP", + ); + + ensure!( + jwks_url.host().is_some_and(|h| h != url::Host::Domain("")), + "Invalid JWKS url. No domain listed", + ); + + // clear username, password and ports + jwks_url + .set_username("") + .expect("url can be a base and has a valid host and is not a file. should not error"); + jwks_url + .set_password(None) + .expect("url can be a base and has a valid host and is not a file. should not error"); + // local testing is hard if we need to have a specific restricted port + if cfg!(not(feature = "testing")) { + jwks_url.set_port(None).expect( + "url can be a base and has a valid host and is not a file. should not error", + ); + } + + // clear query params + jwks_url.set_fragment(None); + jwks_url.query_pairs_mut().clear().finish(); + + if jwks_url.scheme() != "https" { + // local testing is hard if we need to set up https support. + if cfg!(not(feature = "testing")) { + jwks_url + .set_scheme("https") + .expect("should not error to set the scheme to https if it was http"); + } else { + warn!(scheme = jwks_url.scheme(), "JWKS url is not HTTPS"); + } + } + + Ok(JwksSettings { + id: jwks.id, + jwks_url, + _provider_name: jwks.provider_name, + jwt_audience: jwks.jwt_audience, + role_names: jwks + .role_names + .into_iter() + .map(RoleName::from) + .map(|s| RoleNameInt::from(&s)) + .collect(), + }) + } + + for jwks in data.jwks.into_iter().flatten() { + jwks_set.push(parse_jwks_settings(jwks).map_err(RefreshConfigError::Validate)?); + } + + info!("successfully loaded new config"); + JWKS_ROLE_MAP.store(Some(Arc::new(EndpointJwksResponse { jwks: jwks_set }))); + + Ok(()) +} diff --git a/proxy/src/binary/mod.rs b/proxy/src/binary/mod.rs new file mode 100644 index 0000000000..dc07d3e675 --- /dev/null +++ b/proxy/src/binary/mod.rs @@ -0,0 +1,7 @@ +//! All binaries have the body of their main() defined here, so that the code +//! 
is also covered by code style configs in lib.rs and the unused-code check is +//! more effective when practically all modules are private to the lib. + +pub mod local_proxy; +pub mod pg_sni_router; +pub mod proxy; diff --git a/proxy/src/binary/pg_sni_router.rs b/proxy/src/binary/pg_sni_router.rs new file mode 100644 index 0000000000..235e9674c6 --- /dev/null +++ b/proxy/src/binary/pg_sni_router.rs @@ -0,0 +1,304 @@ +/// A stand-alone program that routes connections, e.g. from +/// `aaa--bbb--1234.external.domain` to `aaa.bbb.internal.domain:1234`. +/// +/// This allows connecting to pods/services running in the same Kubernetes cluster from +/// the outside. Similar to an ingress controller for HTTPS. +use std::{net::SocketAddr, sync::Arc}; + +use crate::context::RequestContext; +use crate::metrics::{Metrics, ThreadPoolMetrics}; +use crate::protocol2::ConnectionInfo; +use crate::proxy::{copy_bidirectional_client_compute, run_until_cancelled, ErrorSource}; +use crate::stream::{PqStream, Stream}; +use crate::tls::TlsServerEndPoint; +use anyhow::{anyhow, bail, ensure, Context}; +use clap::Arg; +use futures::future::Either; +use futures::TryFutureExt; +use itertools::Itertools; +use rustls::crypto::ring; +use rustls::pki_types::PrivateKeyDer; +use tokio::io::{AsyncRead, AsyncWrite}; +use tokio::net::TcpListener; +use tokio_util::sync::CancellationToken; +use tracing::{error, info, Instrument}; +use utils::project_git_version; +use utils::sentry_init::init_sentry; + +project_git_version!(GIT_VERSION); + +fn cli() -> clap::Command { + clap::Command::new("Neon proxy/router") + .version(GIT_VERSION) + .arg( + Arg::new("listen") + .short('l') + .long("listen") + .help("listen for incoming client connections on ip:port") + .default_value("127.0.0.1:4432"), + ) + .arg( + Arg::new("tls-key") + .short('k') + .long("tls-key") + .help("path to TLS key for client postgres connections") + .required(true), + ) + .arg( + Arg::new("tls-cert") + .short('c') + .long("tls-cert") + .help("path to TLS cert for client postgres connections") + .required(true), + ) + .arg( + Arg::new("dest") + .short('d') + .long("destination") + .help("append this domain zone to the SNI hostname to get the destination address") + .required(true), + ) +} + +pub async fn run() -> anyhow::Result<()> { + let _logging_guard = crate::logging::init().await?; + let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook(); + let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); + + Metrics::install(Arc::new(ThreadPoolMetrics::new(0))); + + let args = cli().get_matches(); + let destination: String = args + .get_one::("dest") + .expect("string argument defined") + .parse()?; + + // Configure TLS + let (tls_config, tls_server_end_point): (Arc, TlsServerEndPoint) = match ( + args.get_one::("tls-key"), + args.get_one::("tls-cert"), + ) { + (Some(key_path), Some(cert_path)) => { + let key = { + let key_bytes = std::fs::read(key_path).context("TLS key file")?; + + let mut keys = + rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]).collect_vec(); + + ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len()); + PrivateKeyDer::Pkcs8( + keys.pop() + .expect("keys should not be empty") + .context(format!("Failed to read TLS keys at '{key_path}'"))?, + ) + }; + + let cert_chain_bytes = std::fs::read(cert_path) + .context(format!("Failed to read TLS cert file at '{cert_path}.'"))?; + + let cert_chain: Vec<_> = { + rustls_pemfile::certs(&mut &cert_chain_bytes[..]) + .try_collect() + .with_context(|| { + 
format!("Failed to read TLS certificate chain from bytes from file at '{cert_path}'.") + })? + }; + + // needed for channel bindings + let first_cert = cert_chain.first().context("missing certificate")?; + let tls_server_end_point = TlsServerEndPoint::new(first_cert)?; + + let tls_config = + rustls::ServerConfig::builder_with_provider(Arc::new(ring::default_provider())) + .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12]) + .context("ring should support TLS1.2 and TLS1.3")? + .with_no_client_auth() + .with_single_cert(cert_chain, key)? + .into(); + + (tls_config, tls_server_end_point) + } + _ => bail!("tls-key and tls-cert must be specified"), + }; + + // Start listening for incoming client connections + let proxy_address: SocketAddr = args + .get_one::("listen") + .expect("string argument defined") + .parse()?; + info!("Starting sni router on {proxy_address}"); + let proxy_listener = TcpListener::bind(proxy_address).await?; + + let cancellation_token = CancellationToken::new(); + + let main = tokio::spawn(task_main( + Arc::new(destination), + tls_config, + tls_server_end_point, + proxy_listener, + cancellation_token.clone(), + )); + let signals_task = tokio::spawn(crate::signals::handle(cancellation_token, || {})); + + // the signal task cant ever succeed. + // the main task can error, or can succeed on cancellation. + // we want to immediately exit on either of these cases + let signal = match futures::future::select(signals_task, main).await { + Either::Left((res, _)) => crate::error::flatten_err(res)?, + Either::Right((res, _)) => return crate::error::flatten_err(res), + }; + + // maintenance tasks return `Infallible` success values, this is an impossible value + // so this match statically ensures that there are no possibilities for that value + match signal {} +} + +async fn task_main( + dest_suffix: Arc, + tls_config: Arc, + tls_server_end_point: TlsServerEndPoint, + listener: tokio::net::TcpListener, + cancellation_token: CancellationToken, +) -> anyhow::Result<()> { + // When set for the server socket, the keepalive setting + // will be inherited by all accepted client sockets. + socket2::SockRef::from(&listener).set_keepalive(true)?; + + let connections = tokio_util::task::task_tracker::TaskTracker::new(); + + while let Some(accept_result) = + run_until_cancelled(listener.accept(), &cancellation_token).await + { + let (socket, peer_addr) = accept_result?; + + let session_id = uuid::Uuid::new_v4(); + let tls_config = Arc::clone(&tls_config); + let dest_suffix = Arc::clone(&dest_suffix); + + connections.spawn( + async move { + socket + .set_nodelay(true) + .context("failed to set socket option")?; + + info!(%peer_addr, "serving"); + let ctx = RequestContext::new( + session_id, + ConnectionInfo { + addr: peer_addr, + extra: None, + }, + crate::metrics::Protocol::SniRouter, + "sni", + ); + handle_client(ctx, dest_suffix, tls_config, tls_server_end_point, socket).await + } + .unwrap_or_else(|e| { + // Acknowledge that the task has finished with an error. 
+ error!("per-client task finished with an error: {e:#}"); + }) + .instrument(tracing::info_span!("handle_client", ?session_id)), + ); + } + + connections.close(); + drop(listener); + + connections.wait().await; + + info!("all client connections have finished"); + Ok(()) +} + +const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; + +async fn ssl_handshake( + ctx: &RequestContext, + raw_stream: S, + tls_config: Arc, + tls_server_end_point: TlsServerEndPoint, +) -> anyhow::Result> { + let mut stream = PqStream::new(Stream::from_raw(raw_stream)); + + let msg = stream.read_startup_packet().await?; + use pq_proto::FeStartupPacket::SslRequest; + + match msg { + SslRequest { direct: false } => { + stream + .write_message(&pq_proto::BeMessage::EncryptionResponse(true)) + .await?; + + // Upgrade raw stream into a secure TLS-backed stream. + // NOTE: We've consumed `tls`; this fact will be used later. + + let (raw, read_buf) = stream.into_inner(); + // TODO: Normally, client doesn't send any data before + // server says TLS handshake is ok and read_buf is empty. + // However, you could imagine pipelining of postgres + // SSLRequest + TLS ClientHello in one hunk similar to + // pipelining in our node js driver. We should probably + // support that by chaining read_buf with the stream. + if !read_buf.is_empty() { + bail!("data is sent before server replied with EncryptionResponse"); + } + + Ok(Stream::Tls { + tls: Box::new( + raw.upgrade(tls_config, !ctx.has_private_peer_addr()) + .await?, + ), + tls_server_end_point, + }) + } + unexpected => { + info!( + ?unexpected, + "unexpected startup packet, rejecting connection" + ); + stream + .throw_error_str(ERR_INSECURE_CONNECTION, crate::error::ErrorKind::User) + .await? + } + } +} + +async fn handle_client( + ctx: RequestContext, + dest_suffix: Arc, + tls_config: Arc, + tls_server_end_point: TlsServerEndPoint, + stream: impl AsyncRead + AsyncWrite + Unpin, +) -> anyhow::Result<()> { + let mut tls_stream = ssl_handshake(&ctx, stream, tls_config, tls_server_end_point).await?; + + // Cut off first part of the SNI domain + // We receive required destination details in the format of + // `{k8s_service_name}--{k8s_namespace}--{port}.non-sni-domain` + let sni = tls_stream.sni_hostname().ok_or(anyhow!("SNI missing"))?; + let dest: Vec<&str> = sni + .split_once('.') + .context("invalid SNI")? + .0 + .splitn(3, "--") + .collect(); + let port = dest[2].parse::().context("invalid port")?; + let destination = format!("{}.{}.{}:{}", dest[0], dest[1], dest_suffix, port); + + info!("destination: {}", destination); + + let mut client = tokio::net::TcpStream::connect(destination).await?; + + // doesn't yet matter as pg-sni-router doesn't report analytics logs + ctx.set_success(); + ctx.log_connect(); + + // Starting from here we only proxy the client's traffic. 
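The SNI-to-destination rewrite performed just above is the core of the router: the first DNS label carries `{service}--{namespace}--{port}`, and everything after the first dot is replaced by the configured destination suffix. A small, std-only worked example of that mapping (the function name is invented for illustration):

```rust
// Maps "{service}--{namespace}--{port}.<sni-domain>" to "{service}.{namespace}.{suffix}:{port}".
fn destination_from_sni(sni: &str, dest_suffix: &str) -> Option<String> {
    // Keep only the first DNS label; the rest of the SNI domain is ignored.
    let (label, _rest) = sni.split_once('.')?;
    // The label encodes service, namespace and port, separated by "--".
    let mut parts = label.splitn(3, "--");
    let service = parts.next()?;
    let namespace = parts.next()?;
    let port: u16 = parts.next()?.parse().ok()?;
    Some(format!("{service}.{namespace}.{dest_suffix}:{port}"))
}

fn main() {
    // The example from the module comment: aaa--bbb--1234.external.domain
    // is routed to aaa.bbb.internal.domain:1234.
    assert_eq!(
        destination_from_sni("aaa--bbb--1234.external.domain", "internal.domain"),
        Some("aaa.bbb.internal.domain:1234".to_string())
    );
}
```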
+ info!("performing the proxy pass..."); + + match copy_bidirectional_client_compute(&mut tls_stream, &mut client).await { + Ok(_) => Ok(()), + Err(ErrorSource::Client(err)) => Err(err).context("client"), + Err(ErrorSource::Compute(err)) => Err(err).context("compute"), + } +} diff --git a/proxy/src/binary/proxy.rs b/proxy/src/binary/proxy.rs new file mode 100644 index 0000000000..e38c49ca10 --- /dev/null +++ b/proxy/src/binary/proxy.rs @@ -0,0 +1,827 @@ +use std::net::SocketAddr; +use std::pin::pin; +use std::sync::Arc; +use std::time::Duration; + +use crate::auth::backend::jwt::JwkCache; +use crate::auth::backend::{AuthRateLimiter, ConsoleRedirectBackend, MaybeOwned}; +use crate::cancellation::{handle_cancel_messages, CancellationHandler}; +use crate::config::{ + self, remote_storage_from_toml, AuthenticationConfig, CacheOptions, ComputeConfig, HttpConfig, + ProjectInfoCacheOptions, ProxyConfig, ProxyProtocolV2, +}; +use crate::context::parquet::ParquetUploadArgs; +use crate::http::health_server::AppMetrics; +use crate::metrics::Metrics; +use crate::rate_limiter::{ + EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo, WakeComputeRateLimiter, +}; +use crate::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider; +use crate::redis::kv_ops::RedisKVClient; +use crate::redis::{elasticache, notifications}; +use crate::scram::threadpool::ThreadPool; +use crate::serverless::cancel_set::CancelSet; +use crate::serverless::GlobalConnPoolOptions; +use crate::tls::client_config::compute_client_config_with_root_certs; +use crate::{auth, control_plane, http, serverless, usage_metrics}; +use anyhow::bail; +use futures::future::Either; +use remote_storage::RemoteStorageConfig; +use tokio::net::TcpListener; +use tokio::task::JoinSet; +use tokio_util::sync::CancellationToken; +use tracing::{info, warn, Instrument}; +use utils::sentry_init::init_sentry; +use utils::{project_build_tag, project_git_version}; + +project_git_version!(GIT_VERSION); +project_build_tag!(BUILD_TAG); + +use clap::{Parser, ValueEnum}; + +#[derive(Clone, Debug, ValueEnum)] +enum AuthBackendType { + #[value(name("cplane-v1"), alias("control-plane"))] + ControlPlaneV1, + + #[value(name("link"), alias("control-redirect"))] + ConsoleRedirect, + + #[cfg(any(test, feature = "testing"))] + Postgres, +} + +/// Neon proxy/router +#[derive(Parser)] +#[command(version = GIT_VERSION, about)] +struct ProxyCliArgs { + /// Name of the region this proxy is deployed in + #[clap(long, default_value_t = String::new())] + region: String, + /// listen for incoming client connections on ip:port + #[clap(short, long, default_value = "127.0.0.1:4432")] + proxy: String, + #[clap(value_enum, long, default_value_t = AuthBackendType::ConsoleRedirect)] + auth_backend: AuthBackendType, + /// listen for management callback connection on ip:port + #[clap(short, long, default_value = "127.0.0.1:7000")] + mgmt: String, + /// listen for incoming http connections (metrics, etc) on ip:port + #[clap(long, default_value = "127.0.0.1:7001")] + http: String, + /// listen for incoming wss connections on ip:port + #[clap(long)] + wss: Option, + /// redirect unauthenticated users to the given uri in case of console redirect auth + #[clap(short, long, default_value = "http://localhost:3000/psql_session/")] + uri: String, + /// cloud API endpoint for authenticating users + #[clap( + short, + long, + default_value = "http://localhost:3000/authenticate_proxy_request/" + )] + auth_endpoint: String, + /// JWT used to connect to control plane. 
+ #[clap( + long, + value_name = "JWT", + default_value = "", + env = "NEON_PROXY_TO_CONTROLPLANE_TOKEN" + )] + control_plane_token: Arc, + /// if this is not local proxy, this toggles whether we accept jwt or passwords for http + #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] + is_auth_broker: bool, + /// path to TLS key for client postgres connections + /// + /// tls-key and tls-cert are for backwards compatibility, we can put all certs in one dir + #[clap(short = 'k', long, alias = "ssl-key")] + tls_key: Option, + /// path to TLS cert for client postgres connections + /// + /// tls-key and tls-cert are for backwards compatibility, we can put all certs in one dir + #[clap(short = 'c', long, alias = "ssl-cert")] + tls_cert: Option, + /// Allow writing TLS session keys to the given file pointed to by the environment variable `SSLKEYLOGFILE`. + #[clap(long, alias = "allow-ssl-keylogfile")] + allow_tls_keylogfile: bool, + /// path to directory with TLS certificates for client postgres connections + #[clap(long)] + certs_dir: Option, + /// timeout for the TLS handshake + #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] + handshake_timeout: tokio::time::Duration, + /// http endpoint to receive periodic metric updates + #[clap(long)] + metric_collection_endpoint: Option, + /// how often metrics should be sent to a collection endpoint + #[clap(long)] + metric_collection_interval: Option, + /// cache for `wake_compute` api method (use `size=0` to disable) + #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] + wake_compute_cache: String, + /// lock for `wake_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable). + #[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK)] + wake_compute_lock: String, + /// lock for `connect_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable). + #[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK)] + connect_compute_lock: String, + #[clap(flatten)] + sql_over_http: SqlOverHttpArgs, + /// timeout for scram authentication protocol + #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] + scram_protocol_timeout: tokio::time::Duration, + /// size of the threadpool for password hashing + #[clap(long, default_value_t = 4)] + scram_thread_pool_size: u8, + /// Endpoint rate limiter max number of requests per second. + /// + /// Provided in the form `@`. + /// Can be given multiple times for different bucket sizes. + #[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)] + endpoint_rps_limit: Vec, + /// Wake compute rate limiter max number of requests per second. + #[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)] + wake_compute_limit: Vec, + /// Whether the auth rate limiter actually takes effect (for testing) + #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] + auth_rate_limit_enabled: bool, + /// Authentication rate limiter max number of hashes per second. + #[clap(long, default_values_t = RateBucketInfo::DEFAULT_AUTH_SET)] + auth_rate_limit: Vec, + /// The IP subnet to use when considering whether two IP addresses are considered the same. 
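The `--auth-rate-limit-ip-subnet` option described above groups client addresses by prefix length so that nearby IPv6 addresses share one rate-limit bucket. Purely as an illustration of that idea, and not the proxy's actual implementation, a std-only sketch of such a prefix-based key:

```rust
use std::net::Ipv6Addr;

// Hypothetical prefix-based key: two addresses sharing the first `prefix` bits map to
// the same bucket. This mirrors the intent of the flag above, not the proxy's code.
fn subnet_key(addr: Ipv6Addr, prefix: u32) -> u128 {
    let prefix = prefix.min(128);
    let mask = if prefix == 0 { 0 } else { u128::MAX << (128 - prefix) };
    u128::from(addr) & mask
}

fn main() {
    let a: Ipv6Addr = "2001:db8:1:2:aaaa::1".parse().unwrap();
    let b: Ipv6Addr = "2001:db8:1:2:bbbb::2".parse().unwrap();
    let c: Ipv6Addr = "2001:db8:ffff:1::1".parse().unwrap();
    // Same /64 prefix, same bucket; a different /64 gets its own bucket.
    assert_eq!(subnet_key(a, 64), subnet_key(b, 64));
    assert_ne!(subnet_key(a, 64), subnet_key(c, 64));
}
```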
+ #[clap(long, default_value_t = 64)] + auth_rate_limit_ip_subnet: u8, + /// Redis rate limiter max number of requests per second. + #[clap(long, default_values_t = RateBucketInfo::DEFAULT_REDIS_SET)] + redis_rps_limit: Vec, + /// Cancellation channel size (max queue size for redis kv client) + #[clap(long, default_value = "1024")] + cancellation_ch_size: usize, + /// cache for `allowed_ips` (use `size=0` to disable) + #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] + allowed_ips_cache: String, + /// cache for `role_secret` (use `size=0` to disable) + #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] + role_secret_cache: String, + /// redis url for notifications (if empty, redis_host:port will be used for both notifications and streaming connections) + #[clap(long)] + redis_notifications: Option, + /// what from the available authentications type to use for the regional redis we have. Supported are "irsa" and "plain". + #[clap(long, default_value = "irsa")] + redis_auth_type: String, + /// redis host for streaming connections (might be different from the notifications host) + #[clap(long)] + redis_host: Option, + /// redis port for streaming connections (might be different from the notifications host) + #[clap(long)] + redis_port: Option, + /// redis cluster name, used in aws elasticache + #[clap(long)] + redis_cluster_name: Option, + /// redis user_id, used in aws elasticache + #[clap(long)] + redis_user_id: Option, + /// aws region to retrieve credentials + #[clap(long, default_value_t = String::new())] + aws_region: String, + /// cache for `project_info` (use `size=0` to disable) + #[clap(long, default_value = config::ProjectInfoCacheOptions::CACHE_DEFAULT_OPTIONS)] + project_info_cache: String, + /// cache for all valid endpoints + #[clap(long, default_value = config::EndpointCacheConfig::CACHE_DEFAULT_OPTIONS)] + endpoint_cache_config: String, + #[clap(flatten)] + parquet_upload: ParquetUploadArgs, + + /// interval for backup metric collection + #[clap(long, default_value = "10m", value_parser = humantime::parse_duration)] + metric_backup_collection_interval: std::time::Duration, + /// remote storage configuration for backup metric collection + /// Encoded as toml (same format as pageservers), eg + /// `{bucket_name='the-bucket',bucket_region='us-east-1',prefix_in_bucket='proxy',endpoint='http://minio:9000'}` + #[clap(long, value_parser = remote_storage_from_toml)] + metric_backup_collection_remote_storage: Option, + /// chunk size for backup metric collection + /// Size of each event is no more than 400 bytes, so 2**22 is about 200MB before the compression. + #[clap(long, default_value = "4194304")] + metric_backup_collection_chunk_size: usize, + /// Whether to retry the connection to the compute node + #[clap(long, default_value = config::RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES)] + connect_to_compute_retry: String, + /// Whether to retry the wake_compute request + #[clap(long, default_value = config::RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)] + wake_compute_retry: String, + + /// Configure if this is a private access proxy for the POC: In that case the proxy will ignore the IP allowlist + #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] + is_private_access_proxy: bool, + + /// Configure whether all incoming requests have a Proxy Protocol V2 packet. 
+ // TODO(conradludgate): switch default to rejected or required once we've updated all deployments + #[clap(value_enum, long, default_value_t = ProxyProtocolV2::Supported)] + proxy_protocol_v2: ProxyProtocolV2, + + /// Time the proxy waits for the webauth session to be confirmed by the control plane. + // TODO: rename to `console_redirect_confirmation_timeout`. + #[clap(long, default_value = "2m", value_parser = humantime::parse_duration)] + webauth_confirmation_timeout: std::time::Duration, +} + +#[derive(clap::Args, Clone, Copy, Debug)] +struct SqlOverHttpArgs { + /// timeout for http connection requests + #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] + sql_over_http_timeout: tokio::time::Duration, + + /// Whether the SQL over http pool is opt-in + #[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] + sql_over_http_pool_opt_in: bool, + + /// How many connections to pool for each endpoint. Excess connections are discarded + #[clap(long, default_value_t = 20)] + sql_over_http_pool_max_conns_per_endpoint: usize, + + /// How many connections to pool for each endpoint. Excess connections are discarded + #[clap(long, default_value_t = 20000)] + sql_over_http_pool_max_total_conns: usize, + + /// How long pooled connections should remain idle for before closing + #[clap(long, default_value = "5m", value_parser = humantime::parse_duration)] + sql_over_http_idle_timeout: tokio::time::Duration, + + /// Duration each shard will wait on average before a GC sweep. + /// A longer time will causes sweeps to take longer but will interfere less frequently. + #[clap(long, default_value = "10m", value_parser = humantime::parse_duration)] + sql_over_http_pool_gc_epoch: tokio::time::Duration, + + /// How many shards should the global pool have. Must be a power of two. 
+ /// More shards will introduce less contention for pool operations, but can + /// increase memory used by the pool + #[clap(long, default_value_t = 128)] + sql_over_http_pool_shards: usize, + + #[clap(long, default_value_t = 10000)] + sql_over_http_client_conn_threshold: u64, + + #[clap(long, default_value_t = 64)] + sql_over_http_cancel_set_shards: usize, + + #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB + sql_over_http_max_request_size_bytes: usize, + + #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB + sql_over_http_max_response_size_bytes: usize, +} + +pub async fn run() -> anyhow::Result<()> { + let _logging_guard = crate::logging::init().await?; + let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook(); + let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); + + // TODO: refactor these to use labels + info!("Version: {GIT_VERSION}"); + info!("Build_tag: {BUILD_TAG}"); + let neon_metrics = ::metrics::NeonMetrics::new(::metrics::BuildInfo { + revision: GIT_VERSION, + build_tag: BUILD_TAG, + }); + + let jemalloc = match crate::jemalloc::MetricRecorder::new() { + Ok(t) => Some(t), + Err(e) => { + tracing::error!(error = ?e, "could not start jemalloc metrics loop"); + None + } + }; + + let args = ProxyCliArgs::parse(); + let config = build_config(&args)?; + let auth_backend = build_auth_backend(&args)?; + + match auth_backend { + Either::Left(auth_backend) => info!("Authentication backend: {auth_backend}"), + Either::Right(auth_backend) => info!("Authentication backend: {auth_backend:?}"), + }; + info!("Using region: {}", args.aws_region); + + // TODO: untangle the config args + let regional_redis_client = match (args.redis_auth_type.as_str(), &args.redis_notifications) { + ("plain", redis_url) => match redis_url { + None => { + bail!("plain auth requires redis_notifications to be set"); + } + Some(url) => Some( + ConnectionWithCredentialsProvider::new_with_static_credentials(url.to_string()), + ), + }, + ("irsa", _) => match (&args.redis_host, args.redis_port) { + (Some(host), Some(port)) => Some( + ConnectionWithCredentialsProvider::new_with_credentials_provider( + host.to_string(), + port, + elasticache::CredentialsProvider::new( + args.aws_region, + args.redis_cluster_name, + args.redis_user_id, + ) + .await, + ), + ), + (None, None) => { + warn!("irsa auth requires redis-host and redis-port to be set, continuing without regional_redis_client"); + None + } + _ => { + bail!("redis-host and redis-port must be specified together"); + } + }, + _ => { + bail!("unknown auth type given"); + } + }; + + let redis_notifications_client = if let Some(url) = args.redis_notifications { + Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url)) + } else { + regional_redis_client.clone() + }; + + // Check that we can bind to address before further initialization + let http_address: SocketAddr = args.http.parse()?; + info!("Starting http on {http_address}"); + let http_listener = TcpListener::bind(http_address).await?.into_std()?; + + let mgmt_address: SocketAddr = args.mgmt.parse()?; + info!("Starting mgmt on {mgmt_address}"); + let mgmt_listener = TcpListener::bind(mgmt_address).await?; + + let proxy_listener = if args.is_auth_broker { + None + } else { + let proxy_address: SocketAddr = args.proxy.parse()?; + info!("Starting proxy on {proxy_address}"); + + Some(TcpListener::bind(proxy_address).await?) + }; + + // TODO: rename the argument to something like serverless. 
+ // It now covers more than just websockets, it also covers SQL over HTTP. + let serverless_listener = if let Some(serverless_address) = args.wss { + let serverless_address: SocketAddr = serverless_address.parse()?; + info!("Starting wss on {serverless_address}"); + Some(TcpListener::bind(serverless_address).await?) + } else if args.is_auth_broker { + bail!("wss arg must be present for auth-broker") + } else { + None + }; + + let cancellation_token = CancellationToken::new(); + + let redis_rps_limit = Vec::leak(args.redis_rps_limit.clone()); + RateBucketInfo::validate(redis_rps_limit)?; + + let redis_kv_client = regional_redis_client + .as_ref() + .map(|redis_publisher| RedisKVClient::new(redis_publisher.clone(), redis_rps_limit)); + + // channel size should be higher than redis client limit to avoid blocking + let cancel_ch_size = args.cancellation_ch_size; + let (tx_cancel, rx_cancel) = tokio::sync::mpsc::channel(cancel_ch_size); + let cancellation_handler = Arc::new(CancellationHandler::new( + &config.connect_to_compute, + Some(tx_cancel), + )); + + // bit of a hack - find the min rps and max rps supported and turn it into + // leaky bucket config instead + let max = args + .endpoint_rps_limit + .iter() + .map(|x| x.rps()) + .max_by(f64::total_cmp) + .unwrap_or(EndpointRateLimiter::DEFAULT.max); + let rps = args + .endpoint_rps_limit + .iter() + .map(|x| x.rps()) + .min_by(f64::total_cmp) + .unwrap_or(EndpointRateLimiter::DEFAULT.rps); + let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards( + LeakyBucketConfig { rps, max }, + 64, + )); + + // client facing tasks. these will exit on error or on cancellation + // cancellation returns Ok(()) + let mut client_tasks = JoinSet::new(); + match auth_backend { + Either::Left(auth_backend) => { + if let Some(proxy_listener) = proxy_listener { + client_tasks.spawn(crate::proxy::task_main( + config, + auth_backend, + proxy_listener, + cancellation_token.clone(), + cancellation_handler.clone(), + endpoint_rate_limiter.clone(), + )); + } + + if let Some(serverless_listener) = serverless_listener { + client_tasks.spawn(serverless::task_main( + config, + auth_backend, + serverless_listener, + cancellation_token.clone(), + cancellation_handler.clone(), + endpoint_rate_limiter.clone(), + )); + } + } + Either::Right(auth_backend) => { + if let Some(proxy_listener) = proxy_listener { + client_tasks.spawn(crate::console_redirect_proxy::task_main( + config, + auth_backend, + proxy_listener, + cancellation_token.clone(), + cancellation_handler.clone(), + )); + } + } + } + + client_tasks.spawn(crate::context::parquet::worker( + cancellation_token.clone(), + args.parquet_upload, + )); + + // maintenance tasks. these never return unless there's an error + let mut maintenance_tasks = JoinSet::new(); + maintenance_tasks.spawn(crate::signals::handle(cancellation_token.clone(), || {})); + maintenance_tasks.spawn(http::health_server::task_main( + http_listener, + AppMetrics { + jemalloc, + neon_metrics, + proxy: crate::metrics::Metrics::get(), + }, + )); + maintenance_tasks.spawn(control_plane::mgmt::task_main(mgmt_listener)); + + if let Some(metrics_config) = &config.metric_collection { + // TODO: Add gc regardles of the metric collection being enabled. 
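The "bit of a hack" noted above collapses however many `--endpoint-rps-limit` buckets were configured into a single leaky-bucket shape: the smallest configured rate becomes the sustained `rps`, and the largest becomes the burst capacity `max`. A minimal std-only sketch of that reduction, with `Bucket` as a stand-in for the real rate-bucket type:

```rust
// `Bucket` is a stand-in for RateBucketInfo; only the per-second rate matters here.
struct Bucket {
    rps: f64,
}

fn leaky_bucket_params(buckets: &[Bucket], default_rps: f64, default_max: f64) -> (f64, f64) {
    let rps = buckets
        .iter()
        .map(|b| b.rps)
        .min_by(f64::total_cmp)
        .unwrap_or(default_rps);
    let max = buckets
        .iter()
        .map(|b| b.rps)
        .max_by(f64::total_cmp)
        .unwrap_or(default_max);
    (rps, max)
}

fn main() {
    let buckets = [Bucket { rps: 100.0 }, Bucket { rps: 300.0 }, Bucket { rps: 50.0 }];
    // Sustained 50 rps, with bursts of up to 300.
    assert_eq!(leaky_bucket_params(&buckets, 200.0, 600.0), (50.0, 300.0));
    // With no buckets configured, the defaults win.
    assert_eq!(leaky_bucket_params(&[], 200.0, 600.0), (200.0, 600.0));
}
```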
+ maintenance_tasks.spawn(usage_metrics::task_main(metrics_config)); + } + + #[cfg_attr(not(any(test, feature = "testing")), expect(irrefutable_let_patterns))] + if let Either::Left(auth::Backend::ControlPlane(api, ())) = &auth_backend { + if let crate::control_plane::client::ControlPlaneClient::ProxyV1(api) = &**api { + match (redis_notifications_client, regional_redis_client.clone()) { + (None, None) => {} + (client1, client2) => { + let cache = api.caches.project_info.clone(); + if let Some(client) = client1 { + maintenance_tasks.spawn(notifications::task_main( + client, + cache.clone(), + args.region.clone(), + )); + } + if let Some(client) = client2 { + maintenance_tasks.spawn(notifications::task_main( + client, + cache.clone(), + args.region.clone(), + )); + } + maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); + } + } + + if let Some(mut redis_kv_client) = redis_kv_client { + maintenance_tasks.spawn(async move { + redis_kv_client.try_connect().await?; + handle_cancel_messages(&mut redis_kv_client, rx_cancel).await + }); + } + + if let Some(regional_redis_client) = regional_redis_client { + let cache = api.caches.endpoints_cache.clone(); + let con = regional_redis_client; + let span = tracing::info_span!("endpoints_cache"); + maintenance_tasks.spawn( + async move { cache.do_read(con, cancellation_token.clone()).await } + .instrument(span), + ); + } + } + } + + let maintenance = loop { + // get one complete task + match futures::future::select( + pin!(maintenance_tasks.join_next()), + pin!(client_tasks.join_next()), + ) + .await + { + // exit immediately on maintenance task completion + Either::Left((Some(res), _)) => break crate::error::flatten_err(res)?, + // exit with error immediately if all maintenance tasks have ceased (should be caught by branch above) + Either::Left((None, _)) => bail!("no maintenance tasks running. invalid state"), + // exit immediately on client task error + Either::Right((Some(res), _)) => crate::error::flatten_err(res)?, + // exit if all our client tasks have shutdown gracefully + Either::Right((None, _)) => return Ok(()), + } + }; + + // maintenance tasks return Infallible success values, this is an impossible value + // so this match statically ensures that there are no possibilities for that value + match maintenance {} +} + +/// ProxyConfig is created at proxy startup, and lives forever. 
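The `match maintenance {}` at the end of `run()` above relies on the maintenance tasks' success type being uninhabited (`Infallible`): a value of that type can never exist, so an empty match is exhaustive and statically documents that only the error path is reachable. A tiny self-contained illustration of the same trick:

```rust
use std::convert::Infallible;

// A task whose Ok type is uninhabited: it can only ever yield an error.
fn maintenance_task() -> Result<Infallible, String> {
    // A real maintenance task would loop forever; returning Ok is impossible.
    Err("maintenance task failed".to_string())
}

fn main() {
    match maintenance_task() {
        // No body needed: `Infallible` has no variants, so `match v {}` is exhaustive
        // and proves this branch can never be taken at runtime.
        Ok(v) => match v {},
        Err(e) => println!("stopping: {e}"),
    }
}
```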
+fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { + let thread_pool = ThreadPool::new(args.scram_thread_pool_size); + Metrics::install(thread_pool.metrics.clone()); + + let tls_config = match (&args.tls_key, &args.tls_cert) { + (Some(key_path), Some(cert_path)) => Some(config::configure_tls( + key_path, + cert_path, + args.certs_dir.as_ref(), + args.allow_tls_keylogfile, + )?), + (None, None) => None, + _ => bail!("either both or neither tls-key and tls-cert must be specified"), + }; + + let backup_metric_collection_config = config::MetricBackupCollectionConfig { + remote_storage_config: args.metric_backup_collection_remote_storage.clone(), + chunk_size: args.metric_backup_collection_chunk_size, + }; + + let metric_collection = match ( + &args.metric_collection_endpoint, + &args.metric_collection_interval, + ) { + (Some(endpoint), Some(interval)) => Some(config::MetricCollectionConfig { + endpoint: endpoint.parse()?, + interval: humantime::parse_duration(interval)?, + backup_metric_collection_config, + }), + (None, None) => None, + _ => bail!( + "either both or neither metric-collection-endpoint \ + and metric-collection-interval must be specified" + ), + }; + + let config::ConcurrencyLockOptions { + shards, + limiter, + epoch, + timeout, + } = args.connect_compute_lock.parse()?; + info!( + ?limiter, + shards, + ?epoch, + "Using NodeLocks (connect_compute)" + ); + let connect_compute_locks = control_plane::locks::ApiLocks::new( + "connect_compute_lock", + limiter, + shards, + timeout, + epoch, + &Metrics::get().proxy.connect_compute_lock, + ); + + let http_config = HttpConfig { + accept_websockets: !args.is_auth_broker, + pool_options: GlobalConnPoolOptions { + max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_conns_per_endpoint, + gc_epoch: args.sql_over_http.sql_over_http_pool_gc_epoch, + pool_shards: args.sql_over_http.sql_over_http_pool_shards, + idle_timeout: args.sql_over_http.sql_over_http_idle_timeout, + opt_in: args.sql_over_http.sql_over_http_pool_opt_in, + max_total_conns: args.sql_over_http.sql_over_http_pool_max_total_conns, + }, + cancel_set: CancelSet::new(args.sql_over_http.sql_over_http_cancel_set_shards), + client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold, + max_request_size_bytes: args.sql_over_http.sql_over_http_max_request_size_bytes, + max_response_size_bytes: args.sql_over_http.sql_over_http_max_response_size_bytes, + }; + let authentication_config = AuthenticationConfig { + jwks_cache: JwkCache::default(), + thread_pool, + scram_protocol_timeout: args.scram_protocol_timeout, + rate_limiter_enabled: args.auth_rate_limit_enabled, + rate_limiter: AuthRateLimiter::new(args.auth_rate_limit.clone()), + rate_limit_ip_subnet: args.auth_rate_limit_ip_subnet, + ip_allowlist_check_enabled: !args.is_private_access_proxy, + is_vpc_acccess_proxy: args.is_private_access_proxy, + is_auth_broker: args.is_auth_broker, + accept_jwts: args.is_auth_broker, + console_redirect_confirmation_timeout: args.webauth_confirmation_timeout, + }; + + let compute_config = ComputeConfig { + retry: config::RetryConfig::parse(&args.connect_to_compute_retry)?, + tls: Arc::new(compute_client_config_with_root_certs()?), + timeout: Duration::from_secs(2), + }; + + let config = ProxyConfig { + tls_config, + metric_collection, + http_config, + authentication_config, + proxy_protocol_v2: args.proxy_protocol_v2, + handshake_timeout: args.handshake_timeout, + region: args.region.clone(), + wake_compute_retry_config: 
config::RetryConfig::parse(&args.wake_compute_retry)?, + connect_compute_locks, + connect_to_compute: compute_config, + }; + + let config = Box::leak(Box::new(config)); + + tokio::spawn(config.connect_compute_locks.garbage_collect_worker()); + + Ok(config) +} + +/// auth::Backend is created at proxy startup, and lives forever. +fn build_auth_backend( + args: &ProxyCliArgs, +) -> anyhow::Result, &'static ConsoleRedirectBackend>> { + match &args.auth_backend { + AuthBackendType::ControlPlaneV1 => { + let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; + let project_info_cache_config: ProjectInfoCacheOptions = + args.project_info_cache.parse()?; + let endpoint_cache_config: config::EndpointCacheConfig = + args.endpoint_cache_config.parse()?; + + info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}"); + info!( + "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}" + ); + info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}"); + let caches = Box::leak(Box::new(control_plane::caches::ApiCaches::new( + wake_compute_cache_config, + project_info_cache_config, + endpoint_cache_config, + ))); + + let config::ConcurrencyLockOptions { + shards, + limiter, + epoch, + timeout, + } = args.wake_compute_lock.parse()?; + info!(?limiter, shards, ?epoch, "Using NodeLocks (wake_compute)"); + let locks = Box::leak(Box::new(control_plane::locks::ApiLocks::new( + "wake_compute_lock", + limiter, + shards, + timeout, + epoch, + &Metrics::get().wake_compute_lock, + ))); + tokio::spawn(locks.garbage_collect_worker()); + + let url: crate::url::ApiUrl = args.auth_endpoint.parse()?; + + let endpoint = http::Endpoint::new(url, http::new_client()); + + let mut wake_compute_rps_limit = args.wake_compute_limit.clone(); + RateBucketInfo::validate(&mut wake_compute_rps_limit)?; + let wake_compute_endpoint_rate_limiter = + Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit)); + + let api = control_plane::client::cplane_proxy_v1::NeonControlPlaneClient::new( + endpoint, + args.control_plane_token.clone(), + caches, + locks, + wake_compute_endpoint_rate_limiter, + ); + + let api = control_plane::client::ControlPlaneClient::ProxyV1(api); + let auth_backend = auth::Backend::ControlPlane(MaybeOwned::Owned(api), ()); + let config = Box::leak(Box::new(auth_backend)); + + Ok(Either::Left(config)) + } + + #[cfg(any(test, feature = "testing"))] + AuthBackendType::Postgres => { + let url = args.auth_endpoint.parse()?; + let api = control_plane::client::mock::MockControlPlane::new( + url, + !args.is_private_access_proxy, + ); + let api = control_plane::client::ControlPlaneClient::PostgresMock(api); + + let auth_backend = auth::Backend::ControlPlane(MaybeOwned::Owned(api), ()); + + let config = Box::leak(Box::new(auth_backend)); + + Ok(Either::Left(config)) + } + + AuthBackendType::ConsoleRedirect => { + let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; + let project_info_cache_config: ProjectInfoCacheOptions = + args.project_info_cache.parse()?; + let endpoint_cache_config: config::EndpointCacheConfig = + args.endpoint_cache_config.parse()?; + + info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}"); + info!( + "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}" + ); + info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}"); + let caches = Box::leak(Box::new(control_plane::caches::ApiCaches::new( + 
wake_compute_cache_config, + project_info_cache_config, + endpoint_cache_config, + ))); + + let config::ConcurrencyLockOptions { + shards, + limiter, + epoch, + timeout, + } = args.wake_compute_lock.parse()?; + info!(?limiter, shards, ?epoch, "Using NodeLocks (wake_compute)"); + let locks = Box::leak(Box::new(control_plane::locks::ApiLocks::new( + "wake_compute_lock", + limiter, + shards, + timeout, + epoch, + &Metrics::get().wake_compute_lock, + ))); + + let url = args.uri.clone().parse()?; + let ep_url: crate::url::ApiUrl = args.auth_endpoint.parse()?; + let endpoint = http::Endpoint::new(ep_url, http::new_client()); + let mut wake_compute_rps_limit = args.wake_compute_limit.clone(); + RateBucketInfo::validate(&mut wake_compute_rps_limit)?; + let wake_compute_endpoint_rate_limiter = + Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit)); + + // Since we use only get_allowed_ips_and_secret() wake_compute_endpoint_rate_limiter + // and locks are not used in ConsoleRedirectBackend, + // but they are required by the NeonControlPlaneClient + let api = control_plane::client::cplane_proxy_v1::NeonControlPlaneClient::new( + endpoint, + args.control_plane_token.clone(), + caches, + locks, + wake_compute_endpoint_rate_limiter, + ); + + let backend = ConsoleRedirectBackend::new(url, api); + let config = Box::leak(Box::new(backend)); + + Ok(Either::Right(config)) + } + } +} + +#[cfg(test)] +mod tests { + use std::time::Duration; + + use crate::rate_limiter::RateBucketInfo; + use clap::Parser; + + #[test] + fn parse_endpoint_rps_limit() { + let config = super::ProxyCliArgs::parse_from([ + "proxy", + "--endpoint-rps-limit", + "100@1s", + "--endpoint-rps-limit", + "20@30s", + ]); + + assert_eq!( + config.endpoint_rps_limit, + vec![ + RateBucketInfo::new(100, Duration::from_secs(1)), + RateBucketInfo::new(20, Duration::from_secs(30)), + ] + ); + } +} diff --git a/proxy/src/compute_ctl/mod.rs b/proxy/src/compute_ctl/mod.rs index 60fdf107d4..ab3179afb2 100644 --- a/proxy/src/compute_ctl/mod.rs +++ b/proxy/src/compute_ctl/mod.rs @@ -42,14 +42,14 @@ pub enum Privilege { #[derive(Error, Debug)] pub enum ComputeCtlError { #[error("connection error: {0}")] - ConnectionError(#[source] reqwest_middleware::Error), + Connection(#[source] reqwest_middleware::Error), #[error("request error [{status}]: {body:?}")] - RequestError { + Request { status: StatusCode, body: Option, }, #[error("response parsing error: {0}")] - ResponseError(#[source] reqwest::Error), + Response(#[source] reqwest::Error), } impl ComputeCtlApi { @@ -89,14 +89,14 @@ impl ComputeCtlApi { .json(req) .send() .await - .map_err(ComputeCtlError::ConnectionError)?; + .map_err(ComputeCtlError::Connection)?; let status = resp.status(); if status.is_client_error() || status.is_server_error() { let body = resp.json().await.ok(); - return Err(ComputeCtlError::RequestError { status, body }); + return Err(ComputeCtlError::Request { status, body }); } - resp.json().await.map_err(ComputeCtlError::ResponseError) + resp.json().await.map_err(ComputeCtlError::Response) } } diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 1dcd37712e..460e0cff54 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -151,7 +151,6 @@ impl FromStr for EndpointCacheConfig { } #[derive(Debug)] pub struct MetricBackupCollectionConfig { - pub interval: Duration, pub remote_storage_config: Option, pub chunk_size: usize, } diff --git a/proxy/src/control_plane/client/mod.rs b/proxy/src/control_plane/client/mod.rs index a06943726e..c28ff4789d 100644 --- 
a/proxy/src/control_plane/client/mod.rs +++ b/proxy/src/control_plane/client/mod.rs @@ -212,15 +212,15 @@ impl ApiLocks { timeout: Duration, epoch: std::time::Duration, metrics: &'static ApiLockMetrics, - ) -> prometheus::Result { - Ok(Self { + ) -> Self { + Self { name, node_locks: ClashMap::with_shard_amount(shards), config, timeout, epoch, metrics, - }) + } } pub(crate) async fn get_permit(&self, key: &K) -> Result { diff --git a/proxy/src/control_plane/messages.rs b/proxy/src/control_plane/messages.rs index 5883d02b92..8d6b2e96f5 100644 --- a/proxy/src/control_plane/messages.rs +++ b/proxy/src/control_plane/messages.rs @@ -361,7 +361,8 @@ pub struct EndpointJwksResponse { pub struct JwksSettings { pub id: String, pub jwks_url: url::Url, - pub provider_name: String, + #[serde(rename = "provider_name")] + pub _provider_name: String, pub jwt_audience: Option, pub role_names: Vec, } diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index c56474edd7..a9e5fbc85b 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -72,34 +72,36 @@ // List of temporarily allowed lints to unblock beta/nightly. #![allow(unknown_lints)] -pub mod auth; -pub mod cache; -pub mod cancellation; -pub mod compute; -pub mod compute_ctl; -pub mod config; -pub mod console_redirect_proxy; -pub mod context; -pub mod control_plane; -pub mod error; +pub mod binary; + +mod auth; +mod cache; +mod cancellation; +mod compute; +mod compute_ctl; +mod config; +mod console_redirect_proxy; +mod context; +mod control_plane; +mod error; mod ext; -pub mod http; -pub mod intern; -pub mod jemalloc; -pub mod logging; -pub mod metrics; -pub mod parse; -pub mod protocol2; -pub mod proxy; -pub mod rate_limiter; -pub mod redis; -pub mod sasl; -pub mod scram; -pub mod serverless; -pub mod signals; -pub mod stream; -pub mod tls; -pub mod types; -pub mod url; -pub mod usage_metrics; -pub mod waiters; +mod http; +mod intern; +mod jemalloc; +mod logging; +mod metrics; +mod parse; +mod protocol2; +mod proxy; +mod rate_limiter; +mod redis; +mod sasl; +mod scram; +mod serverless; +mod signals; +mod stream; +mod tls; +mod types; +mod url; +mod usage_metrics; +mod waiters; diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 25bcc81108..f3447e063e 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -205,7 +205,7 @@ pub enum Protocol { } impl Protocol { - pub fn as_str(&self) -> &'static str { + pub fn as_str(self) -> &'static str { match self { Protocol::Http => "http", Protocol::Ws => "ws", @@ -385,6 +385,7 @@ pub enum Waiting { #[derive(FixedCardinalityLabel, Copy, Clone)] #[label(singleton = "kind")] +#[allow(clippy::enum_variant_names)] pub enum RedisMsgKind { HSet, HSetMultiple, diff --git a/proxy/src/redis/cancellation_publisher.rs b/proxy/src/redis/cancellation_publisher.rs index 30d8b83e60..186fece4b2 100644 --- a/proxy/src/redis/cancellation_publisher.rs +++ b/proxy/src/redis/cancellation_publisher.rs @@ -5,9 +5,6 @@ use pq_proto::CancelKeyData; use tokio::sync::Mutex; use uuid::Uuid; -use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider; -use crate::rate_limiter::{GlobalRateLimiter, RateBucketInfo}; - pub trait CancellationPublisherMut: Send + Sync + 'static { #[allow(async_fn_in_trait)] async fn try_publish( @@ -79,36 +76,3 @@ impl CancellationPublisher for Arc> { .await } } - -pub struct RedisPublisherClient { - #[allow(dead_code)] - client: ConnectionWithCredentialsProvider, - _region_id: String, - _limiter: GlobalRateLimiter, -} - -impl RedisPublisherClient { - pub fn new( - client: 
ConnectionWithCredentialsProvider, - region_id: String, - info: &'static [RateBucketInfo], - ) -> anyhow::Result { - Ok(Self { - client, - _region_id: region_id, - _limiter: GlobalRateLimiter::new(info.into()), - }) - } - - #[allow(dead_code)] - pub(crate) async fn try_connect(&mut self) -> anyhow::Result<()> { - match self.client.connect().await { - Ok(()) => {} - Err(e) => { - tracing::error!("failed to connect to redis: {e}"); - return Err(e); - } - } - Ok(()) - } -} diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 8739ce49f9..2eee3b7165 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -23,7 +23,6 @@ use tokio_util::sync::CancellationToken; use tracing::{debug, error, info}; use typed_json::json; use url::Url; -use urlencoding; use utils::http::error::ApiError; use uuid::Uuid; From cd51ed2f8621bdcb42f54045387a99ebba953186 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 11 Feb 2025 20:09:41 +0000 Subject: [PATCH 437/583] tests: parametrize test_graceful_cluster_restart on AZ count (#10427) ## Problem In https://github.com/neondatabase/neon/pull/10411 fill logic changes such that it benefits us to test it with & without AZs set up. I didn't extend the test inline in that PR because there were overlapping test changes in flight to add `num_az` parameter. ## Summary of changes - Parameterise test on AZ count (1 or 2) - When AZ count is 2, use a different balance check that just asserts the _tenants_ are balanced (since AZ affinity is chosen on a per-tenant basis) --- .../regress/test_storage_controller.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 11a4d09202..2750826aec 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -2139,12 +2139,18 @@ def test_tenant_import(neon_env_builder: NeonEnvBuilder, shard_count, remote_sto workload.validate() -def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder): +@pytest.mark.parametrize("num_azs", [1, 2]) +def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder, num_azs: int): """ Graceful reststart of storage controller clusters use the drain and fill hooks in order to migrate attachments away from pageservers before restarting. In practice, Ansible will drive this process. + + Test is parametrized on the number of AZs to exercise the AZ-driven behavior + of reliably moving shards back to their home AZ, and the behavior for AZ-agnostic + tenants where we fill based on a target shard count. 
""" + neon_env_builder.num_azs = num_azs neon_env_builder.num_pageservers = 2 env = neon_env_builder.init_configs() env.start() @@ -2174,8 +2180,15 @@ def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder): min_shard_count = min(shard_counts.values()) max_shard_count = max(shard_counts.values()) - flake_factor = 5 / 100 - assert max_shard_count - min_shard_count <= int(total_shards * flake_factor) + if num_azs == 1: + # AZ-agnostic case: we expect all nodes to have the same number of shards, within some bound + flake_factor = 5 / 100 + assert max_shard_count - min_shard_count <= int(total_shards * flake_factor) + else: + # AZ-driven case: we expect tenants to have been round-robin allocated to AZs, + # and after the restart they should all be back in their home AZ, so difference + # should be at most a single shard's tenants + assert max_shard_count - min_shard_count <= shard_count_per_tenant # Perform a graceful rolling restart for ps in env.pageservers: From b5e09fdaf35d60c6eabc5795b72291df2a0d0eea Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 11 Feb 2025 22:10:06 +0200 Subject: [PATCH 438/583] Re-order Dockerfile steps for putting together final compute image (#10736) Run "apt install" first, and only then COPY the files from the intermediary build layers to the final image. This way, if you modify any of the sources that trigger e.g. rebuilding compute_ctl, the "apt install" step can still be cached. --- compute/compute-node.Dockerfile | 88 ++++++++++++++++----------------- 1 file changed, 44 insertions(+), 44 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 4357256093..47637b4684 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1773,48 +1773,6 @@ ARG DEBIAN_VERSION # Use strict mode for bash to catch errors early SHELL ["/bin/bash", "-euo", "pipefail", "-c"] -# Add user postgres -RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ - echo "postgres:test_console_pass" | chpasswd && \ - mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \ - mkdir /var/db/postgres/pgbouncer && \ - chown -R postgres:postgres /var/db/postgres && \ - chmod 0750 /var/db/postgres/compute && \ - chmod 0750 /var/db/postgres/pgbouncer && \ - echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig && \ - # create folder for file cache - mkdir -p -m 777 /neon/cache - -# aws cli is used by fast_import -COPY --from=awscli /usr/local/aws-cli /usr/local/aws-cli - -COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local -COPY --from=compute-tools --chown=postgres /home/nonroot/target-bin/compute_ctl /usr/local/bin/compute_ctl -COPY --from=compute-tools --chown=postgres /home/nonroot/target-bin/fast_import /usr/local/bin/fast_import - -# pgbouncer and its config -COPY --from=pgbouncer /usr/local/pgbouncer/bin/pgbouncer /usr/local/bin/pgbouncer -COPY --chmod=0666 --chown=postgres compute/etc/pgbouncer.ini /etc/pgbouncer.ini - -# local_proxy and its config -COPY --from=compute-tools --chown=postgres /home/nonroot/target-bin/local_proxy /usr/local/bin/local_proxy -RUN mkdir -p /etc/local_proxy && chown postgres:postgres /etc/local_proxy - -# Metrics exporter binaries and configuration files -COPY --from=exporters ./postgres_exporter /bin/postgres_exporter -COPY --from=exporters ./pgbouncer_exporter /bin/pgbouncer_exporter -COPY --from=exporters ./sql_exporter /bin/sql_exporter - -COPY --chown=postgres compute/etc/postgres_exporter.yml 
/etc/postgres_exporter.yml - -COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql_exporter.yml /etc/sql_exporter.yml -COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neon_collector.yml /etc/neon_collector.yml -COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql_exporter_autoscaling.yml /etc/sql_exporter_autoscaling.yml -COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neon_collector_autoscaling.yml /etc/neon_collector_autoscaling.yml - -# Create remote extension download directory -RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/local/download_extensions - # Install: # libreadline8 for psql # liblz4-1 for lz4 @@ -1825,10 +1783,8 @@ RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/loca # libboost* for rdkit # ca-certificates for communicating with s3 by compute_ctl # libevent for pgbouncer - RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \ echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc - RUN apt update && \ case $DEBIAN_VERSION in \ # Version-specific installs for Bullseye (PG14-PG16): @@ -1871,6 +1827,50 @@ RUN apt update && \ apt clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8 +# Add user postgres +RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ + echo "postgres:test_console_pass" | chpasswd && \ + mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \ + mkdir /var/db/postgres/pgbouncer && \ + chown -R postgres:postgres /var/db/postgres && \ + chmod 0750 /var/db/postgres/compute && \ + chmod 0750 /var/db/postgres/pgbouncer && \ + # create folder for file cache + mkdir -p -m 777 /neon/cache && \ + # Create remote extension download directory + mkdir /usr/local/download_extensions && \ + chown -R postgres:postgres /usr/local/download_extensions + +# aws cli is used by fast_import +COPY --from=awscli /usr/local/aws-cli /usr/local/aws-cli + +# pgbouncer and its config +COPY --from=pgbouncer /usr/local/pgbouncer/bin/pgbouncer /usr/local/bin/pgbouncer +COPY --chmod=0666 --chown=postgres compute/etc/pgbouncer.ini /etc/pgbouncer.ini + +COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local +COPY --from=compute-tools --chown=postgres /home/nonroot/target-bin/compute_ctl /usr/local/bin/compute_ctl +COPY --from=compute-tools --chown=postgres /home/nonroot/target-bin/fast_import /usr/local/bin/fast_import + +# local_proxy and its config +COPY --from=compute-tools --chown=postgres /home/nonroot/target-bin/local_proxy /usr/local/bin/local_proxy +RUN mkdir -p /etc/local_proxy && chown postgres:postgres /etc/local_proxy + +# Metrics exporter binaries and configuration files +COPY --from=exporters ./postgres_exporter /bin/postgres_exporter +COPY --from=exporters ./pgbouncer_exporter /bin/pgbouncer_exporter +COPY --from=exporters ./sql_exporter /bin/sql_exporter + +COPY --chown=postgres compute/etc/postgres_exporter.yml /etc/postgres_exporter.yml + +COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql_exporter.yml /etc/sql_exporter.yml +COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neon_collector.yml /etc/neon_collector.yml +COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql_exporter_autoscaling.yml /etc/sql_exporter_autoscaling.yml +COPY 
--from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neon_collector_autoscaling.yml /etc/neon_collector_autoscaling.yml + +# Make the libraries we built available +RUN echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig + ENV LANG=en_US.utf8 USER postgres ENTRYPOINT ["/usr/local/bin/compute_ctl"] From 9491154eae566f19fc3b7814124eed0e13bb7baf Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 11 Feb 2025 21:23:17 +0000 Subject: [PATCH 439/583] build(deps): bump cryptography from 43.0.1 to 44.0.1 in the pip group (#10773) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Alexander Bayandin --- poetry.lock | 74 ++++++++++++++++++++++++++++------------------------- 1 file changed, 39 insertions(+), 35 deletions(-) diff --git a/poetry.lock b/poetry.lock index c471d3e69c..fd200159b9 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1030,52 +1030,56 @@ files = [ [[package]] name = "cryptography" -version = "43.0.1" +version = "44.0.1" description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." optional = false -python-versions = ">=3.7" +python-versions = "!=3.9.0,!=3.9.1,>=3.7" groups = ["main"] files = [ - {file = "cryptography-43.0.1-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:8385d98f6a3bf8bb2d65a73e17ed87a3ba84f6991c155691c51112075f9ffc5d"}, - {file = "cryptography-43.0.1-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:27e613d7077ac613e399270253259d9d53872aaf657471473ebfc9a52935c062"}, - {file = "cryptography-43.0.1-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:68aaecc4178e90719e95298515979814bda0cbada1256a4485414860bd7ab962"}, - {file = "cryptography-43.0.1-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:de41fd81a41e53267cb020bb3a7212861da53a7d39f863585d13ea11049cf277"}, - {file = "cryptography-43.0.1-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:f98bf604c82c416bc829e490c700ca1553eafdf2912a91e23a79d97d9801372a"}, - {file = "cryptography-43.0.1-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:61ec41068b7b74268fa86e3e9e12b9f0c21fcf65434571dbb13d954bceb08042"}, - {file = "cryptography-43.0.1-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:014f58110f53237ace6a408b5beb6c427b64e084eb451ef25a28308270086494"}, - {file = "cryptography-43.0.1-cp37-abi3-win32.whl", hash = "sha256:2bd51274dcd59f09dd952afb696bf9c61a7a49dfc764c04dd33ef7a6b502a1e2"}, - {file = "cryptography-43.0.1-cp37-abi3-win_amd64.whl", hash = "sha256:666ae11966643886c2987b3b721899d250855718d6d9ce41b521252a17985f4d"}, - {file = "cryptography-43.0.1-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:ac119bb76b9faa00f48128b7f5679e1d8d437365c5d26f1c2c3f0da4ce1b553d"}, - {file = "cryptography-43.0.1-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1bbcce1a551e262dfbafb6e6252f1ae36a248e615ca44ba302df077a846a8806"}, - {file = "cryptography-43.0.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:58d4e9129985185a06d849aa6df265bdd5a74ca6e1b736a77959b498e0505b85"}, - {file = "cryptography-43.0.1-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:d03a475165f3134f773d1388aeb19c2d25ba88b6a9733c5c590b9ff7bbfa2e0c"}, - {file = "cryptography-43.0.1-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:511f4273808ab590912a93ddb4e3914dfd8a388fed883361b02dea3791f292e1"}, - {file = 
"cryptography-43.0.1-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:80eda8b3e173f0f247f711eef62be51b599b5d425c429b5d4ca6a05e9e856baa"}, - {file = "cryptography-43.0.1-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:38926c50cff6f533f8a2dae3d7f19541432610d114a70808f0926d5aaa7121e4"}, - {file = "cryptography-43.0.1-cp39-abi3-win32.whl", hash = "sha256:a575913fb06e05e6b4b814d7f7468c2c660e8bb16d8d5a1faf9b33ccc569dd47"}, - {file = "cryptography-43.0.1-cp39-abi3-win_amd64.whl", hash = "sha256:d75601ad10b059ec832e78823b348bfa1a59f6b8d545db3a24fd44362a1564cb"}, - {file = "cryptography-43.0.1-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:ea25acb556320250756e53f9e20a4177515f012c9eaea17eb7587a8c4d8ae034"}, - {file = "cryptography-43.0.1-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:c1332724be35d23a854994ff0b66530119500b6053d0bd3363265f7e5e77288d"}, - {file = "cryptography-43.0.1-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:fba1007b3ef89946dbbb515aeeb41e30203b004f0b4b00e5e16078b518563289"}, - {file = "cryptography-43.0.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:5b43d1ea6b378b54a1dc99dd8a2b5be47658fe9a7ce0a58ff0b55f4b43ef2b84"}, - {file = "cryptography-43.0.1-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:88cce104c36870d70c49c7c8fd22885875d950d9ee6ab54df2745f83ba0dc365"}, - {file = "cryptography-43.0.1-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:9d3cdb25fa98afdd3d0892d132b8d7139e2c087da1712041f6b762e4f807cc96"}, - {file = "cryptography-43.0.1-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:e710bf40870f4db63c3d7d929aa9e09e4e7ee219e703f949ec4073b4294f6172"}, - {file = "cryptography-43.0.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:7c05650fe8023c5ed0d46793d4b7d7e6cd9c04e68eabe5b0aeea836e37bdcec2"}, - {file = "cryptography-43.0.1.tar.gz", hash = "sha256:203e92a75716d8cfb491dc47c79e17d0d9207ccffcbcb35f598fbe463ae3444d"}, + {file = "cryptography-44.0.1-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:bf688f615c29bfe9dfc44312ca470989279f0e94bb9f631f85e3459af8efc009"}, + {file = "cryptography-44.0.1-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dd7c7e2d71d908dc0f8d2027e1604102140d84b155e658c20e8ad1304317691f"}, + {file = "cryptography-44.0.1-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:887143b9ff6bad2b7570da75a7fe8bbf5f65276365ac259a5d2d5147a73775f2"}, + {file = "cryptography-44.0.1-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:322eb03ecc62784536bc173f1483e76747aafeb69c8728df48537eb431cd1911"}, + {file = "cryptography-44.0.1-cp37-abi3-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:21377472ca4ada2906bc313168c9dc7b1d7ca417b63c1c3011d0c74b7de9ae69"}, + {file = "cryptography-44.0.1-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:df978682c1504fc93b3209de21aeabf2375cb1571d4e61907b3e7a2540e83026"}, + {file = "cryptography-44.0.1-cp37-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:eb3889330f2a4a148abead555399ec9a32b13b7c8ba969b72d8e500eb7ef84cd"}, + {file = "cryptography-44.0.1-cp37-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:8e6a85a93d0642bd774460a86513c5d9d80b5c002ca9693e63f6e540f1815ed0"}, + {file = "cryptography-44.0.1-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:6f76fdd6fd048576a04c5210d53aa04ca34d2ed63336d4abd306d0cbe298fddf"}, + {file = "cryptography-44.0.1-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:6c8acf6f3d1f47acb2248ec3ea261171a671f3d9428e34ad0357148d492c7864"}, + {file = 
"cryptography-44.0.1-cp37-abi3-win32.whl", hash = "sha256:24979e9f2040c953a94bf3c6782e67795a4c260734e5264dceea65c8f4bae64a"}, + {file = "cryptography-44.0.1-cp37-abi3-win_amd64.whl", hash = "sha256:fd0ee90072861e276b0ff08bd627abec29e32a53b2be44e41dbcdf87cbee2b00"}, + {file = "cryptography-44.0.1-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:a2d8a7045e1ab9b9f803f0d9531ead85f90c5f2859e653b61497228b18452008"}, + {file = "cryptography-44.0.1-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b8272f257cf1cbd3f2e120f14c68bff2b6bdfcc157fafdee84a1b795efd72862"}, + {file = "cryptography-44.0.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1e8d181e90a777b63f3f0caa836844a1182f1f265687fac2115fcf245f5fbec3"}, + {file = "cryptography-44.0.1-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:436df4f203482f41aad60ed1813811ac4ab102765ecae7a2bbb1dbb66dcff5a7"}, + {file = "cryptography-44.0.1-cp39-abi3-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:4f422e8c6a28cf8b7f883eb790695d6d45b0c385a2583073f3cec434cc705e1a"}, + {file = "cryptography-44.0.1-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:72198e2b5925155497a5a3e8c216c7fb3e64c16ccee11f0e7da272fa93b35c4c"}, + {file = "cryptography-44.0.1-cp39-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:2a46a89ad3e6176223b632056f321bc7de36b9f9b93b2cc1cccf935a3849dc62"}, + {file = "cryptography-44.0.1-cp39-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:53f23339864b617a3dfc2b0ac8d5c432625c80014c25caac9082314e9de56f41"}, + {file = "cryptography-44.0.1-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:888fcc3fce0c888785a4876ca55f9f43787f4c5c1cc1e2e0da71ad481ff82c5b"}, + {file = "cryptography-44.0.1-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:00918d859aa4e57db8299607086f793fa7813ae2ff5a4637e318a25ef82730f7"}, + {file = "cryptography-44.0.1-cp39-abi3-win32.whl", hash = "sha256:9b336599e2cb77b1008cb2ac264b290803ec5e8e89d618a5e978ff5eb6f715d9"}, + {file = "cryptography-44.0.1-cp39-abi3-win_amd64.whl", hash = "sha256:e403f7f766ded778ecdb790da786b418a9f2394f36e8cc8b796cc056ab05f44f"}, + {file = "cryptography-44.0.1-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:1f9a92144fa0c877117e9748c74501bea842f93d21ee00b0cf922846d9d0b183"}, + {file = "cryptography-44.0.1-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:610a83540765a8d8ce0f351ce42e26e53e1f774a6efb71eb1b41eb01d01c3d12"}, + {file = "cryptography-44.0.1-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:5fed5cd6102bb4eb843e3315d2bf25fede494509bddadb81e03a859c1bc17b83"}, + {file = "cryptography-44.0.1-pp310-pypy310_pp73-manylinux_2_34_aarch64.whl", hash = "sha256:f4daefc971c2d1f82f03097dc6f216744a6cd2ac0f04c68fb935ea2ba2a0d420"}, + {file = "cryptography-44.0.1-pp310-pypy310_pp73-manylinux_2_34_x86_64.whl", hash = "sha256:94f99f2b943b354a5b6307d7e8d19f5c423a794462bde2bf310c770ba052b1c4"}, + {file = "cryptography-44.0.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:d9c5b9f698a83c8bd71e0f4d3f9f839ef244798e5ffe96febfa9714717db7af7"}, + {file = "cryptography-44.0.1.tar.gz", hash = "sha256:f51f5705ab27898afda1aaa430f34ad90dc117421057782022edf0600bec5f14"}, ] [package.dependencies] cffi = {version = ">=1.12", markers = "platform_python_implementation != \"PyPy\""} [package.extras] -docs = ["sphinx (>=5.3.0)", "sphinx-rtd-theme (>=1.1.1)"] -docstest = ["pyenchant (>=1.6.11)", "readme-renderer", "sphinxcontrib-spelling (>=4.0.1)"] -nox = ["nox"] -pep8test = ["check-sdist", "click", "mypy", "ruff"] 
-sdist = ["build"] +docs = ["sphinx (>=5.3.0)", "sphinx-rtd-theme (>=3.0.0)"] +docstest = ["pyenchant (>=3)", "readme-renderer (>=30.0)", "sphinxcontrib-spelling (>=7.3.1)"] +nox = ["nox (>=2024.4.15)", "nox[uv] (>=2024.3.2)"] +pep8test = ["check-sdist", "click (>=8.0.1)", "mypy (>=1.4)", "ruff (>=0.3.6)"] +sdist = ["build (>=1.0.0)"] ssh = ["bcrypt (>=3.1.5)"] -test = ["certifi", "cryptography-vectors (==43.0.1)", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-xdist"] +test = ["certifi (>=2024)", "cryptography-vectors (==44.0.1)", "pretend (>=0.7)", "pytest (>=7.4.0)", "pytest-benchmark (>=4.0)", "pytest-cov (>=2.10.1)", "pytest-xdist (>=3.5.0)"] test-randomorder = ["pytest-randomly"] [[package]] From 635b67508b51415d3f20bef05d0ef60d4736a190 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 12 Feb 2025 00:06:53 +0200 Subject: [PATCH 440/583] Split utils::http to separate crate (#10753) Avoids compiling the crate and its dependencies into binaries that don't need them. Shrinks the compute_ctl binary from about 31MB to 28MB in the release-line-debug-size-lto profile. --- Cargo.lock | 50 +++++++++++++++---- Cargo.toml | 2 + compute_tools/src/http/routes/failpoints.rs | 16 +++++- control_plane/Cargo.toml | 1 + control_plane/src/safekeeper.rs | 4 +- libs/http-utils/Cargo.toml | 37 ++++++++++++++ .../src/http => http-utils/src}/endpoint.rs | 8 +-- .../src/http => http-utils/src}/error.rs | 11 ++++ libs/http-utils/src/failpoints.rs | 50 +++++++++++++++++++ .../src/http => http-utils/src}/json.rs | 0 .../src/http/mod.rs => http-utils/src/lib.rs} | 4 ++ libs/{utils => http-utils}/src/pprof.rs | 0 .../src/http => http-utils/src}/request.rs | 0 libs/utils/Cargo.toml | 14 ------ libs/utils/src/auth.rs | 11 +--- libs/utils/src/failpoint_support.rs | 49 ------------------ libs/utils/src/lib.rs | 7 --- pageserver/Cargo.toml | 1 + pageserver/client/Cargo.toml | 1 + pageserver/client/src/mgmt_api.rs | 7 +-- pageserver/src/bin/pageserver.rs | 2 +- pageserver/src/http/routes.rs | 29 ++++++----- pageserver/src/tenant/mgr.rs | 4 +- .../src/tenant/timeline/detach_ancestor.rs | 3 +- proxy/Cargo.toml | 1 + proxy/src/http/health_server.rs | 8 +-- proxy/src/serverless/http_util.rs | 10 ++-- proxy/src/serverless/mod.rs | 2 +- proxy/src/serverless/sql_over_http.rs | 2 +- safekeeper/Cargo.toml | 1 + safekeeper/client/Cargo.toml | 1 + safekeeper/client/src/mgmt_api.rs | 2 +- safekeeper/src/http/mod.rs | 2 +- safekeeper/src/http/routes.rs | 21 ++++---- safekeeper/src/timeline.rs | 2 +- storage_controller/Cargo.toml | 1 + storage_controller/src/http.rs | 23 ++++----- storage_controller/src/main.rs | 2 +- storage_controller/src/peer_client.rs | 3 +- storage_controller/src/scheduler.rs | 3 +- storage_controller/src/service.rs | 2 +- workspace_hack/Cargo.toml | 3 +- 42 files changed, 238 insertions(+), 162 deletions(-) create mode 100644 libs/http-utils/Cargo.toml rename libs/{utils/src/http => http-utils/src}/endpoint.rs (99%) rename libs/{utils/src/http => http-utils/src}/error.rs (93%) create mode 100644 libs/http-utils/src/failpoints.rs rename libs/{utils/src/http => http-utils/src}/json.rs (100%) rename libs/{utils/src/http/mod.rs => http-utils/src/lib.rs} (82%) rename libs/{utils => http-utils}/src/pprof.rs (100%) rename libs/{utils/src/http => http-utils/src}/request.rs (100%) diff --git a/Cargo.lock b/Cargo.lock index 3f06a74c5e..30b7130bbf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1433,6 +1433,7 @@ dependencies = [ "comfy-table", "compute_api", "futures", + "http-utils", 
"humantime", "humantime-serde", "hyper 0.14.30", @@ -2757,6 +2758,38 @@ dependencies = [ "url", ] +[[package]] +name = "http-utils" +version = "0.1.0" +dependencies = [ + "anyhow", + "backtrace", + "bytes", + "fail", + "flate2", + "hyper 0.14.30", + "inferno 0.12.0", + "itertools 0.10.5", + "jemalloc_pprof", + "metrics", + "once_cell", + "pprof", + "regex", + "routerify", + "serde", + "serde_json", + "serde_path_to_error", + "thiserror 1.0.69", + "tokio", + "tokio-stream", + "tokio-util", + "tracing", + "url", + "utils", + "uuid", + "workspace_hack", +] + [[package]] name = "httparse" version = "1.8.0" @@ -4111,6 +4144,7 @@ dependencies = [ "futures", "hex", "hex-literal", + "http-utils", "humantime", "humantime-serde", "hyper 0.14.30", @@ -4211,6 +4245,7 @@ dependencies = [ "anyhow", "bytes", "futures", + "http-utils", "pageserver_api", "postgres", "reqwest", @@ -4917,6 +4952,7 @@ dependencies = [ "hostname", "http 1.1.0", "http-body-util", + "http-utils", "humantime", "humantime-serde", "hyper 0.14.30", @@ -5764,6 +5800,7 @@ dependencies = [ "futures", "hex", "http 1.1.0", + "http-utils", "humantime", "hyper 0.14.30", "itertools 0.10.5", @@ -5828,6 +5865,7 @@ dependencies = [ name = "safekeeper_client" version = "0.1.0" dependencies = [ + "http-utils", "reqwest", "safekeeper_api", "serde", @@ -6410,6 +6448,7 @@ dependencies = [ "fail", "futures", "hex", + "http-utils", "humantime", "hyper 0.14.30", "itertools 0.10.5", @@ -7574,48 +7613,38 @@ dependencies = [ "criterion", "diatomic-waker", "fail", - "flate2", "futures", "git-version", "hex", "hex-literal", "humantime", - "hyper 0.14.30", "inferno 0.12.0", - "itertools 0.10.5", - "jemalloc_pprof", "jsonwebtoken", "metrics", "nix 0.27.1", "once_cell", "pin-project-lite", "postgres_connection", - "pprof", "pq_proto", "rand 0.8.5", "regex", - "routerify", "scopeguard", "sentry", "serde", "serde_assert", "serde_json", - "serde_path_to_error", "serde_with", "signal-hook", "strum", "strum_macros", "thiserror 1.0.69", "tokio", - "tokio-stream", "tokio-tar", "tokio-util", "toml_edit", "tracing", "tracing-error", "tracing-subscriber", - "url", - "uuid", "walkdir", ] @@ -8210,6 +8239,7 @@ dependencies = [ "tracing-core", "tracing-log", "url", + "uuid", "zerocopy", "zeroize", "zstd", diff --git a/Cargo.toml b/Cargo.toml index 76b54ae1d8..7228623c6b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,6 +18,7 @@ members = [ "storage_scrubber", "workspace_hack", "libs/compute_api", + "libs/http-utils", "libs/pageserver_api", "libs/postgres_ffi", "libs/safekeeper_api", @@ -229,6 +230,7 @@ azure_storage_blobs = { git = "https://github.com/neondatabase/azure-sdk-for-rus ## Local libraries compute_api = { version = "0.1", path = "./libs/compute_api/" } consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" } +http-utils = { version = "0.1", path = "./libs/http-utils/" } metrics = { version = "0.1", path = "./libs/metrics/" } pageserver = { path = "./pageserver" } pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" } diff --git a/compute_tools/src/http/routes/failpoints.rs b/compute_tools/src/http/routes/failpoints.rs index 2ec4511676..836417d784 100644 --- a/compute_tools/src/http/routes/failpoints.rs +++ b/compute_tools/src/http/routes/failpoints.rs @@ -1,7 +1,21 @@ use axum::response::{IntoResponse, Response}; use http::StatusCode; +use serde::{Deserialize, Serialize}; use tracing::info; -use utils::failpoint_support::{apply_failpoint, ConfigureFailpointsRequest}; +use utils::failpoint_support::apply_failpoint; + +pub 
type ConfigureFailpointsRequest = Vec; + +/// Information for configuring a single fail point +#[derive(Debug, Serialize, Deserialize)] +pub struct FailpointConfig { + /// Name of the fail point + pub name: String, + /// List of actions to take, using the format described in `fail::cfg` + /// + /// We also support `actions = "exit"` to cause the fail point to immediately exit. + pub actions: String, +} use crate::http::{extract::Json, JsonResponse}; diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index f718102847..162c49ec7c 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -33,6 +33,7 @@ postgres_backend.workspace = true safekeeper_api.workspace = true postgres_connection.workspace = true storage_broker.workspace = true +http-utils.workspace = true utils.workspace = true whoami.workspace = true diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index f0c3722925..ce7751fb14 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -17,8 +17,10 @@ use camino::Utf8PathBuf; use postgres_connection::PgConnectionConfig; use reqwest::{IntoUrl, Method}; use thiserror::Error; + +use http_utils::error::HttpErrorBody; use utils::auth::{Claims, Scope}; -use utils::{http::error::HttpErrorBody, id::NodeId}; +use utils::id::NodeId; use crate::{ background_process, diff --git a/libs/http-utils/Cargo.toml b/libs/http-utils/Cargo.toml new file mode 100644 index 0000000000..d72e4bd012 --- /dev/null +++ b/libs/http-utils/Cargo.toml @@ -0,0 +1,37 @@ +[package] +name = "http-utils" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +[dependencies] +anyhow.workspace = true +backtrace.workspace = true +bytes.workspace = true +inferno.workspace = true +fail.workspace = true +flate2.workspace = true +hyper0.workspace = true +itertools.workspace = true +jemalloc_pprof.workspace = true +once_cell.workspace = true +pprof.workspace = true +regex.workspace = true +routerify.workspace = true +serde.workspace = true +serde_json.workspace = true +serde_path_to_error.workspace = true +thiserror.workspace = true +tracing.workspace = true +tokio.workspace = true +tokio-util.workspace = true +url.workspace = true +uuid.workspace = true + +# to use tokio channels as streams, this is faster to compile than async_stream +# why is it only here? no other crate should use it, streams are rarely needed. 
+tokio-stream = { version = "0.1.14" } + +metrics.workspace = true +utils.workspace = true +workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/libs/utils/src/http/endpoint.rs b/libs/http-utils/src/endpoint.rs similarity index 99% rename from libs/utils/src/http/endpoint.rs rename to libs/http-utils/src/endpoint.rs index 9f38373ca0..be97b341d1 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/http-utils/src/endpoint.rs @@ -1,7 +1,6 @@ -use crate::auth::{AuthError, Claims, SwappableJwtAuth}; -use crate::http::error::{api_error_handler, route_error_handler, ApiError}; -use crate::http::request::{get_query_param, parse_query_param}; +use crate::error::{api_error_handler, route_error_handler, ApiError}; use crate::pprof; +use crate::request::{get_query_param, parse_query_param}; use ::pprof::protos::Message as _; use ::pprof::ProfilerGuardBuilder; use anyhow::{anyhow, Context}; @@ -19,6 +18,7 @@ use tokio::sync::{mpsc, Mutex, Notify}; use tokio_stream::wrappers::ReceiverStream; use tokio_util::io::ReaderStream; use tracing::{debug, info, info_span, warn, Instrument}; +use utils::auth::{AuthError, Claims, SwappableJwtAuth}; use std::future::Future; use std::io::Write as _; @@ -718,9 +718,9 @@ pub fn check_permission_with( #[cfg(test)] mod tests { use super::*; - use futures::future::poll_fn; use hyper::service::Service; use routerify::RequestServiceBuilder; + use std::future::poll_fn; use std::net::{IpAddr, SocketAddr}; #[tokio::test] diff --git a/libs/utils/src/http/error.rs b/libs/http-utils/src/error.rs similarity index 93% rename from libs/utils/src/http/error.rs rename to libs/http-utils/src/error.rs index 02fc9e3b99..746305caec 100644 --- a/libs/utils/src/http/error.rs +++ b/libs/http-utils/src/error.rs @@ -5,6 +5,8 @@ use std::error::Error as StdError; use thiserror::Error; use tracing::{error, info, warn}; +use utils::auth::AuthError; + #[derive(Debug, Error)] pub enum ApiError { #[error("Bad request: {0:#?}")] @@ -96,6 +98,15 @@ impl ApiError { } } +impl From for ApiError { + fn from(_value: AuthError) -> Self { + // Don't pass on the value of the AuthError as a precautionary measure. + // Being intentionally vague in public error communication hurts debugability + // but it is more secure. + ApiError::Forbidden("JWT authentication error".to_string()) + } +} + #[derive(Serialize, Deserialize)] pub struct HttpErrorBody { pub msg: String, diff --git a/libs/http-utils/src/failpoints.rs b/libs/http-utils/src/failpoints.rs new file mode 100644 index 0000000000..8a1e0c8cf0 --- /dev/null +++ b/libs/http-utils/src/failpoints.rs @@ -0,0 +1,50 @@ +use crate::error::ApiError; +use crate::json::{json_request, json_response}; + +use hyper::{Body, Request, Response, StatusCode}; +use serde::{Deserialize, Serialize}; +use tokio_util::sync::CancellationToken; + +use utils::failpoint_support::apply_failpoint; + +pub type ConfigureFailpointsRequest = Vec; + +/// Information for configuring a single fail point +#[derive(Debug, Serialize, Deserialize)] +pub struct FailpointConfig { + /// Name of the fail point + pub name: String, + /// List of actions to take, using the format described in `fail::cfg` + /// + /// We also support `actions = "exit"` to cause the fail point to immediately exit. + pub actions: String, +} + +/// Configure failpoints through http. 
+pub async fn failpoints_handler( + mut request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + if !fail::has_failpoints() { + return Err(ApiError::BadRequest(anyhow::anyhow!( + "Cannot manage failpoints because neon was compiled without failpoints support" + ))); + } + + let failpoints: ConfigureFailpointsRequest = json_request(&mut request).await?; + for fp in failpoints { + tracing::info!("cfg failpoint: {} {}", fp.name, fp.actions); + + // We recognize one extra "action" that's not natively recognized + // by the failpoints crate: exit, to immediately kill the process + let cfg_result = apply_failpoint(&fp.name, &fp.actions); + + if let Err(err_msg) = cfg_result { + return Err(ApiError::BadRequest(anyhow::anyhow!( + "Failed to configure failpoints: {err_msg}" + ))); + } + } + + json_response(StatusCode::OK, ()) +} diff --git a/libs/utils/src/http/json.rs b/libs/http-utils/src/json.rs similarity index 100% rename from libs/utils/src/http/json.rs rename to libs/http-utils/src/json.rs diff --git a/libs/utils/src/http/mod.rs b/libs/http-utils/src/lib.rs similarity index 82% rename from libs/utils/src/http/mod.rs rename to libs/http-utils/src/lib.rs index 74ed6bb5b2..ae6a27aaa8 100644 --- a/libs/utils/src/http/mod.rs +++ b/libs/http-utils/src/lib.rs @@ -1,8 +1,12 @@ pub mod endpoint; pub mod error; +pub mod failpoints; pub mod json; +pub mod pprof; pub mod request; +extern crate hyper0 as hyper; + /// Current fast way to apply simple http routing in various Neon binaries. /// Re-exported for sake of uniform approach, that could be later replaced with better alternatives, if needed. pub use routerify::{ext::RequestExt, RouterBuilder, RouterService}; diff --git a/libs/utils/src/pprof.rs b/libs/http-utils/src/pprof.rs similarity index 100% rename from libs/utils/src/pprof.rs rename to libs/http-utils/src/pprof.rs diff --git a/libs/utils/src/http/request.rs b/libs/http-utils/src/request.rs similarity index 100% rename from libs/utils/src/http/request.rs rename to libs/http-utils/src/request.rs diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index edb451a02c..0f10300959 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -21,23 +21,17 @@ bytes.workspace = true camino.workspace = true chrono.workspace = true diatomic-waker.workspace = true -flate2.workspace = true git-version.workspace = true hex = { workspace = true, features = ["serde"] } humantime.workspace = true -hyper0 = { workspace = true, features = ["full"] } inferno.workspace = true -itertools.workspace = true fail.workspace = true futures = { workspace = true } -jemalloc_pprof.workspace = true jsonwebtoken.workspace = true nix.workspace = true once_cell.workspace = true pin-project-lite.workspace = true -pprof.workspace = true regex.workspace = true -routerify.workspace = true serde.workspace = true serde_with.workspace = true serde_json.workspace = true @@ -54,8 +48,6 @@ rand.workspace = true scopeguard.workspace = true strum.workspace = true strum_macros.workspace = true -url.workspace = true -uuid.workspace = true walkdir.workspace = true pq_proto.workspace = true @@ -64,12 +56,6 @@ metrics.workspace = true const_format.workspace = true -# to use tokio channels as streams, this is faster to compile than async_stream -# why is it only here? no other crate should use it, streams are rarely needed. 
-tokio-stream = { version = "0.1.14" } - -serde_path_to_error.workspace = true - [dev-dependencies] byteorder.workspace = true bytes.workspace = true diff --git a/libs/utils/src/auth.rs b/libs/utils/src/auth.rs index f7acc61ac1..4bfd0ab055 100644 --- a/libs/utils/src/auth.rs +++ b/libs/utils/src/auth.rs @@ -10,7 +10,7 @@ use jsonwebtoken::{ }; use serde::{Deserialize, Serialize}; -use crate::{http::error::ApiError, id::TenantId}; +use crate::id::TenantId; /// Algorithm to use. We require EdDSA. const STORAGE_TOKEN_ALGORITHM: Algorithm = Algorithm::EdDSA; @@ -90,15 +90,6 @@ impl Display for AuthError { } } -impl From for ApiError { - fn from(_value: AuthError) -> Self { - // Don't pass on the value of the AuthError as a precautionary measure. - // Being intentionally vague in public error communication hurts debugability - // but it is more secure. - ApiError::Forbidden("JWT authentication error".to_string()) - } -} - pub struct JwtAuth { decoding_keys: Vec, validation: Validation, diff --git a/libs/utils/src/failpoint_support.rs b/libs/utils/src/failpoint_support.rs index 272c6ebb26..fc998ad9a9 100644 --- a/libs/utils/src/failpoint_support.rs +++ b/libs/utils/src/failpoint_support.rs @@ -1,13 +1,6 @@ //! Failpoint support code shared between pageserver and safekeepers. -use crate::http::{ - error::ApiError, - json::{json_request, json_response}, -}; -use hyper::{Body, Request, Response, StatusCode}; -use serde::{Deserialize, Serialize}; use tokio_util::sync::CancellationToken; -use tracing::*; /// Declare a failpoint that can use to `pause` failpoint action. /// We don't want to block the executor thread, hence, spawn_blocking + await. @@ -184,45 +177,3 @@ fn exit_failpoint() { tracing::info!("Exit requested by failpoint"); std::process::exit(1); } - -pub type ConfigureFailpointsRequest = Vec; - -/// Information for configuring a single fail point -#[derive(Debug, Serialize, Deserialize)] -pub struct FailpointConfig { - /// Name of the fail point - pub name: String, - /// List of actions to take, using the format described in `fail::cfg` - /// - /// We also support `actions = "exit"` to cause the fail point to immediately exit. - pub actions: String, -} - -/// Configure failpoints through http. -pub async fn failpoints_handler( - mut request: Request, - _cancel: CancellationToken, -) -> Result, ApiError> { - if !fail::has_failpoints() { - return Err(ApiError::BadRequest(anyhow::anyhow!( - "Cannot manage failpoints because neon was compiled without failpoints support" - ))); - } - - let failpoints: ConfigureFailpointsRequest = json_request(&mut request).await?; - for fp in failpoints { - info!("cfg failpoint: {} {}", fp.name, fp.actions); - - // We recognize one extra "action" that's not natively recognized - // by the failpoints crate: exit, to immediately kill the process - let cfg_result = apply_failpoint(&fp.name, &fp.actions); - - if let Err(err_msg) = cfg_result { - return Err(ApiError::BadRequest(anyhow::anyhow!( - "Failed to configure failpoints: {err_msg}" - ))); - } - } - - json_response(StatusCode::OK, ()) -} diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 1fb18e9e9a..820ff2d5ea 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -2,8 +2,6 @@ //! between other crates in this repository. 
#![deny(clippy::undocumented_unsafe_blocks)] -extern crate hyper0 as hyper; - pub mod backoff; /// `Lsn` type implements common tasks on Log Sequence Numbers @@ -33,9 +31,6 @@ pub mod shard; mod hex; pub use hex::Hex; -// http endpoint utils -pub mod http; - // definition of the Generation type for pageserver attachment APIs pub mod generation; @@ -96,8 +91,6 @@ pub mod circuit_breaker; pub mod try_rcu; -pub mod pprof; - pub mod guard_arc_swap; // Re-export used in macro. Avoids adding git-version as dep in target crates. diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 6e4eaa0efd..41ac3b69b8 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -79,6 +79,7 @@ pq_proto.workspace = true remote_storage.workspace = true storage_broker.workspace = true tenant_size_model.workspace = true +http-utils.workspace = true utils.workspace = true workspace_hack.workspace = true reqwest.workspace = true diff --git a/pageserver/client/Cargo.toml b/pageserver/client/Cargo.toml index f582d307a7..db77a395e0 100644 --- a/pageserver/client/Cargo.toml +++ b/pageserver/client/Cargo.toml @@ -11,6 +11,7 @@ testing = [ "pageserver_api/testing" ] pageserver_api.workspace = true thiserror.workspace = true reqwest = { workspace = true, features = [ "stream" ] } +http-utils.workspace = true utils.workspace = true serde.workspace = true workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 0359bfcd0b..da7ec5abce 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -1,11 +1,12 @@ use std::{collections::HashMap, error::Error as _}; use bytes::Bytes; -use detach_ancestor::AncestorDetached; -use pageserver_api::{models::*, shard::TenantShardId}; use reqwest::{IntoUrl, Method, StatusCode}; + +use detach_ancestor::AncestorDetached; +use http_utils::error::HttpErrorBody; +use pageserver_api::{models::*, shard::TenantShardId}; use utils::{ - http::error::HttpErrorBody, id::{TenantId, TimelineId}, lsn::Lsn, }; diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 5764728505..fa098e9364 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -592,7 +592,7 @@ fn start_pageserver( let router = http::make_router(router_state, launch_ts, http_auth.clone())? .build() .map_err(|err| anyhow!(err))?; - let service = utils::http::RouterService::new(router).unwrap(); + let service = http_utils::RouterService::new(router).unwrap(); let server = hyper0::Server::from_tcp(http_listener)? 
.serve(service) .with_graceful_shutdown({ diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 94f7510a4a..1d5edaa571 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -13,6 +13,12 @@ use enumset::EnumSet; use futures::future::join_all; use futures::StreamExt; use futures::TryFutureExt; +use http_utils::endpoint::{ + profile_cpu_handler, profile_heap_handler, prometheus_metrics_handler, request_span, +}; +use http_utils::failpoints::failpoints_handler; +use http_utils::request::must_parse_query_param; +use http_utils::request::{get_request_param, must_get_query_param, parse_query_param}; use humantime::format_rfc3339; use hyper::header; use hyper::StatusCode; @@ -60,13 +66,6 @@ use tokio::time::Instant; use tokio_util::io::StreamReader; use tokio_util::sync::CancellationToken; use tracing::*; -use utils::auth::JwtAuth; -use utils::failpoint_support::failpoints_handler; -use utils::http::endpoint::{ - profile_cpu_handler, profile_heap_handler, prometheus_metrics_handler, request_span, -}; -use utils::http::request::must_parse_query_param; -use utils::http::request::{get_request_param, must_get_query_param, parse_query_param}; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; @@ -104,6 +103,13 @@ use crate::tenant::OffloadedTimeline; use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError}; use crate::DEFAULT_PG_VERSION; use crate::{disk_usage_eviction_task, tenant}; +use http_utils::{ + endpoint::{self, attach_openapi_ui, auth_middleware, check_permission_with}, + error::{ApiError, HttpErrorBody}, + json::{json_request, json_request_maybe, json_response}, + request::parse_request_param, + RequestExt, RouterBuilder, +}; use pageserver_api::models::{ StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest, TimelineGcRequest, TimelineInfo, @@ -111,13 +117,6 @@ use pageserver_api::models::{ use utils::{ auth::SwappableJwtAuth, generation::Generation, - http::{ - endpoint::{self, attach_openapi_ui, auth_middleware, check_permission_with}, - error::{ApiError, HttpErrorBody}, - json::{json_request, json_request_maybe, json_response}, - request::parse_request_param, - RequestExt, RouterBuilder, - }, id::{TenantId, TimelineId}, lsn::Lsn, }; @@ -561,7 +560,7 @@ async fn reload_auth_validation_keys_handler( let key_path = config.auth_validation_public_key_path.as_ref().unwrap(); info!("Reloading public key(s) for verifying JWT tokens from {key_path:?}"); - match JwtAuth::from_key_path(key_path) { + match utils::auth::JwtAuth::from_key_path(key_path) { Ok(new_auth) => { shared_auth.swap(new_auth); json_response(StatusCode::OK, ()) diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index dfa89a765c..22ee560dbf 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -2816,8 +2816,8 @@ where } use { - crate::tenant::gc_result::GcResult, pageserver_api::models::TimelineGcRequest, - utils::http::error::ApiError, + crate::tenant::gc_result::GcResult, http_utils::error::ApiError, + pageserver_api::models::TimelineGcRequest, }; #[cfg(test)] diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index b6347d1219..e0084d3eef 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -14,11 +14,12 @@ use crate::{ virtual_file::{MaybeFatalIo, VirtualFile}, }; use anyhow::Context; +use http_utils::error::ApiError; use 
pageserver_api::{models::detach_ancestor::AncestorDetached, shard::ShardIdentity}; use tokio::sync::Semaphore; use tokio_util::sync::CancellationToken; use tracing::Instrument; -use utils::{completion, generation::Generation, http::error::ApiError, id::TimelineId, lsn::Lsn}; +use utils::{completion, generation::Generation, id::TimelineId, lsn::Lsn}; #[derive(Debug, thiserror::Error)] pub(crate) enum Error { diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index d7880ea7b9..3aa6ac3a76 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -37,6 +37,7 @@ hex.workspace = true hmac.workspace = true hostname.workspace = true http.workspace = true +http-utils.workspace = true humantime.workspace = true humantime-serde.workspace = true hyper0.workspace = true diff --git a/proxy/src/http/health_server.rs b/proxy/src/http/health_server.rs index 6ca091feb7..141f319567 100644 --- a/proxy/src/http/health_server.rs +++ b/proxy/src/http/health_server.rs @@ -3,16 +3,16 @@ use std::net::TcpListener; use std::sync::{Arc, Mutex}; use anyhow::{anyhow, bail}; +use http_utils::endpoint::{self, request_span}; +use http_utils::error::ApiError; +use http_utils::json::json_response; +use http_utils::{RouterBuilder, RouterService}; use hyper0::header::CONTENT_TYPE; use hyper0::{Body, Request, Response, StatusCode}; use measured::text::BufferedTextEncoder; use measured::MetricGroup; use metrics::NeonMetrics; use tracing::{info, info_span}; -use utils::http::endpoint::{self, request_span}; -use utils::http::error::ApiError; -use utils::http::json::json_response; -use utils::http::{RouterBuilder, RouterService}; use crate::ext::{LockExt, TaskExt}; use crate::jemalloc; diff --git a/proxy/src/serverless/http_util.rs b/proxy/src/serverless/http_util.rs index d5c948777c..95a28663a5 100644 --- a/proxy/src/serverless/http_util.rs +++ b/proxy/src/serverless/http_util.rs @@ -6,8 +6,8 @@ use bytes::Bytes; use http::{Response, StatusCode}; use http_body_util::combinators::BoxBody; use http_body_util::{BodyExt, Full}; +use http_utils::error::ApiError; use serde::Serialize; -use utils::http::error::ApiError; /// Like [`ApiError::into_response`] pub(crate) fn api_error_into_response(this: ApiError) -> Response> { @@ -59,14 +59,14 @@ pub(crate) fn api_error_into_response(this: ApiError) -> Response Response> { Response::builder() .status(status) @@ -92,7 +92,7 @@ impl HttpErrorBody { } } -/// Same as [`utils::http::json::json_response`] +/// Same as [`http_utils::json::json_response`] pub(crate) fn json_response( status: StatusCode, data: T, diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs index 6888772362..8289500159 100644 --- a/proxy/src/serverless/mod.rs +++ b/proxy/src/serverless/mod.rs @@ -28,6 +28,7 @@ use futures::TryFutureExt; use http::{Method, Response, StatusCode}; use http_body_util::combinators::BoxBody; use http_body_util::{BodyExt, Empty}; +use http_utils::error::ApiError; use hyper::body::Incoming; use hyper_util::rt::TokioExecutor; use hyper_util::server::conn::auto::Builder; @@ -41,7 +42,6 @@ use tokio_rustls::TlsAcceptor; use tokio_util::sync::CancellationToken; use tokio_util::task::TaskTracker; use tracing::{info, warn, Instrument}; -use utils::http::error::ApiError; use crate::cancellation::CancellationHandler; use crate::config::{ProxyConfig, ProxyProtocolV2}; diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 2eee3b7165..5982fe225d 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -8,6 +8,7 
@@ use http::header::AUTHORIZATION; use http::Method; use http_body_util::combinators::BoxBody; use http_body_util::{BodyExt, Full}; +use http_utils::error::ApiError; use hyper::body::Incoming; use hyper::http::{HeaderName, HeaderValue}; use hyper::{header, HeaderMap, Request, Response, StatusCode}; @@ -23,7 +24,6 @@ use tokio_util::sync::CancellationToken; use tracing::{debug, error, info}; use typed_json::json; use url::Url; -use utils::http::error::ApiError; use uuid::Uuid; use super::backend::{LocalProxyConnError, PoolingBackend}; diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 0eb511f1cc..d12ebc1030 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -63,6 +63,7 @@ sha2.workspace = true sd-notify.workspace = true storage_broker.workspace = true tokio-stream.workspace = true +http-utils.workspace = true utils.workspace = true wal_decoder.workspace = true env_logger.workspace = true diff --git a/safekeeper/client/Cargo.toml b/safekeeper/client/Cargo.toml index 6c5a52de3a..0b660aaf32 100644 --- a/safekeeper/client/Cargo.toml +++ b/safekeeper/client/Cargo.toml @@ -5,6 +5,7 @@ edition.workspace = true license.workspace = true [dependencies] +http-utils.workspace = true safekeeper_api.workspace = true thiserror.workspace = true reqwest = { workspace = true, features = [ "stream" ] } diff --git a/safekeeper/client/src/mgmt_api.rs b/safekeeper/client/src/mgmt_api.rs index f65bfaa6d5..df049f3eba 100644 --- a/safekeeper/client/src/mgmt_api.rs +++ b/safekeeper/client/src/mgmt_api.rs @@ -3,11 +3,11 @@ //! Partially copied from pageserver client; some parts might be better to be //! united. +use http_utils::error::HttpErrorBody; use reqwest::{IntoUrl, Method, StatusCode}; use safekeeper_api::models::{TimelineCreateRequest, TimelineStatus}; use std::error::Error as _; use utils::{ - http::error::HttpErrorBody, id::{NodeId, TenantId, TimelineId}, logging::SecretString, }; diff --git a/safekeeper/src/http/mod.rs b/safekeeper/src/http/mod.rs index d82a713f8a..6e160b7a5e 100644 --- a/safekeeper/src/http/mod.rs +++ b/safekeeper/src/http/mod.rs @@ -14,7 +14,7 @@ pub async fn task_main( let router = make_router(conf, global_timelines) .build() .map_err(|err| anyhow::anyhow!(err))?; - let service = utils::http::RouterService::new(router).unwrap(); + let service = http_utils::RouterService::new(router).unwrap(); let server = hyper::Server::from_tcp(http_listener)?; server.serve(service).await?; Ok(()) // unreachable diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 7ec08ecf9a..a64bf1ddd8 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -1,3 +1,4 @@ +use http_utils::failpoints::failpoints_handler; use hyper::{Body, Request, Response, StatusCode}; use safekeeper_api::models; use safekeeper_api::models::AcceptorStateStatus; @@ -17,25 +18,23 @@ use tokio::task; use tokio_stream::wrappers::ReceiverStream; use tokio_util::sync::CancellationToken; use tracing::{info_span, Instrument}; -use utils::failpoint_support::failpoints_handler; -use utils::http::endpoint::{ + +use http_utils::endpoint::{ profile_cpu_handler, profile_heap_handler, prometheus_metrics_handler, request_span, - ChannelWriter, }; -use utils::http::request::parse_query_param; +use http_utils::{ + endpoint::{self, auth_middleware, check_permission_with, ChannelWriter}, + error::ApiError, + json::{json_request, json_response}, + request::{ensure_no_body, parse_query_param, parse_request_param}, + RequestExt, RouterBuilder, +}; use 
postgres_ffi::WAL_SEGMENT_SIZE; use safekeeper_api::models::{SkTimelineInfo, TimelineCopyRequest}; use safekeeper_api::models::{TimelineCreateRequest, TimelineTermBumpRequest}; use utils::{ auth::SwappableJwtAuth, - http::{ - endpoint::{self, auth_middleware, check_permission_with}, - error::ApiError, - json::{json_request, json_response}, - request::{ensure_no_body, parse_request_param}, - RequestExt, RouterBuilder, - }, id::{TenantId, TenantTimelineId, TimelineId}, lsn::Lsn, }; diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 3702a096e0..4341f13824 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -14,6 +14,7 @@ use tokio_util::sync::CancellationToken; use utils::id::TenantId; use utils::sync::gate::Gate; +use http_utils::error::ApiError; use std::cmp::max; use std::ops::{Deref, DerefMut}; use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; @@ -22,7 +23,6 @@ use std::time::Duration; use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}; use tokio::{sync::watch, time::Instant}; use tracing::*; -use utils::http::error::ApiError; use utils::{ id::{NodeId, TenantTimelineId}, lsn::Lsn, diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index 63f43cdf62..91d8098cb9 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -55,6 +55,7 @@ diesel-async = { version = "0.5.2", features = ["postgres", "bb8", "async-connec diesel_migrations = { version = "2.2.0" } scoped-futures = "0.1.4" +http-utils = { path = "../libs/http-utils/" } utils = { path = "../libs/utils/" } metrics = { path = "../libs/metrics/" } control_plane = { path = "../control_plane" } diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index ac890b008f..1a56116cad 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -8,6 +8,14 @@ use crate::reconciler::ReconcileError; use crate::service::{LeadershipStatus, Service, RECONCILE_TIMEOUT, STARTUP_RECONCILE_TIMEOUT}; use anyhow::Context; use futures::Future; +use http_utils::{ + endpoint::{self, auth_middleware, check_permission_with, request_span}, + error::ApiError, + failpoints::failpoints_handler, + json::{json_request, json_response}, + request::{must_get_query_param, parse_query_param, parse_request_param}, + RequestExt, RouterBuilder, +}; use hyper::header::CONTENT_TYPE; use hyper::{Body, Request, Response}; use hyper::{StatusCode, Uri}; @@ -29,20 +37,7 @@ use std::sync::Arc; use std::time::{Duration, Instant}; use tokio_util::sync::CancellationToken; use utils::auth::{Scope, SwappableJwtAuth}; -use utils::failpoint_support::failpoints_handler; -use utils::http::endpoint::{auth_middleware, check_permission_with, request_span}; -use utils::http::request::{must_get_query_param, parse_query_param, parse_request_param}; -use utils::id::{TenantId, TimelineId}; - -use utils::{ - http::{ - endpoint::{self}, - error::ApiError, - json::{json_request, json_response}, - RequestExt, RouterBuilder, - }, - id::NodeId, -}; +use utils::id::{NodeId, TenantId, TimelineId}; use pageserver_api::controller_api::{ NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, TenantPolicyRequest, diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 659c088d51..07279a67ff 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -320,7 +320,7 @@ async fn async_main() -> anyhow::Result<()> { let router = make_router(service.clone(), auth, build_info) .build() .map_err(|err| 
anyhow!(err))?; - let router_service = utils::http::RouterService::new(router).unwrap(); + let router_service = http_utils::RouterService::new(router).unwrap(); // Start HTTP server let server_shutdown = CancellationToken::new(); diff --git a/storage_controller/src/peer_client.rs b/storage_controller/src/peer_client.rs index ee4eb55294..1a15bae365 100644 --- a/storage_controller/src/peer_client.rs +++ b/storage_controller/src/peer_client.rs @@ -6,9 +6,10 @@ use std::error::Error as _; use std::time::Duration; use tokio_util::sync::CancellationToken; +use http_utils::error::HttpErrorBody; use hyper::Uri; use reqwest::{StatusCode, Url}; -use utils::{backoff, http::error::HttpErrorBody}; +use utils::backoff; #[derive(Debug, Clone)] pub(crate) struct PeerClient { diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs index f9e72862ae..106a7b2699 100644 --- a/storage_controller/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -1,9 +1,10 @@ use crate::{metrics::NodeLabelGroup, node::Node, tenant_shard::TenantShard}; +use http_utils::error::ApiError; use itertools::Itertools; use pageserver_api::{controller_api::AvailabilityZone, models::PageserverUtilization}; use serde::Serialize; use std::{collections::HashMap, fmt::Debug}; -use utils::{http::error::ApiError, id::NodeId}; +use utils::id::NodeId; /// Scenarios in which we cannot find a suitable location for a tenant shard #[derive(thiserror::Error, Debug)] diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 4028cd7023..6829663a4c 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -61,6 +61,7 @@ use reqwest::StatusCode; use tracing::{instrument, Instrument}; use crate::pageserver_client::PageserverClient; +use http_utils::error::ApiError; use pageserver_api::{ models::{ self, LocationConfig, LocationConfigListResponse, LocationConfigMode, @@ -81,7 +82,6 @@ use utils::{ completion::Barrier, failpoint_support, generation::Generation, - http::error::ApiError, id::{NodeId, TenantId, TimelineId}, pausable_failpoint, sync::gate::Gate, diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 2c65401154..1b7c376560 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -42,7 +42,7 @@ half = { version = "2", default-features = false, features = ["num-traits"] } hashbrown = { version = "0.14", features = ["raw"] } hex = { version = "0.4", features = ["serde"] } hmac = { version = "0.12", default-features = false, features = ["reset"] } -hyper-582f2526e08bb6a0 = { package = "hyper", version = "0.14", features = ["full"] } +hyper-582f2526e08bb6a0 = { package = "hyper", version = "0.14", features = ["client", "http1", "http2", "runtime", "server", "stream"] } hyper-dff4ba8e3ae991db = { package = "hyper", version = "1", features = ["full"] } hyper-util = { version = "0.1", features = ["client-legacy", "http1", "http2", "server", "service"] } indexmap-dff4ba8e3ae991db = { package = "indexmap", version = "1", default-features = false, features = ["std"] } @@ -94,6 +94,7 @@ tracing = { version = "0.1", features = ["log"] } tracing-core = { version = "0.1" } tracing-log = { version = "0.2" } url = { version = "2", features = ["serde"] } +uuid = { version = "1", features = ["serde", "v4", "v7"] } zerocopy = { version = "0.7", features = ["derive", "simd"] } zeroize = { version = "1", features = ["derive", "serde"] } zstd = { version = "0.13" } From 6c83ac3fd28c0c5482244f3b9f010de849d267da Mon Sep 17 00:00:00 
2001 From: Erik Grinaker Date: Tue, 11 Feb 2025 23:08:46 +0100 Subject: [PATCH 441/583] pageserver: do all L0 compaction before image compaction (#10744) ## Problem Image compaction can starve out L0 compaction if a tenant has several timelines with L0 debt. Touches #10694. Requires #10740. ## Summary of changes * Add an initial L0 compaction pass, in order of L0 count. * Add a tenant option `compaction_l0_first` to control the L0 pass (disabled by default). * Add `CompactFlags::OnlyL0Compaction` to run an L0-only compaction pass. * Clean up the compaction iteration logic. A later PR will use separate semaphores for the L0 and image compaction passes to avoid cross-tenant L0 starvation. That PR will also make image compaction yield if _any_ of the tenant's timelines have pending L0 compaction to further avoid starvation. --- control_plane/src/pageserver.rs | 5 + libs/pageserver_api/src/config.rs | 4 + libs/pageserver_api/src/models.rs | 6 + pageserver/src/tenant.rs | 293 ++++++++++-------- pageserver/src/tenant/config.rs | 11 + pageserver/src/tenant/timeline.rs | 1 + pageserver/src/tenant/timeline/compaction.rs | 12 +- .../regress/test_attach_tenant_config.py | 3 +- 8 files changed, 209 insertions(+), 126 deletions(-) diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index dd37bfc407..e15b30236e 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -357,6 +357,11 @@ impl PageServerNode { .map(serde_json::from_str) .transpose() .context("Failed to parse 'compaction_algorithm' json")?, + compaction_l0_first: settings + .remove("compaction_l0_first") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'compaction_l0_first' as a bool")?, l0_flush_delay_threshold: settings .remove("l0_flush_delay_threshold") .map(|x| x.parse::()) diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index a00d7838fd..9bc1b6d359 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -264,6 +264,8 @@ pub struct TenantConfigToml { /// size exceeds `compaction_upper_limit * checkpoint_distance`. pub compaction_upper_limit: usize, pub compaction_algorithm: crate::models::CompactionAlgorithmSettings, + /// If true, compact down L0 across all tenant timelines before doing regular compaction. + pub compaction_l0_first: bool, /// Level0 delta layer threshold at which to delay layer flushes for compaction backpressure, /// such that they take 2x as long, and start waiting for layer flushes during ephemeral layer /// rolls. This helps compaction keep up with WAL ingestion, and avoids read amplification @@ -545,6 +547,7 @@ pub mod tenant_conf_defaults { // most of our pageservers. Compaction ~50 layers requires about 2GB memory (could be reduced later by optimizing L0 hole // calculation to avoid loading all keys into the memory). So with this config, we can get a maximum peak compaction usage of 18GB. 
pub const DEFAULT_COMPACTION_UPPER_LIMIT: usize = 50; + pub const DEFAULT_COMPACTION_L0_FIRST: bool = false; pub const DEFAULT_COMPACTION_ALGORITHM: crate::models::CompactionAlgorithm = crate::models::CompactionAlgorithm::Legacy; @@ -594,6 +597,7 @@ impl Default for TenantConfigToml { compaction_algorithm: crate::models::CompactionAlgorithmSettings { kind: DEFAULT_COMPACTION_ALGORITHM, }, + compaction_l0_first: DEFAULT_COMPACTION_L0_FIRST, l0_flush_delay_threshold: None, l0_flush_stall_threshold: None, l0_flush_wait_upload: DEFAULT_L0_FLUSH_WAIT_UPLOAD, diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 19beb37ab3..335ac4cec5 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -464,6 +464,8 @@ pub struct TenantConfigPatch { #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub compaction_algorithm: FieldPatch, #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub compaction_l0_first: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub l0_flush_delay_threshold: FieldPatch, #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub l0_flush_stall_threshold: FieldPatch, @@ -529,6 +531,7 @@ pub struct TenantConfig { pub compaction_upper_limit: Option, // defer parsing compaction_algorithm, like eviction_policy pub compaction_algorithm: Option, + pub compaction_l0_first: Option, pub l0_flush_delay_threshold: Option, pub l0_flush_stall_threshold: Option, pub l0_flush_wait_upload: Option, @@ -567,6 +570,7 @@ impl TenantConfig { mut compaction_threshold, mut compaction_upper_limit, mut compaction_algorithm, + mut compaction_l0_first, mut l0_flush_delay_threshold, mut l0_flush_stall_threshold, mut l0_flush_wait_upload, @@ -606,6 +610,7 @@ impl TenantConfig { .compaction_upper_limit .apply(&mut compaction_upper_limit); patch.compaction_algorithm.apply(&mut compaction_algorithm); + patch.compaction_l0_first.apply(&mut compaction_l0_first); patch .l0_flush_delay_threshold .apply(&mut l0_flush_delay_threshold); @@ -669,6 +674,7 @@ impl TenantConfig { compaction_threshold, compaction_upper_limit, compaction_algorithm, + compaction_l0_first, l0_flush_delay_threshold, l0_flush_stall_threshold, l0_flush_wait_upload, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 91df47b250..5f17e8cb60 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -52,7 +52,9 @@ use timeline::compaction::GcCompactionQueue; use timeline::import_pgdata; use timeline::offload::offload_timeline; use timeline::offload::OffloadError; +use timeline::CompactFlags; use timeline::CompactOptions; +use timeline::CompactionError; use timeline::ShutdownMode; use tokio::io::BufReader; use tokio::sync::watch; @@ -2898,150 +2900,181 @@ impl Tenant { .await } - /// Perform one compaction iteration. - /// This function is periodically called by compactor task. - /// Also it can be explicitly requested per timeline through page server - /// api's 'compact' command. + /// Performs one compaction iteration. Called periodically from the compaction loop. Returns + /// whether another compaction iteration is needed (if we yield), or /// - /// Returns whether we have pending compaction task. + /// Compaction can also be explicitly requested for a timeline via the HTTP API. 
async fn compaction_iteration( self: &Arc, cancel: &CancellationToken, ctx: &RequestContext, - ) -> Result { - // Don't start doing work during shutdown, or when broken, we do not need those in the logs + ) -> Result { + // Don't compact inactive tenants. if !self.is_active() { return Ok(CompactionOutcome::Done); } - { - let conf = self.tenant_conf.load(); - - // Note that compaction usually requires deletions, but we don't respect - // may_delete_layers_hint here: that is because tenants in AttachedMulti - // should proceed with compaction even if they can't do deletion, to avoid - // accumulating dangerously deep stacks of L0 layers. Deletions will be - // enqueued inside RemoteTimelineClient, and executed layer if/when we transition - // to AttachedSingle state. - if !conf.location.may_upload_layers_hint() { - info!("Skipping compaction in location state {:?}", conf.location); - return Ok(CompactionOutcome::Done); - } - } - - // Scan through the hashmap and collect a list of all the timelines, - // while holding the lock. Then drop the lock and actually perform the - // compactions. We don't want to block everything else while the - // compaction runs. - let timelines_to_compact_or_offload; - { - let timelines = self.timelines.lock().unwrap(); - timelines_to_compact_or_offload = timelines - .iter() - .filter_map(|(timeline_id, timeline)| { - let (is_active, (can_offload, _)) = - (timeline.is_active(), timeline.can_offload()); - let has_no_unoffloaded_children = { - !timelines - .iter() - .any(|(_id, tl)| tl.get_ancestor_timeline_id() == Some(*timeline_id)) - }; - let config_allows_offload = self.conf.timeline_offloading - || self - .tenant_conf - .load() - .tenant_conf - .timeline_offloading - .unwrap_or_default(); - let can_offload = - can_offload && has_no_unoffloaded_children && config_allows_offload; - if (is_active, can_offload) == (false, false) { - None - } else { - Some((*timeline_id, timeline.clone(), (is_active, can_offload))) - } - }) - .collect::>(); - drop(timelines); - } - - // Before doing any I/O work, check our circuit breaker - if self.compaction_circuit_breaker.lock().unwrap().is_broken() { - info!("Skipping compaction due to previous failures"); + // Don't compact tenants that can't upload layers. We don't check `may_delete_layers_hint`, + // since we need to compact L0 even in AttachedMulti to bound read amplification. + let location = self.tenant_conf.load().location; + if !location.may_upload_layers_hint() { + info!("skipping compaction in location state {location:?}"); return Ok(CompactionOutcome::Done); } - let mut has_pending_task = false; + // Don't compact if the circuit breaker is tripped. + if self.compaction_circuit_breaker.lock().unwrap().is_broken() { + info!("skipping compaction due to previous failures"); + return Ok(CompactionOutcome::Done); + } + + // Collect all timelines to compact, along with offload instructions and L0 counts. 
+ let mut compact: Vec> = Vec::new(); + let mut offload: HashSet = HashSet::new(); + let mut l0_counts: HashMap = HashMap::new(); - for (timeline_id, timeline, (can_compact, can_offload)) in &timelines_to_compact_or_offload { - // pending_task_left == None: cannot compact, maybe still pending tasks - // pending_task_left == Some(Pending): compaction task left - // pending_task_left == Some(Done): no compaction task left - let pending_task_left = if *can_compact { - let compaction_outcome = timeline - .compact(cancel, EnumSet::empty(), ctx) - .instrument(info_span!("compact_timeline", %timeline_id)) - .await - .inspect_err(|e| match e { - timeline::CompactionError::ShuttingDown => (), - timeline::CompactionError::Offload(_) => { - // Failures to offload timelines do not trip the circuit breaker, because - // they do not do lots of writes the way compaction itself does: it is cheap - // to retry, and it would be bad to stop all compaction because of an issue with offloading. - } - timeline::CompactionError::Other(e) => { - self.compaction_circuit_breaker - .lock() - .unwrap() - .fail(&CIRCUIT_BREAKERS_BROKEN, e); - } - })?; - if let CompactionOutcome::Pending = compaction_outcome { - Some(CompactionOutcome::Pending) - } else { - let queue = { - let guard = self.scheduled_compaction_tasks.lock().unwrap(); - guard.get(timeline_id).cloned() - }; - if let Some(queue) = queue { - let outcome = queue - .iteration(cancel, ctx, &self.gc_block, timeline) - .await?; - Some(outcome) - } else { - Some(CompactionOutcome::Done) - } + let offload_enabled = self.get_timeline_offloading_enabled(); + let timelines = self.timelines.lock().unwrap(); + for (&timeline_id, timeline) in timelines.iter() { + // Skip inactive timelines. + if !timeline.is_active() { + continue; } - } else { - None - }; - has_pending_task |= pending_task_left == Some(CompactionOutcome::Pending); - if pending_task_left == Some(CompactionOutcome::Done) && *can_offload { - pausable_failpoint!("before-timeline-auto-offload"); - match offload_timeline(self, timeline) - .instrument(info_span!("offload_timeline", %timeline_id)) - .await - { - Err(OffloadError::NotArchived) => { - // Ignore this, we likely raced with unarchival - Ok(()) - } - other => other, - }?; + + // Schedule the timeline for compaction. + compact.push(timeline.clone()); + + // Schedule the timeline for offloading if eligible. + let can_offload = offload_enabled + && timeline.can_offload().0 + && !timelines + .iter() + .any(|(_, tli)| tli.get_ancestor_timeline_id() == Some(timeline_id)); + if can_offload { + offload.insert(timeline_id); + } + } + } // release timelines lock + + for timeline in &compact { + // Collect L0 counts. Can't await while holding lock above. + if let Ok(lm) = timeline.layers.read().await.layer_map() { + l0_counts.insert(timeline.timeline_id, lm.level0_deltas().len()); } } + // Pass 1: L0 compaction across all timelines, in order of L0 count. We prioritize this to + // bound read amplification. + // + // TODO: this may spin on one or more ingest-heavy timelines, starving out image/GC + // compaction and offloading. We leave that as a potential problem to solve later. Consider + // splitting L0 and image/GC compaction to separate background jobs. 
+ if self.get_compaction_l0_first() { + let compaction_threshold = self.get_compaction_threshold(); + let compact_l0 = compact + .iter() + .map(|tli| (tli, l0_counts.get(&tli.timeline_id).copied().unwrap_or(0))) + .filter(|&(_, l0)| l0 >= compaction_threshold) + .sorted_by_key(|&(_, l0)| l0) + .rev() + .map(|(tli, _)| tli.clone()) + .collect_vec(); + + let mut has_pending_l0 = false; + for timeline in compact_l0 { + let outcome = timeline + .compact(cancel, CompactFlags::OnlyL0Compaction.into(), ctx) + .instrument(info_span!("compact_timeline", timeline_id = %timeline.timeline_id)) + .await + .inspect_err(|err| self.maybe_trip_compaction_breaker(err))?; + has_pending_l0 |= outcome == CompactionOutcome::Pending; + } + if has_pending_l0 { + return Ok(CompactionOutcome::Pending); // do another pass + } + } + + // Pass 2: image compaction and timeline offloading. If any timelines have accumulated + // more L0 layers, they may also be compacted here. + // + // NB: image compaction may yield if there is pending L0 compaction. + // + // TODO: it will only yield if there is pending L0 compaction on the same timeline. If a + // different timeline needs compaction, it won't. It should check `l0_compaction_trigger`. + // We leave this for a later PR. + // + // TODO: consider ordering timelines by some priority, e.g. time since last full compaction, + // amount of L1 delta debt or garbage, offload-eligible timelines first, etc. + let mut has_pending = false; + for timeline in compact { + if !timeline.is_active() { + continue; + } + + let mut outcome = timeline + .compact(cancel, EnumSet::default(), ctx) + .instrument(info_span!("compact_timeline", timeline_id = %timeline.timeline_id)) + .await + .inspect_err(|err| self.maybe_trip_compaction_breaker(err))?; + + // If we're done compacting, check the scheduled GC compaction queue for more work. + if outcome == CompactionOutcome::Done { + let queue = self + .scheduled_compaction_tasks + .lock() + .unwrap() + .get(&timeline.timeline_id) + .cloned(); + if let Some(queue) = queue { + outcome = queue + .iteration(cancel, ctx, &self.gc_block, &timeline) + .await?; + } + } + + // If we're done compacting, offload the timeline if requested. + if outcome == CompactionOutcome::Done && offload.contains(&timeline.timeline_id) { + pausable_failpoint!("before-timeline-auto-offload"); + offload_timeline(self, &timeline) + .instrument(info_span!("offload_timeline", timeline_id = %timeline.timeline_id)) + .await + .or_else(|err| match err { + // Ignore this, we likely raced with unarchival. + OffloadError::NotArchived => Ok(()), + err => Err(err), + })?; + } + + has_pending |= outcome == CompactionOutcome::Pending; + } + + // Success! Untrip the breaker if necessary. self.compaction_circuit_breaker .lock() .unwrap() .success(&CIRCUIT_BREAKERS_UNBROKEN); - Ok(if has_pending_task { - CompactionOutcome::Pending - } else { - CompactionOutcome::Done - }) + match has_pending { + true => Ok(CompactionOutcome::Pending), + false => Ok(CompactionOutcome::Done), + } + } + + /// Trips the compaction circuit breaker if appropriate. + pub(crate) fn maybe_trip_compaction_breaker(&self, err: &CompactionError) { + match err { + CompactionError::ShuttingDown => (), + // Offload failures don't trip the circuit breaker, since they're cheap to retry and + // shouldn't block compaction. 
+ CompactionError::Offload(_) => {} + CompactionError::Other(err) => { + self.compaction_circuit_breaker + .lock() + .unwrap() + .fail(&CIRCUIT_BREAKERS_BROKEN, err); + } + } } /// Cancel scheduled compaction tasks @@ -3819,6 +3852,13 @@ impl Tenant { .unwrap_or(self.conf.default_tenant_conf.compaction_upper_limit) } + pub fn get_compaction_l0_first(&self) -> bool { + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); + tenant_conf + .compaction_l0_first + .unwrap_or(self.conf.default_tenant_conf.compaction_l0_first) + } + pub fn get_gc_horizon(&self) -> u64 { let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf @@ -3873,6 +3913,16 @@ impl Tenant { .unwrap_or(self.conf.default_tenant_conf.lsn_lease_length) } + pub fn get_timeline_offloading_enabled(&self) -> bool { + if self.conf.timeline_offloading { + return true; + } + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); + tenant_conf + .timeline_offloading + .unwrap_or(self.conf.default_tenant_conf.timeline_offloading) + } + /// Generate an up-to-date TenantManifest based on the state of this Tenant. fn build_tenant_manifest(&self) -> TenantManifest { let timelines_offloaded = self.timelines_offloaded.lock().unwrap(); @@ -5478,6 +5528,7 @@ pub(crate) mod harness { compaction_threshold: Some(tenant_conf.compaction_threshold), compaction_upper_limit: Some(tenant_conf.compaction_upper_limit), compaction_algorithm: Some(tenant_conf.compaction_algorithm), + compaction_l0_first: Some(tenant_conf.compaction_l0_first), l0_flush_delay_threshold: tenant_conf.l0_flush_delay_threshold, l0_flush_stall_threshold: tenant_conf.l0_flush_stall_threshold, l0_flush_wait_upload: Some(tenant_conf.l0_flush_wait_upload), diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index ad13e9e8e4..cff33afffd 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -285,6 +285,10 @@ pub struct TenantConfOpt { #[serde(default)] pub compaction_algorithm: Option, + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default)] + pub compaction_l0_first: Option, + #[serde(skip_serializing_if = "Option::is_none")] #[serde(default)] pub l0_flush_delay_threshold: Option, @@ -416,6 +420,9 @@ impl TenantConfOpt { .as_ref() .unwrap_or(&global_conf.compaction_algorithm) .clone(), + compaction_l0_first: self + .compaction_l0_first + .unwrap_or(global_conf.compaction_l0_first), l0_flush_delay_threshold: self .l0_flush_delay_threshold .or(global_conf.l0_flush_delay_threshold), @@ -493,6 +500,7 @@ impl TenantConfOpt { mut compaction_threshold, mut compaction_upper_limit, mut compaction_algorithm, + mut compaction_l0_first, mut l0_flush_delay_threshold, mut l0_flush_stall_threshold, mut l0_flush_wait_upload, @@ -538,6 +546,7 @@ impl TenantConfOpt { .compaction_upper_limit .apply(&mut compaction_upper_limit); patch.compaction_algorithm.apply(&mut compaction_algorithm); + patch.compaction_l0_first.apply(&mut compaction_l0_first); patch .l0_flush_delay_threshold .apply(&mut l0_flush_delay_threshold); @@ -619,6 +628,7 @@ impl TenantConfOpt { compaction_threshold, compaction_upper_limit, compaction_algorithm, + compaction_l0_first, l0_flush_delay_threshold, l0_flush_stall_threshold, l0_flush_wait_upload, @@ -681,6 +691,7 @@ impl From for models::TenantConfig { compaction_period: value.compaction_period.map(humantime), compaction_threshold: value.compaction_threshold, compaction_upper_limit: value.compaction_upper_limit, + compaction_l0_first: value.compaction_l0_first, 
l0_flush_delay_threshold: value.l0_flush_delay_threshold, l0_flush_stall_threshold: value.l0_flush_stall_threshold, l0_flush_wait_upload: value.l0_flush_wait_upload, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 1fbcd6bceb..43811b77f8 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -876,6 +876,7 @@ pub(crate) enum CompactFlags { ForceRepartition, ForceImageLayerCreation, ForceL0Compaction, + OnlyL0Compaction, EnhancedGcBottomMostCompaction, DryRun, } diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 4cbc344669..18b5afd04b 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -609,6 +609,8 @@ pub enum CompactionOutcome { /// Still has pending layers to be compacted after this round. Ideally, the scheduler /// should immediately schedule another compaction. Pending, + // TODO: add a skipped variant for cases where we didn't attempt compaction. These currently + // return Done, which can lead the caller to believe there is no compaction debt. } impl Timeline { @@ -715,10 +717,12 @@ impl Timeline { l0_compaction_outcome }; - if let CompactionOutcome::Pending = l0_compaction_outcome { - // Yield and do not do any other kind of compaction. True means - // that we have pending L0 compaction tasks and the compaction scheduler - // will prioritize compacting this tenant/timeline again. + if options.flags.contains(CompactFlags::OnlyL0Compaction) { + return Ok(l0_compaction_outcome); + } + + if l0_compaction_outcome == CompactionOutcome::Pending { + // Yield if we have pending L0 compaction. The scheduler will do another pass. info!("skipping image layer generation and shard ancestor compaction due to L0 compaction did not include all layers."); return Ok(CompactionOutcome::Pending); } diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 7acc64377e..34d56c5cb1 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -140,9 +140,10 @@ def test_fully_custom_config(positive_env: NeonEnv): "compaction_period": "1h", "compaction_threshold": 13, "compaction_upper_limit": 100, + "compaction_l0_first": False, "l0_flush_delay_threshold": 25, "l0_flush_stall_threshold": 42, - "l0_flush_wait_upload": True, + "l0_flush_wait_upload": False, "compaction_target_size": 1048576, "checkpoint_distance": 10000, "checkpoint_timeout": "13m", From 71c30e52faa761a306d3bfac7f24a8d76b944955 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Wed, 12 Feb 2025 00:43:58 +0100 Subject: [PATCH 442/583] pageserver: properly yield for L0 compaction (#10769) ## Problem When image compaction yields for L0 compaction, it may not immediately schedule L0 compaction, because it just goes on to compact the next pending timeline. Touches #10694. Requires #10744. ## Summary of changes Extend `CompactionOutcome` with `YieldForL0` and `Skipped` variants, and immediately schedule an L0 compaction pass in the `YieldForL0` case. 
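For readers skimming the diff, here is a minimal sketch (not the pageserver source) of how the new outcome variants are meant to be consumed by the compaction loop. The enum mirrors the variants added below in `pageserver/src/tenant/timeline/compaction.rs`; `schedule_next_pass` and `notify_l0` are illustrative names, with `notify_l0` standing in for `tenant.l0_compaction_trigger.notify_one()`.

```rust
// Sketch only: how the compaction loop reacts to each outcome variant.
#[derive(Clone, Copy)]
enum CompactionOutcome {
    /// No further work for now.
    Done,
    /// More compaction work remains; schedule another iteration soon.
    Pending,
    /// Some timeline needs L0 compaction; schedule an immediate L0-only pass.
    YieldForL0,
    /// Compaction was not attempted (tenant/timeline inactive, cancelled, ...).
    Skipped,
}

fn schedule_next_pass(outcome: CompactionOutcome, notify_l0: impl Fn()) {
    match outcome {
        CompactionOutcome::Done | CompactionOutcome::Skipped => {}
        // Both cases wake the compaction loop right away instead of waiting
        // for the next periodic run, so L0 debt is worked off promptly.
        CompactionOutcome::Pending | CompactionOutcome::YieldForL0 => notify_l0(),
    }
}

fn main() {
    schedule_next_pass(CompactionOutcome::YieldForL0, || {
        println!("notify l0_compaction_trigger");
    });
    schedule_next_pass(CompactionOutcome::Done, || {});
}
```

In the patch itself, both `Pending` and `YieldForL0` notify `l0_compaction_trigger` (see the `tasks.rs` hunk below), while `Skipped` lets callers distinguish "no compaction debt" from "compaction was not attempted".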
--- pageserver/src/tenant.rs | 27 +++++++++++++++----- pageserver/src/tenant/tasks.rs | 13 +++++----- pageserver/src/tenant/timeline.rs | 6 ++--- pageserver/src/tenant/timeline/compaction.rs | 26 +++++++++++-------- 4 files changed, 44 insertions(+), 28 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 5f17e8cb60..8520ae62e8 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -2901,7 +2901,8 @@ impl Tenant { } /// Performs one compaction iteration. Called periodically from the compaction loop. Returns - /// whether another compaction iteration is needed (if we yield), or + /// whether another compaction is needed, if we still have pending work or if we yield for + /// immediate L0 compaction. /// /// Compaction can also be explicitly requested for a timeline via the HTTP API. async fn compaction_iteration( @@ -2911,7 +2912,7 @@ impl Tenant { ) -> Result { // Don't compact inactive tenants. if !self.is_active() { - return Ok(CompactionOutcome::Done); + return Ok(CompactionOutcome::Skipped); } // Don't compact tenants that can't upload layers. We don't check `may_delete_layers_hint`, @@ -2919,13 +2920,13 @@ impl Tenant { let location = self.tenant_conf.load().location; if !location.may_upload_layers_hint() { info!("skipping compaction in location state {location:?}"); - return Ok(CompactionOutcome::Done); + return Ok(CompactionOutcome::Skipped); } // Don't compact if the circuit breaker is tripped. if self.compaction_circuit_breaker.lock().unwrap().is_broken() { info!("skipping compaction due to previous failures"); - return Ok(CompactionOutcome::Done); + return Ok(CompactionOutcome::Skipped); } // Collect all timelines to compact, along with offload instructions and L0 counts. @@ -2988,10 +2989,15 @@ impl Tenant { .instrument(info_span!("compact_timeline", timeline_id = %timeline.timeline_id)) .await .inspect_err(|err| self.maybe_trip_compaction_breaker(err))?; - has_pending_l0 |= outcome == CompactionOutcome::Pending; + match outcome { + CompactionOutcome::Done => {} + CompactionOutcome::Skipped => {} + CompactionOutcome::Pending => has_pending_l0 = true, + CompactionOutcome::YieldForL0 => has_pending_l0 = true, + } } if has_pending_l0 { - return Ok(CompactionOutcome::Pending); // do another pass + return Ok(CompactionOutcome::YieldForL0); // do another pass } } @@ -3046,7 +3052,14 @@ impl Tenant { })?; } - has_pending |= outcome == CompactionOutcome::Pending; + match outcome { + CompactionOutcome::Done => {} + CompactionOutcome::Skipped => {} + CompactionOutcome::Pending => has_pending = true, + // This mostly makes sense when the L0-only pass above is enabled, since there's + // otherwise no guarantee that we'll start with the timeline that has high L0. + CompactionOutcome::YieldForL0 => return Ok(CompactionOutcome::YieldForL0), + } } // Success! Untrip the breaker if necessary. diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 5df7351216..1fa01e4229 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -268,13 +268,12 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { match output { Ok(outcome) => { error_run = 0; - // If there's more compaction work pending, reschedule immediately. This isn't - // necessarily L0 compaction, but that's fine for now. - // - // TODO: differentiate between L0 compaction and other compaction. The former needs - // to be responsive, the latter doesn't. 
- if outcome == CompactionOutcome::Pending { - tenant.l0_compaction_trigger.notify_one(); + // If there's more compaction work, L0 or not, schedule an immediate run. + match outcome { + CompactionOutcome::Done => {} + CompactionOutcome::Skipped => {} + CompactionOutcome::YieldForL0 => tenant.l0_compaction_trigger.notify_one(), + CompactionOutcome::Pending => tenant.l0_compaction_trigger.notify_one(), } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 43811b77f8..afa8efa453 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1815,8 +1815,8 @@ impl Timeline { // compaction task goes over it's period (20s) which is quite often in production. let (_guard, _permit) = tokio::select! { tuple = prepare => { tuple }, - _ = self.cancel.cancelled() => return Ok(CompactionOutcome::Done), - _ = cancel.cancelled() => return Ok(CompactionOutcome::Done), + _ = self.cancel.cancelled() => return Ok(CompactionOutcome::Skipped), + _ = cancel.cancelled() => return Ok(CompactionOutcome::Skipped), }; let last_record_lsn = self.get_last_record_lsn(); @@ -1824,7 +1824,7 @@ impl Timeline { // Last record Lsn could be zero in case the timeline was just created if !last_record_lsn.is_valid() { warn!("Skipping compaction for potentially just initialized timeline, it has invalid last record lsn: {last_record_lsn}"); - return Ok(CompactionOutcome::Done); + return Ok(CompactionOutcome::Skipped); } let result = match self.get_compaction_algorithm_settings().kind { diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 18b5afd04b..aea92d34e0 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -609,8 +609,11 @@ pub enum CompactionOutcome { /// Still has pending layers to be compacted after this round. Ideally, the scheduler /// should immediately schedule another compaction. Pending, - // TODO: add a skipped variant for cases where we didn't attempt compaction. These currently - // return Done, which can lead the caller to believe there is no compaction debt. + /// A timeline needs L0 compaction. Yield and schedule an immediate L0 compaction pass (only + /// guaranteed when `compaction_l0_first` is enabled). + YieldForL0, + /// Compaction was skipped, because the timeline is ineligible for compaction. + Skipped, } impl Timeline { @@ -703,10 +706,11 @@ impl Timeline { .unwrap_or(self.get_disk_consistent_lsn()); l0_min_lsn.max(self.get_ancestor_lsn()) }; + // 1. L0 Compact - let l0_compaction_outcome = { + let l0_outcome = { let timer = self.metrics.compact_time_histo.start_timer(); - let l0_compaction_outcome = self + let l0_outcome = self .compact_level0( target_file_size, options.flags.contains(CompactFlags::ForceL0Compaction), @@ -714,17 +718,17 @@ impl Timeline { ) .await?; timer.stop_and_record(); - l0_compaction_outcome + l0_outcome }; if options.flags.contains(CompactFlags::OnlyL0Compaction) { - return Ok(l0_compaction_outcome); + return Ok(l0_outcome); } - if l0_compaction_outcome == CompactionOutcome::Pending { - // Yield if we have pending L0 compaction. The scheduler will do another pass. - info!("skipping image layer generation and shard ancestor compaction due to L0 compaction did not include all layers."); - return Ok(CompactionOutcome::Pending); + // Yield if we have pending L0 compaction. The scheduler will do another pass. 
+ if l0_outcome == CompactionOutcome::Pending || l0_outcome == CompactionOutcome::YieldForL0 { + info!("image/ancestor compaction yielding for L0 compaction"); + return Ok(CompactionOutcome::YieldForL0); } if l0_l1_boundary_lsn < self.partitioning.read().1 { @@ -788,7 +792,7 @@ impl Timeline { if let LastImageLayerCreationStatus::Incomplete { .. } = outcome { // Yield and do not do any other kind of compaction. info!("skipping shard ancestor compaction due to pending image layer generation tasks (preempted by L0 compaction)."); - return Ok(CompactionOutcome::Pending); + return Ok(CompactionOutcome::YieldForL0); } } Err(err) => { From 2c4c6e63307cf2bbba5a62d5cb757a32e939f496 Mon Sep 17 00:00:00 2001 From: Mikhail Kot Date: Wed, 12 Feb 2025 10:52:26 +0000 Subject: [PATCH 443/583] fix(neon): Add tests clarifying postgres sigabrt on pageserver unavailability (#10666) Resolves: https://github.com/neondatabase/neon/issues/5734 When we query pageserver and it's unavailable after some retries, postgres sigabrt's. This is intended behavior so I've added tests checking it --- .../regress/test_pageserver_restart.py | 41 +++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py index 835ccbd5d4..21cb780c06 100644 --- a/test_runner/regress/test_pageserver_restart.py +++ b/test_runner/regress/test_pageserver_restart.py @@ -3,6 +3,7 @@ from __future__ import annotations import random from contextlib import closing +import psycopg2.errors as pgerr import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder @@ -226,3 +227,43 @@ def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder, shard_count: int | N # so instead, do a fast shutdown for this one test. # See https://github.com/neondatabase/neon/issues/8709 env.stop(immediate=True) + + +def test_pageserver_lost_and_transaction_aborted(neon_env_builder: NeonEnvBuilder): + """ + If pageserver is unavailable during a transaction abort and target relation is + not present in cache, we abort the transaction in ABORT state which triggers a sigabrt. + This is _expected_ behavour + """ + env = neon_env_builder.init_start() + endpoint = env.endpoints.create_start("main", config_lines=["neon.relsize_hash_size=0"]) + with closing(endpoint.connect()) as conn, conn.cursor() as cur: + cur.execute("CREATE DATABASE test") + with ( + pytest.raises((pgerr.InterfaceError, pgerr.InternalError)), + endpoint.connect(dbname="test") as conn, + conn.cursor() as cur, + ): + cur.execute("create table t(b box)") + env.pageserver.stop() + cur.execute("create index ti on t using gist(b)") + + +def test_pageserver_lost_and_transaction_committed(neon_env_builder: NeonEnvBuilder): + """ + If pageserver is unavailable during a transaction commit and target relation is + not present in cache, we abort the transaction in COMMIT state which triggers a sigabrt. 
+ This is _expected_ behavour + """ + env = neon_env_builder.init_start() + endpoint = env.endpoints.create_start("main", config_lines=["neon.relsize_hash_size=0"]) + with closing(endpoint.connect()) as conn, conn.cursor() as cur: + cur.execute("CREATE DATABASE test") + with ( + pytest.raises((pgerr.InterfaceError, pgerr.InternalError)), + endpoint.connect(dbname="test") as conn, + conn.cursor() as cur, + ): + cur.execute("create table t(t boolean)") + env.pageserver.stop() + cur.execute("drop table t") From 9537829ccd5cc3c1a2fedabb14a0de0f0d1b590f Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 12 Feb 2025 13:43:23 +0200 Subject: [PATCH 444/583] fast_import: Make CPU & memory size configurable (#10709) The old values assumed that you have at least about 18 GB of RAM available (shared_buffers=10GB and maintenance_work_mem=8GB). That's a lot when testing locally. Make it configurable, and make the default assumption much smaller: 256 MB. This is nice for local testing, but it's also in preparation for starting to use VMs to run these jobs. When launched in a VM, the control plane can set these env variables according to the max size of the VM. Also change the formula for how RAM is distributed: use 10% of RAM for shared_buffers, and 70% for maintenance_work_mem. That leaves a good amount for misc. other stuff and the OS. A very large shared_buffers setting won't typically help with bulk loading. It won't help with the network and I/O of processing all the tables, unless maybe if the whole database fits in shared buffers, but even then it's not much faster than using local disk. Bulk loading is all sequential I/O. It also won't help much with index creation, which is also sequential I/O. A large maintenance_work_mem can be quite useful, however, so that's where we put most of the RAM. --- compute_tools/src/bin/fast_import.rs | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/compute_tools/src/bin/fast_import.rs b/compute_tools/src/bin/fast_import.rs index 1398f443dd..27cf1c2317 100644 --- a/compute_tools/src/bin/fast_import.rs +++ b/compute_tools/src/bin/fast_import.rs @@ -60,6 +60,16 @@ struct Args { pg_lib_dir: Utf8PathBuf, #[clap(long)] pg_port: Option, // port to run postgres on, 5432 is default + + /// Number of CPUs in the system. This is used to configure # of + /// parallel worker processes, for index creation. + #[clap(long, env = "NEON_IMPORTER_NUM_CPUS")] + num_cpus: Option, + + /// Amount of RAM in the system. This is used to configure shared_buffers + /// and maintenance_work_mem. + #[clap(long, env = "NEON_IMPORTER_MEMORY_MB")] + memory_mb: Option, } #[serde_with::serde_as] @@ -202,7 +212,16 @@ pub(crate) async fn main() -> anyhow::Result<()> { .await .context("initdb")?; - let nproc = num_cpus::get(); + // If the caller didn't specify CPU / RAM to use for sizing, default to + // number of CPUs in the system, and pretty arbitrarily, 256 MB of RAM. + let nproc = args.num_cpus.unwrap_or_else(num_cpus::get); + let memory_mb = args.memory_mb.unwrap_or(256); + + // Somewhat arbitrarily, use 10 % of memory for shared buffer cache, 70% for + // maintenance_work_mem (i.e. for sorting during index creation), and leave the rest + // available for misc other stuff that PostgreSQL uses memory for. 
+ let shared_buffers_mb = ((memory_mb as f32) * 0.10) as usize; + let maintenance_work_mem_mb = ((memory_mb as f32) * 0.70) as usize; // // Launch postgres process // .arg(&pgdata_dir) .args(["-p", &format!("{pg_port}")]) .args(["-c", "wal_level=minimal"]) - .args(["-c", "shared_buffers=10GB"]) + .args(["-c", &format!("shared_buffers={shared_buffers_mb}MB")]) .args(["-c", "max_wal_senders=0"]) .args(["-c", "fsync=off"]) .args(["-c", "full_page_writes=off"]) .args(["-c", "synchronous_commit=off"]) - .args(["-c", "maintenance_work_mem=8388608"]) + .args([ + "-c", + &format!("maintenance_work_mem={maintenance_work_mem_mb}MB"), + ]) .args(["-c", &format!("max_parallel_maintenance_workers={nproc}")]) .args(["-c", &format!("max_parallel_workers={nproc}")]) .args(["-c", &format!("max_parallel_workers_per_gather={nproc}")])
From 9989d8bfaed44f039052f2d1df9bb623d8b740d1 Mon Sep 17 00:00:00 2001
From: John Spray
Date: Wed, 12 Feb 2025 12:35:29 +0000
Subject: [PATCH 445/583] tests: make Workload more deterministic (#10741)

## Problem

Previously, Workload was reconfiguring the compute before each run of writes, which was meant to be a no-op when nothing changed, but was actually writing extra data due to an issue being fixed in https://github.com/neondatabase/neon/pull/10696. The row counts in tests were too low in some cases; these tests were only working because of those extra writes that shouldn't have been happening, and moreover were relying on checkpoints happening.

## Summary of changes

- Only reconfigure compute if the attached pageserver actually changed. If pageserver is set to None, that means the controller is managing everything, so never reconfigure compute.
- Update tests that wrote too few rows.
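Since the behavioural change is easy to miss among the fixture plumbing, here is the new guard in isolation as a minimal, runnable sketch. Assumptions: `EndpointStub` and `WorkloadSketch` are simplified stand-ins for the real `Endpoint` and `Workload` classes under `test_runner/fixtures/`; the real method also handles endpoint creation and start, which is omitted here.

```python
# Minimal sketch of the guard this patch adds to Workload.endpoint():
# only reconfigure the compute when the attached pageserver actually changes.


class EndpointStub:
    def reconfigure(self, pageserver_id=None):
        print(f"reconfigure -> pageserver {pageserver_id}")


class WorkloadSketch:
    def __init__(self, endpoint: EndpointStub):
        self._endpoint = endpoint
        self._configured_pageserver = None

    def endpoint(self, pageserver_id=None) -> EndpointStub:
        # Skip the reconfigure entirely if nothing changed. In
        # controller-managed setups pageserver_id stays None here,
        # so no reconfigure is ever issued from this path.
        if self._configured_pageserver != pageserver_id:
            self._configured_pageserver = pageserver_id
            self._endpoint.reconfigure(pageserver_id=pageserver_id)
        return self._endpoint


w = WorkloadSketch(EndpointStub())
w.endpoint(pageserver_id=1)  # issues one reconfigure
w.endpoint(pageserver_id=1)  # no-op: the attached pageserver did not change
```

The related change in `compute_reconfigure_listener` (first hunk below) also waits for the reconfigure to complete before returning 200, to satisfy the semantics of the notify-attach API.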
--------- Co-authored-by: Alexey Kondratov --- test_runner/fixtures/compute_reconfigure.py | 5 ++++- test_runner/fixtures/neon_cli.py | 3 +++ test_runner/fixtures/neon_fixtures.py | 4 ++++ test_runner/fixtures/workload.py | 9 ++++++++- test_runner/regress/test_compaction.py | 4 +--- test_runner/regress/test_sharding.py | 13 ++++++++++--- test_runner/regress/test_storage_scrubber.py | 7 +++++-- 7 files changed, 35 insertions(+), 10 deletions(-) diff --git a/test_runner/fixtures/compute_reconfigure.py b/test_runner/fixtures/compute_reconfigure.py index 33f01f80fb..425abef935 100644 --- a/test_runner/fixtures/compute_reconfigure.py +++ b/test_runner/fixtures/compute_reconfigure.py @@ -69,7 +69,10 @@ def compute_reconfigure_listener(make_httpserver: HTTPServer): # This causes the endpoint to query storage controller for its location, which # is redundant since we already have it here, but this avoids extending the # neon_local CLI to take full lists of locations - reconfigure_threads.submit(lambda workload=workload: workload.reconfigure()) # type: ignore[misc] + fut = reconfigure_threads.submit(lambda workload=workload: workload.reconfigure()) # type: ignore[misc] + + # To satisfy semantics of notify-attach API, we must wait for the change to be applied before returning 200 + fut.result() return Response(status=200) diff --git a/test_runner/fixtures/neon_cli.py b/test_runner/fixtures/neon_cli.py index 6a016d2621..97a5a36814 100644 --- a/test_runner/fixtures/neon_cli.py +++ b/test_runner/fixtures/neon_cli.py @@ -487,6 +487,7 @@ class NeonLocalCli(AbstractNeonCli): lsn: Lsn | None = None, pageserver_id: int | None = None, allow_multiple=False, + update_catalog: bool = False, ) -> subprocess.CompletedProcess[str]: args = [ "endpoint", @@ -514,6 +515,8 @@ class NeonLocalCli(AbstractNeonCli): args.extend(["--pageserver-id", str(pageserver_id)]) if allow_multiple: args.extend(["--allow-multiple"]) + if update_catalog: + args.extend(["--update-catalog"]) res = self.raw_cli(args) res.check_returncode() diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 41e9952b8a..2fa82754ef 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3849,6 +3849,7 @@ class Endpoint(PgProtocol, LogUtils): config_lines: list[str] | None = None, pageserver_id: int | None = None, allow_multiple: bool = False, + update_catalog: bool = False, ) -> Self: """ Create a new Postgres endpoint. 
@@ -3874,6 +3875,7 @@ class Endpoint(PgProtocol, LogUtils): pg_version=self.env.pg_version, pageserver_id=pageserver_id, allow_multiple=allow_multiple, + update_catalog=update_catalog, ) path = Path("endpoints") / self.endpoint_id / "pgdata" self.pgdata_dir = self.env.repo_dir / path @@ -4288,6 +4290,7 @@ class EndpointFactory: hot_standby: bool = False, config_lines: list[str] | None = None, pageserver_id: int | None = None, + update_catalog: bool = False, ) -> Endpoint: ep = Endpoint( self.env, @@ -4309,6 +4312,7 @@ class EndpointFactory: hot_standby=hot_standby, config_lines=config_lines, pageserver_id=pageserver_id, + update_catalog=update_catalog, ) def stop_all(self, fail_on_error=True) -> Self: diff --git a/test_runner/fixtures/workload.py b/test_runner/fixtures/workload.py index eea0ec2b95..1947a9c3fb 100644 --- a/test_runner/fixtures/workload.py +++ b/test_runner/fixtures/workload.py @@ -53,6 +53,8 @@ class Workload: self._endpoint: Endpoint | None = None self._endpoint_opts = endpoint_opts or {} + self._configured_pageserver: int | None = None + def branch( self, timeline_id: TimelineId, @@ -92,8 +94,12 @@ class Workload: **self._endpoint_opts, ) self._endpoint.start(pageserver_id=pageserver_id) + self._configured_pageserver = pageserver_id else: - self._endpoint.reconfigure(pageserver_id=pageserver_id) + if self._configured_pageserver != pageserver_id: + self._configured_pageserver = pageserver_id + self._endpoint.reconfigure(pageserver_id=pageserver_id) + self._endpoint_config = pageserver_id connstring = self._endpoint.safe_psql( "SELECT setting FROM pg_settings WHERE name='neon.pageserver_connstring'" @@ -122,6 +128,7 @@ class Workload: def write_rows(self, n: int, pageserver_id: int | None = None, upload: bool = True): endpoint = self.endpoint(pageserver_id) + start = self.expect_rows end = start + n - 1 self.expect_rows += n diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index f3347b594e..f10872590c 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -689,9 +689,7 @@ def test_pageserver_compaction_circuit_breaker(neon_env_builder: NeonEnvBuilder) env.pageserver.http_client().configure_failpoints((FAILPOINT, "return")) # Write some data to trigger compaction - workload.write_rows(1024, upload=False) - workload.write_rows(1024, upload=False) - workload.write_rows(1024, upload=False) + workload.write_rows(32768, upload=False) def assert_broken(): env.pageserver.assert_log_contains(BROKEN_LOG) diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 6f8070e2ba..8910873690 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -91,7 +91,7 @@ def test_sharding_smoke( workload.init() sizes_before = get_sizes() - workload.write_rows(256) + workload.write_rows(65536) # Test that we can read data back from a sharded tenant workload.validate() @@ -1368,6 +1368,7 @@ def test_sharding_split_failures( workload = Workload(env, tenant_id, timeline_id) workload.init() workload.write_rows(100) + compute_reconfigure_listener.register_workload(workload) # Put the environment into a failing state (exact meaning depends on `failure`) failure.apply(env) @@ -1546,6 +1547,9 @@ def test_sharding_backpressure(neon_env_builder: NeonEnvBuilder): # Tip: set to 100MB to make the test fail "max_replication_write_lag=1MB", ], + # We need `neon` extension for calling backpressure functions, + # this flag instructs `compute_ctl` to 
pre-install it. + "update_catalog": True, }, ) workload.init() @@ -1815,6 +1819,9 @@ def test_sharding_gc( # This is not okay, but it's not a scrubber bug: it's a pageserver issue that is exposed by # the specific pattern of aggressive checkpointing+image layer generation + GC that this test does. # TODO: remove when https://github.com/neondatabase/neon/issues/10720 is fixed - ps.allowed_errors.append( - ".*could not find data for key 020000000000000000000000000000000000.*" + ps.allowed_errors.extend( + [ + ".*could not find data for key 020000000000000000000000000000000000.*", + ".*could not ingest record.*", + ] ) diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index 46038ccbbb..b8253fb125 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -316,8 +316,11 @@ def test_scrubber_physical_gc_ancestors(neon_env_builder: NeonEnvBuilder, shard_ # This is not okay, but it's not a scrubber bug: it's a pageserver issue that is exposed by # the specific pattern of aggressive checkpointing+image layer generation + GC that this test does. # TODO: remove when https://github.com/neondatabase/neon/issues/10720 is fixed - ps.allowed_errors.append( - ".*could not find data for key 020000000000000000000000000000000000.*" + ps.allowed_errors.extend( + [ + ".*could not find data for key 020000000000000000000000000000000000.*", + ".*could not ingest record.*", + ] ) From ec354884ea4adf8381a87c8a6486161ca8054e2d Mon Sep 17 00:00:00 2001 From: Fedor Dikarev Date: Wed, 12 Feb 2025 15:03:10 +0100 Subject: [PATCH 446/583] Feat/pin docker images to sha (#10730) ## Problem With current approach for the base images in `Dockerfiles`, it's hard to track when image is updated, and as they are base, than update will invalidate all the layers, as base image changed. That also becomes more complicated, as we have a number of runners, and they may have different images with the tag `bookworm-slim`, so that will lead to invalidate caches, when image build on one runner will be used on another runners. To fix that problem, we could pin our base images to the specific sha, and that not only align images across runners, and also will allow us to have reproducible build and don't depend on any spontaneous changes in upstream. Fix: https://github.com/neondatabase/cloud/issues/24084 ## Summary of changes Beside of the main goal, that PR also included some small changes around Dockerfiles: 1. Main change: use `SHA` for `bookworm-slim` and `bullseye-slim` debian images 2. For the layers requiring `curl` we could add `curl` and `unzip` to the `build-deps` image, and use it as a base image for all the steps, removing extra dependency on `alpine/curl` 3. added `retry-on-host-error=on` for the `wgetrc` as it happened to me: fail to resolve hostname --- Dockerfile | 25 ++++++++- build-tools.Dockerfile | 34 +++++++++++-- compute/compute-node.Dockerfile | 78 ++++++++++++++++++----------- compute/vm-image-spec-bookworm.yaml | 4 +- compute/vm-image-spec-bullseye.yaml | 3 +- 5 files changed, 105 insertions(+), 39 deletions(-) diff --git a/Dockerfile b/Dockerfile index 7ba54c8ca5..b399bcf7e4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -10,6 +10,28 @@ ARG STABLE_PG_VERSION=16 ARG DEBIAN_VERSION=bookworm ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim +# Here are the INDEX DIGESTS for the images we use. +# You can get them following next steps for now: +# 1. 
Get an authentication token from DockerHub: +# TOKEN=$(curl -s "https://auth.docker.io/token?service=registry.docker.io&scope=repository:library/debian:pull" | jq -r .token) +# 2. Using that token, query index for the given tag: +# curl -s -H "Authorization: Bearer $TOKEN" \ +# -H "Accept: application/vnd.docker.distribution.manifest.list.v2+json" \ +# "https://registry.hub.docker.com/v2/library/debian/manifests/bullseye-slim" \ +# -I | grep -i docker-content-digest +# 3. As a next step, TODO(fedordikarev): create script and schedule workflow to run these checks +# and updates on regular bases and in automated way. +ARG BOOKWORM_SLIM_SHA=sha256:40b107342c492725bc7aacbe93a49945445191ae364184a6d24fedb28172f6f7 +ARG BULLSEYE_SLIM_SHA=sha256:e831d9a884d63734fe3dd9c491ed9a5a3d4c6a6d32c5b14f2067357c49b0b7e1 + +# Here we use ${var/search/replace} syntax, to check +# if base image is one of the images, we pin image index for. +# If var will match one the known images, we will replace it with the known sha. +# If no match, than value will be unaffected, and will process with no-pinned image. +ARG BASE_IMAGE_SHA=debian:${DEBIAN_FLAVOR} +ARG BASE_IMAGE_SHA=${BASE_IMAGE_SHA/debian:bookworm-slim/debian@$BOOKWORM_SLIM_SHA} +ARG BASE_IMAGE_SHA=${BASE_IMAGE_SHA/debian:bullseye-slim/debian@$BULLSEYE_SLIM_SHA} + # Build Postgres FROM $REPOSITORY/$IMAGE:$TAG AS pg-build WORKDIR /home/nonroot @@ -59,7 +81,7 @@ RUN set -e \ # Build final image # -FROM debian:${DEBIAN_FLAVOR} +FROM $BASE_IMAGE_SHA ARG DEFAULT_PG_VERSION WORKDIR /data @@ -112,4 +134,3 @@ EXPOSE 6400 EXPOSE 9898 CMD ["/usr/local/bin/pageserver", "-D", "/data/.neon"] - diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile index 52874d2ef6..fa72ca1bc2 100644 --- a/build-tools.Dockerfile +++ b/build-tools.Dockerfile @@ -1,6 +1,29 @@ ARG DEBIAN_VERSION=bookworm +ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim -FROM debian:bookworm-slim AS pgcopydb_builder +# Here are the INDEX DIGESTS for the images we use. +# You can get them following next steps for now: +# 1. Get an authentication token from DockerHub: +# TOKEN=$(curl -s "https://auth.docker.io/token?service=registry.docker.io&scope=repository:library/debian:pull" | jq -r .token) +# 2. Using that token, query index for the given tag: +# curl -s -H "Authorization: Bearer $TOKEN" \ +# -H "Accept: application/vnd.docker.distribution.manifest.list.v2+json" \ +# "https://registry.hub.docker.com/v2/library/debian/manifests/bullseye-slim" \ +# -I | grep -i docker-content-digest +# 3. As a next step, TODO(fedordikarev): create script and schedule workflow to run these checks +# and updates on regular bases and in automated way. +ARG BOOKWORM_SLIM_SHA=sha256:40b107342c492725bc7aacbe93a49945445191ae364184a6d24fedb28172f6f7 +ARG BULLSEYE_SLIM_SHA=sha256:e831d9a884d63734fe3dd9c491ed9a5a3d4c6a6d32c5b14f2067357c49b0b7e1 + +# Here we use ${var/search/replace} syntax, to check +# if base image is one of the images, we pin image index for. +# If var will match one the known images, we will replace it with the known sha. +# If no match, than value will be unaffected, and will process with no-pinned image. 
+ARG BASE_IMAGE_SHA=debian:${DEBIAN_FLAVOR} +ARG BASE_IMAGE_SHA=${BASE_IMAGE_SHA/debian:bookworm-slim/debian@$BOOKWORM_SLIM_SHA} +ARG BASE_IMAGE_SHA=${BASE_IMAGE_SHA/debian:bullseye-slim/debian@$BULLSEYE_SLIM_SHA} + +FROM $BASE_IMAGE_SHA AS pgcopydb_builder ARG DEBIAN_VERSION # Use strict mode for bash to catch errors early @@ -9,7 +32,7 @@ SHELL ["/bin/bash", "-euo", "pipefail", "-c"] # By default, /bin/sh used in debian images will treat '\n' as eol, # but as we use bash as SHELL, and built-in echo in bash requires '-e' flag for that. RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \ - echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc && \ + echo -e "retry_connrefused=on\ntimeout=15\ntries=5\nretry-on-host-error=on\n" > /root/.wgetrc && \ echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc COPY build_tools/patches/pgcopydbv017.patch /pgcopydbv017.patch @@ -58,7 +81,7 @@ RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \ mkdir -p mkdir -p /pgcopydb/lib && touch /pgcopydb/lib/libpq.so.5; \ fi -FROM debian:${DEBIAN_VERSION}-slim AS build_tools +FROM $BASE_IMAGE_SHA AS build_tools ARG DEBIAN_VERSION # Add nonroot user @@ -75,7 +98,7 @@ COPY --from=pgcopydb_builder /usr/lib/postgresql/16/bin/pgcopydb /pgcopydb/bin/p COPY --from=pgcopydb_builder /pgcopydb/lib/libpq.so.5 /pgcopydb/lib/libpq.so.5 RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \ - echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc && \ + echo -e "retry_connrefused=on\ntimeout=15\ntries=5\nretry-on-host-error=on\n" > /root/.wgetrc && \ echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc # System deps @@ -138,7 +161,8 @@ RUN curl -fsSL \ --output sql_exporter.tar.gz \ && mkdir /tmp/sql_exporter \ && tar xzvf sql_exporter.tar.gz -C /tmp/sql_exporter --strip-components=1 \ - && mv /tmp/sql_exporter/sql_exporter /usr/local/bin/sql_exporter + && mv /tmp/sql_exporter/sql_exporter /usr/local/bin/sql_exporter \ + && rm sql_exporter.tar.gz # protobuf-compiler (protoc) ENV PROTOC_VERSION=25.1 diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 47637b4684..4a7dcf6f95 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -83,7 +83,28 @@ ARG TAG=pinned ARG BUILD_TAG ARG DEBIAN_VERSION=bookworm ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim -ARG ALPINE_CURL_VERSION=8.11.1 + +# Here are the INDEX DIGESTS for the images we use. +# You can get them following next steps for now: +# 1. Get an authentication token from DockerHub: +# TOKEN=$(curl -s "https://auth.docker.io/token?service=registry.docker.io&scope=repository:library/debian:pull" | jq -r .token) +# 2. Using that token, query index for the given tag: +# curl -s -H "Authorization: Bearer $TOKEN" \ +# -H "Accept: application/vnd.docker.distribution.manifest.list.v2+json" \ +# "https://registry.hub.docker.com/v2/library/debian/manifests/bullseye-slim" \ +# -I | grep -i docker-content-digest +# 3. As a next step, TODO(fedordikarev): create script and schedule workflow to run these checks +# and updates on regular bases and in automated way. +ARG BOOKWORM_SLIM_SHA=sha256:40b107342c492725bc7aacbe93a49945445191ae364184a6d24fedb28172f6f7 +ARG BULLSEYE_SLIM_SHA=sha256:e831d9a884d63734fe3dd9c491ed9a5a3d4c6a6d32c5b14f2067357c49b0b7e1 + +# Here we use ${var/search/replace} syntax, to check +# if base image is one of the images, we pin image index for. 
+# If var will match one the known images, we will replace it with the known sha. +# If no match, than value will be unaffected, and will process with no-pinned image. +ARG BASE_IMAGE_SHA=debian:${DEBIAN_FLAVOR} +ARG BASE_IMAGE_SHA=${BASE_IMAGE_SHA/debian:bookworm-slim/debian@$BOOKWORM_SLIM_SHA} +ARG BASE_IMAGE_SHA=${BASE_IMAGE_SHA/debian:bullseye-slim/debian@$BULLSEYE_SLIM_SHA} # By default, build all PostgreSQL extensions. For quick local testing when you don't # care about the extensions, pass EXTENSIONS=none or EXTENSIONS=minimal @@ -94,7 +115,7 @@ ARG EXTENSIONS=all # Layer "build-deps" # ######################################################################################### -FROM debian:$DEBIAN_FLAVOR AS build-deps +FROM $BASE_IMAGE_SHA AS build-deps ARG DEBIAN_VERSION # Use strict mode for bash to catch errors early @@ -103,7 +124,7 @@ SHELL ["/bin/bash", "-euo", "pipefail", "-c"] # By default, /bin/sh used in debian images will treat '\n' as eol, # but as we use bash as SHELL, and built-in echo in bash requires '-e' flag for that. RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \ - echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc && \ + echo -e "retry_connrefused = on\ntimeout=15\ntries=5\nretry-on-host-error=on\n" > /root/.wgetrc && \ echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc RUN case $DEBIAN_VERSION in \ @@ -127,7 +148,7 @@ RUN case $DEBIAN_VERSION in \ apt install --no-install-recommends --no-install-suggests -y \ ninja-build git autoconf automake libtool build-essential bison flex libreadline-dev \ zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget ca-certificates pkg-config libssl-dev \ - libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd \ + libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd curl unzip \ $VERSION_INSTALLS \ && apt clean && rm -rf /var/lib/apt/lists/* @@ -139,11 +160,11 @@ RUN case $DEBIAN_VERSION in \ ######################################################################################### FROM build-deps AS pg-build ARG PG_VERSION -COPY vendor/postgres-${PG_VERSION} postgres +COPY vendor/postgres-${PG_VERSION:?} postgres RUN cd postgres && \ export CONFIGURE_CMD="./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp \ --with-icu --with-libxml --with-libxslt --with-lz4" && \ - if [ "${PG_VERSION}" != "v14" ]; then \ + if [ "${PG_VERSION:?}" != "v14" ]; then \ # zstd is available only from PG15 export CONFIGURE_CMD="${CONFIGURE_CMD} --with-zstd"; \ fi && \ @@ -237,7 +258,7 @@ RUN case "${DEBIAN_VERSION}" in \ # Postgis 3.5.0 supports v17 WORKDIR /ext-src -RUN case "${PG_VERSION}" in \ +RUN case "${PG_VERSION:?}" in \ "v17") \ export POSTGIS_VERSION=3.5.0 \ export POSTGIS_CHECKSUM=ca698a22cc2b2b3467ac4e063b43a28413f3004ddd505bdccdd74c56a647f510 \ @@ -312,7 +333,7 @@ FROM build-deps AS pgrouting-src ARG DEBIAN_VERSION ARG PG_VERSION WORKDIR /ext-src -RUN case "${PG_VERSION}" in \ +RUN case "${PG_VERSION:?}" in \ "v17") \ export PGROUTING_VERSION=3.6.2 \ export PGROUTING_CHECKSUM=f4a1ed79d6f714e52548eca3bb8e5593c6745f1bde92eb5fb858efd8984dffa2 \ @@ -358,7 +379,7 @@ COPY compute/patches/plv8-3.1.10.patch . 
# # Use new version only for v17 # because since v3.2, plv8 doesn't include plcoffee and plls extensions -RUN case "${PG_VERSION}" in \ +RUN case "${PG_VERSION:?}" in \ "v17") \ export PLV8_TAG=v3.2.3 \ ;; \ @@ -372,7 +393,7 @@ RUN case "${PG_VERSION}" in \ git clone --recurse-submodules --depth 1 --branch ${PLV8_TAG} https://github.com/plv8/plv8.git plv8-src && \ tar -czf plv8.tar.gz --exclude .git plv8-src && \ cd plv8-src && \ - if [[ "${PG_VERSION}" < "v17" ]]; then patch -p1 < /ext-src/plv8-3.1.10.patch; fi + if [[ "${PG_VERSION:?}" < "v17" ]]; then patch -p1 < /ext-src/plv8-3.1.10.patch; fi FROM pg-build AS plv8-build ARG PG_VERSION @@ -392,7 +413,7 @@ RUN \ find /usr/local/pgsql/ -name "plv8-*.so" | xargs strip && \ # don't break computes with installed old version of plv8 cd /usr/local/pgsql/lib/ && \ - case "${PG_VERSION}" in \ + case "${PG_VERSION:?}" in \ "v17") \ ln -s plv8-3.2.3.so plv8-3.1.8.so && \ ln -s plv8-3.2.3.so plv8-3.1.5.so && \ @@ -729,7 +750,7 @@ FROM build-deps AS timescaledb-src ARG PG_VERSION WORKDIR /ext-src -RUN case "${PG_VERSION}" in \ +RUN case "${PG_VERSION:?}" in \ "v14" | "v15") \ export TIMESCALEDB_VERSION=2.10.1 \ export TIMESCALEDB_CHECKSUM=6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 \ @@ -767,7 +788,7 @@ ARG PG_VERSION # version-specific, has separate releases for each version WORKDIR /ext-src -RUN case "${PG_VERSION}" in \ +RUN case "${PG_VERSION:?}" in \ "v14") \ export PG_HINT_PLAN_VERSION=14_1_4_1 \ export PG_HINT_PLAN_CHECKSUM=c3501becf70ead27f70626bce80ea401ceac6a77e2083ee5f3ff1f1444ec1ad1 \ @@ -843,7 +864,7 @@ ARG PG_VERSION # https://github.com/rdkit/rdkit/releases/tag/Release_2024_09_1 WORKDIR /ext-src -RUN case "${PG_VERSION}" in \ +RUN case "${PG_VERSION:?}" in \ "v17") \ export RDKIT_VERSION=Release_2024_09_1 \ export RDKIT_CHECKSUM=034c00d6e9de323506834da03400761ed8c3721095114369d06805409747a60f \ @@ -970,7 +991,7 @@ ARG PG_VERSION # # last release v0.40.0 - Jul 22, 2024 WORKDIR /ext-src -RUN case "${PG_VERSION}" in \ +RUN case "${PG_VERSION:?}" in \ "v17") \ export SEMVER_VERSION=0.40.0 \ export SEMVER_CHECKSUM=3e50bcc29a0e2e481e7b6d2bc937cadc5f5869f55d983b5a1aafeb49f5425cfc \ @@ -1006,7 +1027,7 @@ ARG PG_VERSION # This is our extension, support stopped in favor of pgvector # TODO: deprecate it WORKDIR /ext-src -RUN case "${PG_VERSION}" in \ +RUN case "${PG_VERSION:?}" in \ "v14" | "v15") \ export PG_EMBEDDING_VERSION=0.3.5 \ export PG_EMBEDDING_CHECKSUM=0e95b27b8b6196e2cf0a0c9ec143fe2219b82e54c5bb4ee064e76398cbe69ae9 \ @@ -1039,7 +1060,7 @@ ARG PG_VERSION # This is an experimental extension, never got to real production. # !Do not remove! It can be present in shared_preload_libraries and compute will fail to start if library is not found. WORKDIR /ext-src -RUN case "${PG_VERSION}" in "v17") \ +RUN case "${PG_VERSION:?}" in "v17") \ echo "postgresql_anonymizer does not yet support PG17" && exit 0;; \ esac && \ wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \ @@ -1091,7 +1112,7 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux FROM pg-build-nonroot-with-cargo AS rust-extensions-build ARG PG_VERSION -RUN case "${PG_VERSION}" in \ +RUN case "${PG_VERSION:?}" in \ 'v17') \ echo 'v17 is not supported yet by pgrx. 
Quit' && exit 0;; \ esac && \ @@ -1270,7 +1291,7 @@ FROM build-deps AS pgx_ulid-src ARG PG_VERSION WORKDIR /ext-src -RUN case "${PG_VERSION}" in \ +RUN case "${PG_VERSION:?}" in \ "v14" | "v15" | "v16") \ ;; \ *) \ @@ -1302,7 +1323,7 @@ FROM build-deps AS pgx_ulid-pgrx12-src ARG PG_VERSION WORKDIR /ext-src -RUN case "${PG_VERSION}" in \ +RUN case "${PG_VERSION:?}" in \ "v17") \ ;; \ *) \ @@ -1594,7 +1615,7 @@ RUN --mount=type=cache,uid=1000,target=/home/nonroot/.cargo/registry \ # ######################################################################################### -FROM debian:$DEBIAN_FLAVOR AS pgbouncer +FROM $BASE_IMAGE_SHA AS pgbouncer RUN set -e \ && echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries \ && apt update \ @@ -1624,13 +1645,12 @@ RUN set -e \ # Layer "exporters" # ######################################################################################### -FROM alpine/curl:${ALPINE_CURL_VERSION} AS exporters +FROM build-deps AS exporters ARG TARGETARCH # Keep sql_exporter version same as in build-tools.Dockerfile and # test_runner/regress/test_compute_metrics.py # See comment on the top of the file regading `echo`, `-e` and `\n` -RUN echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc; \ - if [ "$TARGETARCH" = "amd64" ]; then\ +RUN if [ "$TARGETARCH" = "amd64" ]; then\ postgres_exporter_sha256='027e75dda7af621237ff8f5ac66b78a40b0093595f06768612b92b1374bd3105';\ pgbouncer_exporter_sha256='c9f7cf8dcff44f0472057e9bf52613d93f3ffbc381ad7547a959daa63c5e84ac';\ sql_exporter_sha256='38e439732bbf6e28ca4a94d7bc3686d3fa1abdb0050773d5617a9efdb9e64d08';\ @@ -1654,7 +1674,7 @@ RUN echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 30 # Layer "awscli" # ######################################################################################### -FROM alpine/curl:${ALPINE_CURL_VERSION} AS awscli +FROM build-deps AS awscli ARG TARGETARCH RUN set -ex; \ if [ "${TARGETARCH}" = "amd64" ]; then \ @@ -1704,7 +1724,7 @@ USER nonroot COPY --chown=nonroot compute compute -RUN make PG_VERSION="${PG_VERSION}" -C compute +RUN make PG_VERSION="${PG_VERSION:?}" -C compute ######################################################################################### # @@ -1737,8 +1757,8 @@ COPY --from=hll-src /ext-src/ /ext-src/ COPY --from=plpgsql_check-src /ext-src/ /ext-src/ #COPY --from=timescaledb-src /ext-src/ /ext-src/ COPY --from=pg_hint_plan-src /ext-src/ /ext-src/ -COPY compute/patches/pg_hint_plan_${PG_VERSION}.patch /ext-src -RUN cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan_${PG_VERSION}.patch +COPY compute/patches/pg_hint_plan_${PG_VERSION:?}.patch /ext-src +RUN cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan_${PG_VERSION:?}.patch COPY --from=pg_cron-src /ext-src/ /ext-src/ #COPY --from=pgx_ulid-src /ext-src/ /ext-src/ #COPY --from=pgx_ulid-pgrx12-src /ext-src/ /ext-src/ @@ -1767,7 +1787,7 @@ ENV PGDATABASE=postgres # Put it all together into the final image # ######################################################################################### -FROM debian:$DEBIAN_FLAVOR +FROM $BASE_IMAGE_SHA ARG DEBIAN_VERSION # Use strict mode for bash to catch errors early diff --git a/compute/vm-image-spec-bookworm.yaml b/compute/vm-image-spec-bookworm.yaml index 86caa95f38..568f0b0444 100644 --- a/compute/vm-image-spec-bookworm.yaml +++ b/compute/vm-image-spec-bookworm.yaml @@ -74,8 +74,8 @@ build: | # At time of migration to bookworm (2024-10-09), debian has a version of 
libcgroup/cgroup-tools 2.0.2, # and it _probably_ can be used as-is. However, we'll build it ourselves to minimise the changeset # for debian version migration. - # - FROM debian:bookworm-slim as libcgroup-builder + ARG BOOKWORM_SLIM_SHA=sha256:40b107342c492725bc7aacbe93a49945445191ae364184a6d24fedb28172f6f7 + FROM debian@$BOOKWORM_SLIM_SHA as libcgroup-builder ENV LIBCGROUP_VERSION=v2.0.3 RUN set -exu \ diff --git a/compute/vm-image-spec-bullseye.yaml b/compute/vm-image-spec-bullseye.yaml index 2fe50c3a45..124c40cf5d 100644 --- a/compute/vm-image-spec-bullseye.yaml +++ b/compute/vm-image-spec-bullseye.yaml @@ -68,7 +68,8 @@ build: | # At time of writing (2023-03-14), debian bullseye has a version of cgroup-tools (technically # libcgroup) that doesn't support cgroup v2 (version 0.41-11). Unfortunately, the vm-monitor # requires cgroup v2, so we'll build cgroup-tools ourselves. - FROM debian:bullseye-slim as libcgroup-builder + ARG BULLSEYE_SLIM_SHA=sha256:e831d9a884d63734fe3dd9c491ed9a5a3d4c6a6d32c5b14f2067357c49b0b7e1 + FROM debian@$BULLSEYE_SLIM_SHA as libcgroup-builder ENV LIBCGROUP_VERSION=v2.0.3 RUN set -exu \ From f62047ae97cc5185fbb025118d6cc4f18662d944 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Wed, 12 Feb 2025 17:12:21 +0100 Subject: [PATCH 447/583] pageserver: add separate semaphore for L0 compaction (#10780) ## Problem L0 compaction frequently gets starved out by other background tasks and image/GC compaction. L0 compaction must be responsive to keep read amplification under control. Touches #10694. Resolves #10689. ## Summary of changes Use a separate semaphore for the L0-only compaction pass. * Add a `CONCURRENT_L0_COMPACTION_TASKS` semaphore and `BackgroundLoopKind::L0Compaction`. * Add a setting `compaction_l0_semaphore` (default off via `compaction_l0_first`). * Use the L0 semaphore when doing an `OnlyL0Compaction` pass. * Use the background semaphore when doing a regular compaction pass (which includes an initial L0 pass). * While waiting for the background semaphore, yield for L0 compaction if triggered. * Add `CompactFlags::NoYield` to disable L0 yielding, and set it for the HTTP API route. * Remove the old `use_compaction_semaphore` setting and compaction-scoped semaphore. * Remove the warning when waiting for a semaphore; it's noisy and we have metrics. 
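Concretely, the two knobs listed above combine as follows: the dedicated semaphore is only requested for the L0-only pass, and that pass only exists when `compaction_l0_first` is enabled. A minimal illustration in the style of the `test_fully_custom_config` dict touched by this patch (the field names are the ones added here; the values are just an example of opting in):

```python
# Illustrative tenant config fragment only; both flags are needed for the
# dedicated L0 semaphore to have any effect.
tenant_conf = {
    "compaction_l0_first": True,      # schedule an L0-only pass before image/GC compaction
    "compaction_l0_semaphore": True,  # run that pass under CONCURRENT_L0_COMPACTION_TASKS
}
```
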
--- control_plane/src/pageserver.rs | 5 ++ libs/pageserver_api/src/config.rs | 7 +- libs/pageserver_api/src/models.rs | 8 +++ pageserver/src/config.rs | 6 -- pageserver/src/http/routes.rs | 1 + pageserver/src/tenant.rs | 1 + pageserver/src/tenant/config.rs | 13 ++++ pageserver/src/tenant/tasks.rs | 57 +++++++--------- pageserver/src/tenant/timeline.rs | 66 ++++++++++++++----- pageserver/src/tenant/timeline/compaction.rs | 5 +- .../src/tenant/timeline/eviction_task.rs | 7 +- .../regress/test_attach_tenant_config.py | 1 + 12 files changed, 110 insertions(+), 67 deletions(-) diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index e15b30236e..28d130d9e0 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -362,6 +362,11 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'compaction_l0_first' as a bool")?, + compaction_l0_semaphore: settings + .remove("compaction_l0_semaphore") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'compaction_l0_semaphore' as a bool")?, l0_flush_delay_threshold: settings .remove("l0_flush_delay_threshold") .map(|x| x.parse::()) diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 9bc1b6d359..79f068a47b 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -94,7 +94,6 @@ pub struct ConfigToml { pub ondemand_download_behavior_treat_error_as_warn: bool, #[serde(with = "humantime_serde")] pub background_task_maximum_delay: Duration, - pub use_compaction_semaphore: bool, pub control_plane_api: Option, pub control_plane_api_token: Option, pub control_plane_emergency_mode: bool, @@ -266,6 +265,9 @@ pub struct TenantConfigToml { pub compaction_algorithm: crate::models::CompactionAlgorithmSettings, /// If true, compact down L0 across all tenant timelines before doing regular compaction. pub compaction_l0_first: bool, + /// If true, use a separate semaphore (i.e. concurrency limit) for the L0 compaction pass. Only + /// has an effect if `compaction_l0_first` is `true`. + pub compaction_l0_semaphore: bool, /// Level0 delta layer threshold at which to delay layer flushes for compaction backpressure, /// such that they take 2x as long, and start waiting for layer flushes during ephemeral layer /// rolls. This helps compaction keep up with WAL ingestion, and avoids read amplification @@ -474,7 +476,6 @@ impl Default for ConfigToml { DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY, ) .unwrap()), - use_compaction_semaphore: false, control_plane_api: (None), control_plane_api_token: (None), @@ -548,6 +549,7 @@ pub mod tenant_conf_defaults { // calculation to avoid loading all keys into the memory). So with this config, we can get a maximum peak compaction usage of 18GB. 
pub const DEFAULT_COMPACTION_UPPER_LIMIT: usize = 50; pub const DEFAULT_COMPACTION_L0_FIRST: bool = false; + pub const DEFAULT_COMPACTION_L0_SEMAPHORE: bool = true; pub const DEFAULT_COMPACTION_ALGORITHM: crate::models::CompactionAlgorithm = crate::models::CompactionAlgorithm::Legacy; @@ -598,6 +600,7 @@ impl Default for TenantConfigToml { kind: DEFAULT_COMPACTION_ALGORITHM, }, compaction_l0_first: DEFAULT_COMPACTION_L0_FIRST, + compaction_l0_semaphore: DEFAULT_COMPACTION_L0_SEMAPHORE, l0_flush_delay_threshold: None, l0_flush_stall_threshold: None, l0_flush_wait_upload: DEFAULT_L0_FLUSH_WAIT_UPLOAD, diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 335ac4cec5..6dbfbec345 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -466,6 +466,8 @@ pub struct TenantConfigPatch { #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub compaction_l0_first: FieldPatch, #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub compaction_l0_semaphore: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub l0_flush_delay_threshold: FieldPatch, #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub l0_flush_stall_threshold: FieldPatch, @@ -532,6 +534,7 @@ pub struct TenantConfig { // defer parsing compaction_algorithm, like eviction_policy pub compaction_algorithm: Option, pub compaction_l0_first: Option, + pub compaction_l0_semaphore: Option, pub l0_flush_delay_threshold: Option, pub l0_flush_stall_threshold: Option, pub l0_flush_wait_upload: Option, @@ -571,6 +574,7 @@ impl TenantConfig { mut compaction_upper_limit, mut compaction_algorithm, mut compaction_l0_first, + mut compaction_l0_semaphore, mut l0_flush_delay_threshold, mut l0_flush_stall_threshold, mut l0_flush_wait_upload, @@ -611,6 +615,9 @@ impl TenantConfig { .apply(&mut compaction_upper_limit); patch.compaction_algorithm.apply(&mut compaction_algorithm); patch.compaction_l0_first.apply(&mut compaction_l0_first); + patch + .compaction_l0_semaphore + .apply(&mut compaction_l0_semaphore); patch .l0_flush_delay_threshold .apply(&mut l0_flush_delay_threshold); @@ -675,6 +682,7 @@ impl TenantConfig { compaction_upper_limit, compaction_algorithm, compaction_l0_first, + compaction_l0_semaphore, l0_flush_delay_threshold, l0_flush_stall_threshold, l0_flush_wait_upload, diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 3dd519de75..c5368f6806 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -140,10 +140,6 @@ pub struct PageServerConf { /// not terrible. pub background_task_maximum_delay: Duration, - /// If true, use a separate semaphore for compaction tasks instead of the common background task - /// semaphore. Defaults to false. - pub use_compaction_semaphore: bool, - pub control_plane_api: Option, /// JWT token for use with the control plane API. 
@@ -340,7 +336,6 @@ impl PageServerConf { test_remote_failures, ondemand_download_behavior_treat_error_as_warn, background_task_maximum_delay, - use_compaction_semaphore, control_plane_api, control_plane_api_token, control_plane_emergency_mode, @@ -395,7 +390,6 @@ impl PageServerConf { test_remote_failures, ondemand_download_behavior_treat_error_as_warn, background_task_maximum_delay, - use_compaction_semaphore, control_plane_api, control_plane_emergency_mode, heatmap_upload_concurrency, diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 1d5edaa571..bd196621c1 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -2151,6 +2151,7 @@ async fn timeline_compact_handler( let state = get_state(&request); let mut flags = EnumSet::empty(); + flags |= CompactFlags::NoYield; // run compaction to completion if Some(true) == parse_query_param::<_, bool>(&request, "force_l0_compaction")? { flags |= CompactFlags::ForceL0Compaction; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 8520ae62e8..4c65991e45 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -5542,6 +5542,7 @@ pub(crate) mod harness { compaction_upper_limit: Some(tenant_conf.compaction_upper_limit), compaction_algorithm: Some(tenant_conf.compaction_algorithm), compaction_l0_first: Some(tenant_conf.compaction_l0_first), + compaction_l0_semaphore: Some(tenant_conf.compaction_l0_semaphore), l0_flush_delay_threshold: tenant_conf.l0_flush_delay_threshold, l0_flush_stall_threshold: tenant_conf.l0_flush_stall_threshold, l0_flush_wait_upload: Some(tenant_conf.l0_flush_wait_upload), diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index cff33afffd..7fdfd736ad 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -289,6 +289,10 @@ pub struct TenantConfOpt { #[serde(default)] pub compaction_l0_first: Option, + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default)] + pub compaction_l0_semaphore: Option, + #[serde(skip_serializing_if = "Option::is_none")] #[serde(default)] pub l0_flush_delay_threshold: Option, @@ -423,6 +427,9 @@ impl TenantConfOpt { compaction_l0_first: self .compaction_l0_first .unwrap_or(global_conf.compaction_l0_first), + compaction_l0_semaphore: self + .compaction_l0_semaphore + .unwrap_or(global_conf.compaction_l0_semaphore), l0_flush_delay_threshold: self .l0_flush_delay_threshold .or(global_conf.l0_flush_delay_threshold), @@ -501,6 +508,7 @@ impl TenantConfOpt { mut compaction_upper_limit, mut compaction_algorithm, mut compaction_l0_first, + mut compaction_l0_semaphore, mut l0_flush_delay_threshold, mut l0_flush_stall_threshold, mut l0_flush_wait_upload, @@ -547,6 +555,9 @@ impl TenantConfOpt { .apply(&mut compaction_upper_limit); patch.compaction_algorithm.apply(&mut compaction_algorithm); patch.compaction_l0_first.apply(&mut compaction_l0_first); + patch + .compaction_l0_semaphore + .apply(&mut compaction_l0_semaphore); patch .l0_flush_delay_threshold .apply(&mut l0_flush_delay_threshold); @@ -629,6 +640,7 @@ impl TenantConfOpt { compaction_upper_limit, compaction_algorithm, compaction_l0_first, + compaction_l0_semaphore, l0_flush_delay_threshold, l0_flush_stall_threshold, l0_flush_wait_upload, @@ -692,6 +704,7 @@ impl From for models::TenantConfig { compaction_threshold: value.compaction_threshold, compaction_upper_limit: value.compaction_upper_limit, compaction_l0_first: value.compaction_l0_first, + compaction_l0_semaphore: 
value.compaction_l0_semaphore, l0_flush_delay_threshold: value.l0_flush_delay_threshold, l0_flush_stall_threshold: value.l0_flush_stall_threshold, l0_flush_wait_upload: value.l0_flush_wait_upload, diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 1fa01e4229..029444e973 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -4,7 +4,7 @@ use std::cmp::max; use std::future::Future; use std::ops::{ControlFlow, RangeInclusive}; use std::pin::pin; -use std::sync::{Arc, Mutex}; +use std::sync::Arc; use std::time::{Duration, Instant}; use once_cell::sync::Lazy; @@ -15,7 +15,7 @@ use tokio_util::sync::CancellationToken; use tracing::*; use crate::context::{DownloadBehavior, RequestContext}; -use crate::metrics::{BackgroundLoopSemaphoreMetricsRecorder, TENANT_TASK_EVENTS}; +use crate::metrics::{self, BackgroundLoopSemaphoreMetricsRecorder, TENANT_TASK_EVENTS}; use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME, TOKIO_WORKER_THREADS}; use crate::tenant::throttle::Stats; use crate::tenant::timeline::compaction::CompactionOutcome; @@ -25,7 +25,6 @@ use pageserver_api::config::tenant_conf_defaults::DEFAULT_COMPACTION_PERIOD; use utils::backoff::exponential_backoff_duration; use utils::completion::Barrier; use utils::pausable_failpoint; -use utils::rate_limit::RateLimit; /// Semaphore limiting concurrent background tasks (across all tenants). /// @@ -38,17 +37,17 @@ static CONCURRENT_BACKGROUND_TASKS: Lazy = Lazy::new(|| { Semaphore::new(permits) }); -/// Semaphore limiting concurrent compaction tasks (across all tenants). This is disabled by -/// default, see `use_compaction_semaphore`. -/// -/// We use 3/4 Tokio threads, to avoid blocking all threads in case we do any CPU-heavy work. +/// Semaphore limiting concurrent L0 compaction tasks (across all tenants). This is only used if +/// both `compaction_l0_semaphore` and `compaction_l0_first` are enabled. /// /// This is a separate semaphore from background tasks, because L0 compaction needs to be responsive -/// to avoid high read amp during heavy write workloads. +/// to avoid high read amp during heavy write workloads. Regular image/GC compaction is less +/// important (e.g. due to page images in delta layers) and can wait for other background tasks. /// -/// TODO: split image compaction and L0 compaction, and move image compaction to background tasks. -/// Only L0 compaction needs to be responsive, and it shouldn't block on image compaction. -static CONCURRENT_COMPACTION_TASKS: Lazy = Lazy::new(|| { +/// We use 3/4 Tokio threads, to avoid blocking all threads in case we do any CPU-heavy work. Note +/// that this runs on the same Tokio runtime as `CONCURRENT_BACKGROUND_TASKS`, and shares the same +/// thread pool. +static CONCURRENT_L0_COMPACTION_TASKS: Lazy = Lazy::new(|| { let total_threads = TOKIO_WORKER_THREADS.get(); let permits = max(1, (total_threads * 3).checked_div(4).unwrap_or(0)); assert_ne!(permits, 0, "we will not be adding in permits later"); @@ -59,7 +58,7 @@ static CONCURRENT_COMPACTION_TASKS: Lazy = Lazy::new(|| { /// Background jobs. /// /// NB: not all of these acquire a CONCURRENT_BACKGROUND_TASKS semaphore permit, only the ones that -/// do any significant IO. +/// do any significant IO or CPU work. 
#[derive( Debug, PartialEq, @@ -72,6 +71,9 @@ static CONCURRENT_COMPACTION_TASKS: Lazy = Lazy::new(|| { )] #[strum(serialize_all = "snake_case")] pub(crate) enum BackgroundLoopKind { + /// L0Compaction runs as a separate pass within the Compaction loop, not a separate loop. It is + /// used to request the `CONCURRENT_L0_COMPACTION_TASKS` semaphore and associated metrics. + L0Compaction, Compaction, Gc, Eviction, @@ -91,37 +93,22 @@ pub struct BackgroundLoopSemaphorePermit<'a> { /// Acquires a semaphore permit, to limit concurrent background jobs. pub(crate) async fn acquire_concurrency_permit( loop_kind: BackgroundLoopKind, - use_compaction_semaphore: bool, _ctx: &RequestContext, ) -> BackgroundLoopSemaphorePermit<'static> { - // TODO: use a lower threshold and remove the pacer once we resolve some blockage. - const WARN_THRESHOLD: Duration = Duration::from_secs(600); - static WARN_PACER: Lazy> = - Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); - - let mut recorder = crate::metrics::BACKGROUND_LOOP_SEMAPHORE.record(loop_kind); + let mut recorder = metrics::BACKGROUND_LOOP_SEMAPHORE.record(loop_kind); if loop_kind == BackgroundLoopKind::InitialLogicalSizeCalculation { pausable_failpoint!("initial-size-calculation-permit-pause"); } // TODO: assert that we run on BACKGROUND_RUNTIME; requires tokio_unstable Handle::id(); - let permit = if loop_kind == BackgroundLoopKind::Compaction && use_compaction_semaphore { - CONCURRENT_COMPACTION_TASKS.acquire().await - } else { - assert!(!use_compaction_semaphore); - CONCURRENT_BACKGROUND_TASKS.acquire().await - } - .expect("should never close"); + let semaphore = match loop_kind { + BackgroundLoopKind::L0Compaction => &CONCURRENT_L0_COMPACTION_TASKS, + _ => &CONCURRENT_BACKGROUND_TASKS, + }; + let permit = semaphore.acquire().await.expect("should never close"); - let waited = recorder.acquired(); - if waited >= WARN_THRESHOLD { - let waited = waited.as_secs_f64(); - WARN_PACER - .lock() - .unwrap() - .call(|| warn!("{loop_kind} task waited {waited:.3}s for semaphore permit")); - } + recorder.acquired(); BackgroundLoopSemaphorePermit { _permit: permit, @@ -589,7 +576,7 @@ pub(crate) fn warn_when_period_overrun( ?task, "task iteration took longer than the configured period" ); - crate::metrics::BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT + metrics::BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT .with_label_values(&[task.into(), &format!("{}", period.as_secs())]) .inc(); } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index afa8efa453..33ca75de17 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -879,6 +879,9 @@ pub(crate) enum CompactFlags { OnlyL0Compaction, EnhancedGcBottomMostCompaction, DryRun, + /// Disables compaction yielding e.g. due to high L0 count. This is set e.g. when requesting + /// compaction via HTTP API. + NoYield, } #[serde_with::serde_as] @@ -1787,34 +1790,46 @@ impl Timeline { .await } - /// Outermost timeline compaction operation; downloads needed layers. Returns whether we have pending - /// compaction tasks. + /// Outermost timeline compaction operation; downloads needed layers. + /// + /// NB: the cancellation token is usually from a background task, but can also come from a + /// request task. 
pub(crate) async fn compact_with_options( self: &Arc, cancel: &CancellationToken, options: CompactOptions, ctx: &RequestContext, ) -> Result { - // most likely the cancellation token is from background task, but in tests it could be the - // request task as well. + // Acquire the compaction lock and task semaphore. + // + // L0-only compaction uses a separate semaphore (if enabled) to make sure it isn't starved + // out by other background tasks (including image compaction). We request this via + // `BackgroundLoopKind::L0Compaction`. + // + // If this is a regular compaction pass, and L0-only compaction is enabled in the config, + // then we should yield for immediate L0 compaction if necessary while we're waiting for the + // background task semaphore. There's no point yielding otherwise, since we'd just end up + // right back here. + let is_l0_only = options.flags.contains(CompactFlags::OnlyL0Compaction); + let semaphore_kind = match is_l0_only && self.get_compaction_l0_semaphore() { + true => BackgroundLoopKind::L0Compaction, + false => BackgroundLoopKind::Compaction, + }; + let yield_for_l0 = !is_l0_only + && self.get_compaction_l0_first() + && !options.flags.contains(CompactFlags::NoYield); - let prepare = async move { + let acquire = async move { let guard = self.compaction_lock.lock().await; - - let permit = super::tasks::acquire_concurrency_permit( - BackgroundLoopKind::Compaction, - self.conf.use_compaction_semaphore, - ctx, - ) - .await; - + let permit = super::tasks::acquire_concurrency_permit(semaphore_kind, ctx).await; (guard, permit) }; - // this wait probably never needs any "long time spent" logging, because we already nag if - // compaction task goes over it's period (20s) which is quite often in production. let (_guard, _permit) = tokio::select! { - tuple = prepare => { tuple }, + (guard, permit) = acquire => (guard, permit), + _ = self.l0_compaction_trigger.notified(), if yield_for_l0 => { + return Ok(CompactionOutcome::YieldForL0); + } _ = self.cancel.cancelled() => return Ok(CompactionOutcome::Skipped), _ = cancel.cancelled() => return Ok(CompactionOutcome::Skipped), }; @@ -2326,6 +2341,20 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.compaction_upper_limit) } + pub fn get_compaction_l0_first(&self) -> bool { + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); + tenant_conf + .compaction_l0_first + .unwrap_or(self.conf.default_tenant_conf.compaction_l0_first) + } + + pub fn get_compaction_l0_semaphore(&self) -> bool { + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); + tenant_conf + .compaction_l0_semaphore + .unwrap_or(self.conf.default_tenant_conf.compaction_l0_semaphore) + } + fn get_l0_flush_delay_threshold(&self) -> Option { // Disable L0 flushes by default. This and compaction needs further tuning. const DEFAULT_L0_FLUSH_DELAY_FACTOR: usize = 0; // TODO: default to e.g. 
3 @@ -3143,7 +3172,6 @@ impl Timeline { async move { let wait_for_permit = super::tasks::acquire_concurrency_permit( BackgroundLoopKind::InitialLogicalSizeCalculation, - false, background_ctx, ); @@ -4188,6 +4216,7 @@ impl Timeline { ImageLayerCreationMode::Initial, ctx, LastImageLayerCreationStatus::Initial, + false, // don't yield for L0, we're flushing L0 ) .await?; debug_assert!( @@ -4760,6 +4789,7 @@ impl Timeline { mode: ImageLayerCreationMode, ctx: &RequestContext, last_status: LastImageLayerCreationStatus, + yield_for_l0: bool, ) -> Result<(Vec, LastImageLayerCreationStatus), CreateImageLayersError> { let timer = self.metrics.create_images_time_histo.start_timer(); @@ -4956,7 +4986,7 @@ impl Timeline { if let ImageLayerCreationMode::Try = mode { // We have at least made some progress - if batch_image_writer.pending_layer_num() >= 1 { + if yield_for_l0 && batch_image_writer.pending_layer_num() >= 1 { // The `Try` mode is currently only used on the compaction path. We want to avoid // image layer generation taking too long time and blocking L0 compaction. So in this // mode, we also inspect the current number of L0 layers and skip image layer generation diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index aea92d34e0..5b915c50d3 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -726,7 +726,9 @@ impl Timeline { } // Yield if we have pending L0 compaction. The scheduler will do another pass. - if l0_outcome == CompactionOutcome::Pending || l0_outcome == CompactionOutcome::YieldForL0 { + if (l0_outcome == CompactionOutcome::Pending || l0_outcome == CompactionOutcome::YieldForL0) + && !options.flags.contains(CompactFlags::NoYield) + { info!("image/ancestor compaction yielding for L0 compaction"); return Ok(CompactionOutcome::YieldForL0); } @@ -774,6 +776,7 @@ impl Timeline { .load() .as_ref() .clone(), + !options.flags.contains(CompactFlags::NoYield), ) .await .inspect_err(|err| { diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 42e5f1496d..77c33349e0 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -332,11 +332,8 @@ impl Timeline { cancel: &CancellationToken, ctx: &RequestContext, ) -> ControlFlow<(), BackgroundLoopSemaphorePermit<'static>> { - let acquire_permit = crate::tenant::tasks::acquire_concurrency_permit( - BackgroundLoopKind::Eviction, - false, - ctx, - ); + let acquire_permit = + crate::tenant::tasks::acquire_concurrency_permit(BackgroundLoopKind::Eviction, ctx); tokio::select! 
{ permit = acquire_permit => ControlFlow::Continue(permit), diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 34d56c5cb1..07600dd911 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -141,6 +141,7 @@ def test_fully_custom_config(positive_env: NeonEnv): "compaction_threshold": 13, "compaction_upper_limit": 100, "compaction_l0_first": False, + "compaction_l0_semaphore": False, "l0_flush_delay_threshold": 25, "l0_flush_stall_threshold": 42, "l0_flush_wait_upload": False, From 20fe4b8ec3466fd85ff039c3cbe82a1751da905f Mon Sep 17 00:00:00 2001 From: Cheng Chen Date: Wed, 12 Feb 2025 08:29:19 -0800 Subject: [PATCH 448/583] chore(compute): pg_mooncake v0.1.2 (#10778) ## Problem Upgrade pg_mooncake to v0.1.2 ## Summary of changes https://github.com/Mooncake-Labs/pg_mooncake/blob/main/CHANGELOG.md#012-2025-02-11 --- compute/compute-node.Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 4a7dcf6f95..6814aadcb9 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1451,8 +1451,8 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) && \ FROM build-deps AS pg_mooncake-src ARG PG_VERSION WORKDIR /ext-src -RUN wget https://github.com/Mooncake-Labs/pg_mooncake/releases/download/v0.1.1/pg_mooncake-0.1.1.tar.gz -O pg_mooncake.tar.gz && \ - echo "a2d16eff7948dde64f072609ca5d2962d6b4d07cb89d45952add473529c55f55 pg_mooncake.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/Mooncake-Labs/pg_mooncake/releases/download/v0.1.2/pg_mooncake-0.1.2.tar.gz -O pg_mooncake.tar.gz && \ + echo "4550473784fcdd2e1e18062bc01eb9c286abd27cdf5e11a4399be6c0a426ba90 pg_mooncake.tar.gz" | sha256sum --check && \ mkdir pg_mooncake-src && cd pg_mooncake-src && tar xzf ../pg_mooncake.tar.gz --strip-components=1 -C . && \ echo "make -f pg_mooncake-src/Makefile.build installcheck TEST_DIR=./test SQL_DIR=./sql SRC_DIR=./src" > neon-test.sh && \ chmod a+x neon-test.sh From f45f9209b9e6aa697dabb04dd3925ba8f615ce8b Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 12 Feb 2025 17:00:23 +0000 Subject: [PATCH 449/583] CI(trigger-e2e-tests): check permissions before running jobs (#10785) ## Problem PRs created by external contributors, in some cases might list failed jobs - `Trigger E2E Tests / cancel-previous-e2e-tests` - `Trigger E2E Tests / tag` They don't block the merge, and tests in fact pass (their counterparts in internal PR), but because jobs are triggered from an external PR (and not from the corresponding internal one) they still present as red marks. 
For example https://github.com/neondatabase/neon/pull/10778 ## Summary of changes - Check permissions before triggering e2e tests --- .github/workflows/trigger-e2e-tests.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/workflows/trigger-e2e-tests.yml b/.github/workflows/trigger-e2e-tests.yml index 31696248b0..7c7fae7972 100644 --- a/.github/workflows/trigger-e2e-tests.yml +++ b/.github/workflows/trigger-e2e-tests.yml @@ -15,7 +15,14 @@ env: E2E_CONCURRENCY_GROUP: ${{ github.repository }}-e2e-tests-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }} jobs: + check-permissions: + if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }} + uses: ./.github/workflows/check-permissions.yml + with: + github-event-name: ${{ github.event_name }} + cancel-previous-e2e-tests: + needs: [ check-permissions ] if: github.event_name == 'pull_request' runs-on: ubuntu-22.04 @@ -29,6 +36,7 @@ jobs: --field concurrency_group="${{ env.E2E_CONCURRENCY_GROUP }}" tag: + needs: [ check-permissions ] runs-on: ubuntu-22.04 outputs: build-tag: ${{ steps.build-tag.outputs.tag }} From 49775d28e4ac831418d57adc43d1dba42b1a8443 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Wed, 12 Feb 2025 18:54:21 +0100 Subject: [PATCH 450/583] fix(compute): Respect skip_pg_catalog_updates in reconfigure() (#10696) ## Problem We respect `skip_pg_catalog_updates` at the initial start, but ignore at the follow-up `/configure`. Yet, it's used for storage->cplane->compute notify requests after migrations, shard split, etc. So every time we get them, applying the new config takes much longer than it should because we go through Postgres catalog checks. Cplane sets this flag, when it does serves notify attach call https://github.com/neondatabase/cloud/commit/9068c7d7433f943af2bc350e9fd59772867e622c Related to `inc-403`, for example ## Summary of changes Look at `skip_pg_catalog_updates` in `compute.reconfigure()` --- compute_tools/src/compute.rs | 33 +++++----- .../regress/test_compute_reconfigure.py | 62 +++++++++++++++++++ 2 files changed, 79 insertions(+), 16 deletions(-) create mode 100644 test_runner/regress/test_compute_reconfigure.py diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index cadc6f84d1..d323ea3dcd 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -1400,26 +1400,27 @@ impl ComputeNode { let postgresql_conf_path = pgdata_path.join("postgresql.conf"); config::write_postgres_conf(&postgresql_conf_path, &spec, self.internal_http_port)?; - let max_concurrent_connections = spec.reconfigure_concurrency; + if !spec.skip_pg_catalog_updates { + let max_concurrent_connections = spec.reconfigure_concurrency; + // Temporarily reset max_cluster_size in config + // to avoid the possibility of hitting the limit, while we are reconfiguring: + // creating new extensions, roles, etc. + config::with_compute_ctl_tmp_override(pgdata_path, "neon.max_cluster_size=-1", || { + self.pg_reload_conf()?; - // Temporarily reset max_cluster_size in config - // to avoid the possibility of hitting the limit, while we are reconfiguring: - // creating new extensions, roles, etc. 
- config::with_compute_ctl_tmp_override(pgdata_path, "neon.max_cluster_size=-1", || { - self.pg_reload_conf()?; + if spec.mode == ComputeMode::Primary { + let mut conf = tokio_postgres::Config::from_str(self.connstr.as_str()).unwrap(); + conf.application_name("apply_config"); + let conf = Arc::new(conf); - if spec.mode == ComputeMode::Primary { - let mut conf = tokio_postgres::Config::from_str(self.connstr.as_str()).unwrap(); - conf.application_name("apply_config"); - let conf = Arc::new(conf); + let spec = Arc::new(spec.clone()); - let spec = Arc::new(spec.clone()); + self.apply_spec_sql(spec, conf, max_concurrent_connections)?; + } - self.apply_spec_sql(spec, conf, max_concurrent_connections)?; - } - - Ok(()) - })?; + Ok(()) + })?; + } self.pg_reload_conf()?; diff --git a/test_runner/regress/test_compute_reconfigure.py b/test_runner/regress/test_compute_reconfigure.py new file mode 100644 index 0000000000..6619548811 --- /dev/null +++ b/test_runner/regress/test_compute_reconfigure.py @@ -0,0 +1,62 @@ +from __future__ import annotations + +from fixtures.neon_fixtures import NeonEnv +from fixtures.utils import wait_until + + +def test_compute_reconfigure(neon_simple_env: NeonEnv): + """ + Test that we can change postgresql.conf settings even if + skip_pg_catalog_updates=True is set. + """ + env = neon_simple_env + + TEST_LOG_LINE_PREFIX = "%m [%p] [test_compute_reconfigure]: " + + endpoint = env.endpoints.create_start("main") + + # Check that the log line prefix is not set + # or different from TEST_LOG_LINE_PREFIX + with endpoint.cursor() as cursor: + cursor.execute("SHOW log_line_prefix;") + row = cursor.fetchone() + assert row is not None + assert row[0] != TEST_LOG_LINE_PREFIX + + endpoint.respec_deep( + **{ + "skip_pg_catalog_updates": True, + "cluster": { + "settings": [ + { + "name": "log_line_prefix", + "vartype": "string", + "value": TEST_LOG_LINE_PREFIX, + } + ] + }, + } + ) + endpoint.reconfigure() + + # Check that in logs we see that it was actually reconfigured, + # not restarted or something else. + endpoint.log_contains("INFO request{method=POST uri=/configure") + + # In /configure we only send SIGHUP at the end, so in theory + # it doesn't necessarily mean that Postgres already reloaded + # the new config; and it may race in some envs. + # So we wait until we see the log line that the config was changed. + def check_logs(): + endpoint.log_contains( + f'[test_compute_reconfigure]: LOG: parameter "log_line_prefix" changed to "{TEST_LOG_LINE_PREFIX}"' + ) + + wait_until(check_logs) + + # Check that the log line prefix is set + with endpoint.cursor() as cursor: + cursor.execute("SHOW log_line_prefix;") + row = cursor.fetchone() + assert row is not None + assert row[0] == TEST_LOG_LINE_PREFIX From b77dd66bc458ee1adee4f1f5c0c4753a67cb61e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?JC=20Gr=C3=BCnhage?= Date: Wed, 12 Feb 2025 18:54:51 +0100 Subject: [PATCH 451/583] refactor(ci): overhaul container image pushing (#10613) ## Problem Retagging container images and pushing container images taken from one registry to another is very tangled up with artifact building and not separated by component. This makes not building compute for storage releases and vice versa pretty tricky. To enable that, I want to clean up retagging and pushing of container images and then continue on making the pipelines for releases leaner by not building unnecessary things. 
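The approach taken here (detailed in the summary of changes below) is to describe every push as a JSON map from a source image to the list of target images, and to let one small script fan that map out. As a rough, hypothetical sketch only — the actual `scripts/push_with_image_map.py` added by this PR is not reproduced in this excerpt — the consuming side could look like this:

```python
# Hypothetical sketch of what a push script consuming IMAGE_MAP might do; the
# real scripts/push_with_image_map.py added by this PR is not shown here.
import json
import os
import subprocess

image_map: dict[str, list[str]] = json.loads(os.environ["IMAGE_MAP"])
for source, targets in image_map.items():
    for target in targets:
        # `imagetools create` copies a (possibly multi-arch) image to the target tag,
        # the same primitive the existing promote jobs use for retagging.
        subprocess.run(
            ["docker", "buildx", "imagetools", "create", "-t", target, source],
            check=True,
        )
```

The reusable workflow passes the map to the script via the `IMAGE_MAP` environment variable (see the `image_map` input example under "Known limitations" below), so the same mechanism covers plain retagging inside one registry and copying between Docker Hub, ECR and ACR.
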
## Summary of changes - Add a reusable workflow that can push to ACR, ECR and Docker Hub, while being very flexible in terms of source and target images. This allows for retagging and pushing images between container registries. - Stop pushing images to registries aside of docker hub in the jobs that build the images - Split image pushing into 4 different jobs (not mentioning special cases): - neon-dev - neon-prod - compute-dev - compute-prod ## TODO - Consider also using this for `pin-build-tools-image`, as it's basically another instance of the same thing. ## Known limitations - The ECR part of this workflow supports authenticating to multiple AWS accounts and therefore multiple ECR endpoints, but the ACR part only supports one Azure Account. If someone with more knowledge on Azure can tell me whether an equivalent to https://github.com/aws-actions/amazon-ecr-login?tab=readme-ov-file#login-to-ecr-on-multiple-aws-accounts is easily possible, that'd be great. - The `image_map` input is a bit complex. It expects something along the lines of ``` { "docker.io/neondatabase/compute-node-v14:13196061314": [ "docker.io/neondatabase/compute-node-v14:13196061314", "369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:13196061314", "neoneastus2.azurecr.io/neondatabase/compute-node-v14:13196061314" ], "docker.io/neondatabase/compute-node-v15:13196061314": [ "docker.io/neondatabase/compute-node-v15:13196061314", "369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:13196061314", "neoneastus2.azurecr.io/neondatabase/compute-node-v15:13196061314" ] } ``` to map from source to target image. We have a small python step to generate this map for the 4 main image pushing jobs. The concrete example is taken from https://github.com/neondatabase/neon/actions/runs/13196061314/job/36838584098?pr=10613#step:3:6 and shortened to two images. --- .github/workflows/_push-to-acr.yml | 56 ---- .../workflows/_push-to-container-registry.yml | 101 +++++++ .github/workflows/build_and_test.yml | 258 +++++++----------- .github/workflows/trigger-e2e-tests.yml | 30 +- scripts/generate_image_maps.py | 58 ++++ scripts/push_with_image_map.py | 22 ++ 6 files changed, 294 insertions(+), 231 deletions(-) delete mode 100644 .github/workflows/_push-to-acr.yml create mode 100644 .github/workflows/_push-to-container-registry.yml create mode 100644 scripts/generate_image_maps.py create mode 100644 scripts/push_with_image_map.py diff --git a/.github/workflows/_push-to-acr.yml b/.github/workflows/_push-to-acr.yml deleted file mode 100644 index c304172ff7..0000000000 --- a/.github/workflows/_push-to-acr.yml +++ /dev/null @@ -1,56 +0,0 @@ -name: Push images to ACR -on: - workflow_call: - inputs: - client_id: - description: Client ID of Azure managed identity or Entra app - required: true - type: string - image_tag: - description: Tag for the container image - required: true - type: string - images: - description: Images to push - required: true - type: string - registry_name: - description: Name of the container registry - required: true - type: string - subscription_id: - description: Azure subscription ID - required: true - type: string - tenant_id: - description: Azure tenant ID - required: true - type: string - -jobs: - push-to-acr: - runs-on: ubuntu-22.04 - permissions: - contents: read # This is required for actions/checkout - id-token: write # This is required for Azure Login to work. 
- - steps: - - name: Azure login - uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1 - with: - client-id: ${{ inputs.client_id }} - subscription-id: ${{ inputs.subscription_id }} - tenant-id: ${{ inputs.tenant_id }} - - - name: Login to ACR - run: | - az acr login --name=${{ inputs.registry_name }} - - - name: Copy docker images to ACR ${{ inputs.registry_name }} - run: | - images='${{ inputs.images }}' - for image in ${images}; do - docker buildx imagetools create \ - -t ${{ inputs.registry_name }}.azurecr.io/neondatabase/${image}:${{ inputs.image_tag }} \ - neondatabase/${image}:${{ inputs.image_tag }} - done diff --git a/.github/workflows/_push-to-container-registry.yml b/.github/workflows/_push-to-container-registry.yml new file mode 100644 index 0000000000..3c97c8a67a --- /dev/null +++ b/.github/workflows/_push-to-container-registry.yml @@ -0,0 +1,101 @@ +name: Push images to Container Registry +on: + workflow_call: + inputs: + # Example: {"docker.io/neondatabase/neon:13196061314":["369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:13196061314","neoneastus2.azurecr.io/neondatabase/neon:13196061314"]} + image-map: + description: JSON map of images, mapping from a source image to an array of target images that should be pushed. + required: true + type: string + aws-region: + description: AWS region to log in to. Required when pushing to ECR. + required: false + type: string + aws-account-ids: + description: Comma separated AWS account IDs to log in to for pushing to ECR. Required when pushing to ECR. + required: false + type: string + azure-client-id: + description: Client ID of Azure managed identity or Entra app. Required when pushing to ACR. + required: false + type: string + azure-subscription-id: + description: Azure subscription ID. Required when pushing to ACR. + required: false + type: string + azure-tenant-id: + description: Azure tenant ID. Required when pushing to ACR. + required: false + type: string + acr-registry-name: + description: ACR registry name. Required when pushing to ACR. + required: false + type: string + secrets: + docker-hub-username: + description: Docker Hub username. Required when pushing to Docker Hub. + required: false + docker-hub-password: + description: Docker Hub password. Required when pushing to Docker Hub. + required: false + aws-role-to-assume: + description: AWS role to assume. Required when pushing to ECR. 
+ required: false + +permissions: {} + +defaults: + run: + shell: bash -euo pipefail {0} + +jobs: + push-to-container-registry: + runs-on: ubuntu-22.04 + permissions: + id-token: write # Required for aws/azure login + steps: + - uses: actions/checkout@v4 + with: + sparse-checkout: scripts/push_with_image_map.py + sparse-checkout-cone-mode: false + + - name: Print image-map + run: echo '${{ inputs.image-map }}' | jq + + - name: Configure AWS credentials + if: contains(inputs.image-map, 'amazonaws.com/') + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: "${{ inputs.aws-region }}" + role-to-assume: "${{ secrets.aws-role-to-assume }}" + role-duration-seconds: 3600 + + - name: Login to ECR + if: contains(inputs.image-map, 'amazonaws.com/') + uses: aws-actions/amazon-ecr-login@v2 + with: + registries: "${{ inputs.aws-account-ids }}" + + - name: Configure Azure credentials + if: contains(inputs.image-map, 'azurecr.io/') + uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1 + with: + client-id: ${{ inputs.azure-client-id }} + subscription-id: ${{ inputs.azure-subscription-id }} + tenant-id: ${{ inputs.azure-tenant-id }} + + - name: Login to ACR + if: contains(inputs.image-map, 'azurecr.io/') + run: | + az acr login --name=${{ inputs.acr-registry-name }} + + - name: Log in to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ secrets.docker-hub-username }} + password: ${{ secrets.docker-hub-password }} + + - name: Copy docker images to target registries + run: python scripts/push_with_image_map.py + env: + IMAGE_MAP: ${{ inputs.image-map }} diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 5a4bdecb99..bbb489c152 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -497,7 +497,7 @@ jobs: trigger-e2e-tests: if: ${{ !github.event.pull_request.draft || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') || github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' }} - needs: [ check-permissions, promote-images-dev, tag ] + needs: [ check-permissions, push-neon-image-dev, push-compute-image-dev, tag ] uses: ./.github/workflows/trigger-e2e-tests.yml secrets: inherit @@ -571,21 +571,6 @@ jobs: neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-x64 \ neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-arm64 - - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@v4 - with: - aws-region: eu-central-1 - role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - role-duration-seconds: 3600 - - - name: Login to Amazon Dev ECR - uses: aws-actions/amazon-ecr-login@v2 - - - name: Push multi-arch image to ECR - run: | - docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{ needs.tag.outputs.build-tag }} \ - neondatabase/neon:${{ needs.tag.outputs.build-tag }} - compute-node-image-arch: needs: [ check-permissions, build-build-tools-image, tag ] permissions: @@ -632,16 +617,6 @@ jobs: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@v4 - with: - aws-region: eu-central-1 - role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - role-duration-seconds: 3600 - - - name: Login to Amazon Dev ECR - uses: aws-actions/amazon-ecr-login@v2 - - uses: docker/login-action@v3 with: 
registry: cache.neon.build @@ -729,21 +704,6 @@ jobs: neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \ neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64 - - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@v4 - with: - aws-region: eu-central-1 - role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - role-duration-seconds: 3600 - - - name: Login to Amazon Dev ECR - uses: aws-actions/amazon-ecr-login@v2 - - - name: Push multi-arch compute-node-${{ matrix.version.pg }} image to ECR - run: | - docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \ - neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} - vm-compute-node-image: needs: [ check-permissions, tag, compute-node-image ] runs-on: [ self-hosted, large ] @@ -876,133 +836,109 @@ jobs: docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml logs || true docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml down - promote-images-dev: - needs: [ check-permissions, tag, vm-compute-node-image, neon-image ] + generate-image-maps: + needs: [ tag ] runs-on: ubuntu-22.04 - - permissions: - id-token: write # aws-actions/configure-aws-credentials - statuses: write - contents: read - - env: - VERSIONS: v14 v15 v16 v17 - + outputs: + neon-dev: ${{ steps.generate.outputs.neon-dev }} + neon-prod: ${{ steps.generate.outputs.neon-prod }} + compute-dev: ${{ steps.generate.outputs.compute-dev }} + compute-prod: ${{ steps.generate.outputs.compute-prod }} steps: - - uses: docker/login-action@v3 + - uses: actions/checkout@v4 with: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + sparse-checkout: scripts/generate_image_maps.py + sparse-checkout-cone-mode: false - - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@v4 - with: - aws-region: eu-central-1 - role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - role-duration-seconds: 3600 + - name: Generate Image Maps + id: generate + run: python scripts/generate_image_maps.py + env: + BUILD_TAG: "${{ needs.tag.outputs.build-tag }}" + BRANCH: "${{ github.ref_name }}" + DEV_ACR: "${{ vars.AZURE_DEV_REGISTRY_NAME }}" + PROD_ACR: "${{ vars.AZURE_PROD_REGISTRY_NAME }}" - - name: Login to Amazon Dev ECR - uses: aws-actions/amazon-ecr-login@v2 + push-neon-image-dev: + needs: [ generate-image-maps, neon-image ] + uses: ./.github/workflows/_push-to-container-registry.yml + with: + image-map: '${{ needs.generate-image-maps.outputs.neon-dev }}' + aws-region: eu-central-1 + aws-account-ids: "369495373322" + azure-client-id: ${{ vars.AZURE_DEV_CLIENT_ID }} + azure-subscription-id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }} + azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} + acr-registry-name: ${{ vars.AZURE_DEV_REGISTRY_NAME }} + secrets: + aws-role-to-assume: "${{ vars.DEV_AWS_OIDC_ROLE_ARN }}" + docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - - name: Copy vm-compute-node images to ECR - run: | - for version in ${VERSIONS}; do - docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }} \ - 
neondatabase/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }} - done + push-compute-image-dev: + needs: [ generate-image-maps, vm-compute-node-image ] + uses: ./.github/workflows/_push-to-container-registry.yml + with: + image-map: '${{ needs.generate-image-maps.outputs.compute-dev }}' + aws-region: eu-central-1 + aws-account-ids: "369495373322" + azure-client-id: ${{ vars.AZURE_DEV_CLIENT_ID }} + azure-subscription-id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }} + azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} + acr-registry-name: ${{ vars.AZURE_DEV_REGISTRY_NAME }} + secrets: + aws-role-to-assume: "${{ vars.DEV_AWS_OIDC_ROLE_ARN }}" + docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - promote-images-prod: - needs: [ check-permissions, tag, test-images, promote-images-dev ] - runs-on: ubuntu-22.04 + push-neon-image-prod: if: github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' + needs: [ generate-image-maps, neon-image, test-images ] + uses: ./.github/workflows/_push-to-container-registry.yml + with: + image-map: '${{ needs.generate-image-maps.outputs.neon-prod }}' + aws-region: eu-central-1 + aws-account-ids: "093970136003" + azure-client-id: ${{ vars.AZURE_PROD_CLIENT_ID }} + azure-subscription-id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }} + azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} + acr-registry-name: ${{ vars.AZURE_PROD_REGISTRY_NAME }} + secrets: + aws-role-to-assume: "${{ secrets.PROD_GHA_OIDC_ROLE }}" + docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - permissions: - id-token: write # aws-actions/configure-aws-credentials - statuses: write - contents: read + push-compute-image-prod: + if: github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' + needs: [ generate-image-maps, vm-compute-node-image, test-images ] + uses: ./.github/workflows/_push-to-container-registry.yml + with: + image-map: '${{ needs.generate-image-maps.outputs.compute-prod }}' + aws-region: eu-central-1 + aws-account-ids: "093970136003" + azure-client-id: ${{ vars.AZURE_PROD_CLIENT_ID }} + azure-subscription-id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }} + azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} + acr-registry-name: ${{ vars.AZURE_PROD_REGISTRY_NAME }} + secrets: + aws-role-to-assume: "${{ secrets.PROD_GHA_OIDC_ROLE }}" + docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - env: - VERSIONS: v14 v15 v16 v17 - - steps: - - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@v4 - with: - aws-region: eu-central-1 - role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - role-duration-seconds: 3600 - - - name: Login to Amazon Dev ECR - uses: aws-actions/amazon-ecr-login@v2 - - - uses: docker/login-action@v3 - with: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - - - name: Add latest tag to images - if: github.ref_name == 'main' - run: | - for repo in neondatabase 369495373322.dkr.ecr.eu-central-1.amazonaws.com; do - docker buildx imagetools create -t $repo/neon:latest \ - $repo/neon:${{ needs.tag.outputs.build-tag }} - - for version in ${VERSIONS}; do - docker buildx imagetools create -t $repo/compute-node-${version}:latest \ - $repo/compute-node-${version}:${{ 
needs.tag.outputs.build-tag }} - - docker buildx imagetools create -t $repo/vm-compute-node-${version}:latest \ - $repo/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }} - done - done - docker buildx imagetools create -t neondatabase/neon-test-extensions-v16:latest \ - neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }} - - - name: Configure AWS-prod credentials - if: github.ref_name == 'release'|| github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' - uses: aws-actions/configure-aws-credentials@v4 - with: - aws-region: eu-central-1 - mask-aws-account-id: true - role-to-assume: ${{ secrets.PROD_GHA_OIDC_ROLE }} - - - name: Login to prod ECR - uses: docker/login-action@v3 - if: github.ref_name == 'release'|| github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' - with: - registry: 093970136003.dkr.ecr.eu-central-1.amazonaws.com - - - name: Copy all images to prod ECR - if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' - run: | - for image in neon {vm-,}compute-node-{v14,v15,v16,v17}; do - docker buildx imagetools create -t 093970136003.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }} \ - 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }} - done - - push-to-acr-dev: + # This is a bit of a special case so we're not using a generated image map. + add-latest-tag-to-neon-extensions-test-image: if: github.ref_name == 'main' - needs: [ tag, promote-images-dev ] - uses: ./.github/workflows/_push-to-acr.yml + needs: [ tag, compute-node-image ] + uses: ./.github/workflows/_push-to-container-registry.yml with: - client_id: ${{ vars.AZURE_DEV_CLIENT_ID }} - image_tag: ${{ needs.tag.outputs.build-tag }} - images: neon vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17 - registry_name: ${{ vars.AZURE_DEV_REGISTRY_NAME }} - subscription_id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }} - tenant_id: ${{ vars.AZURE_TENANT_ID }} - - push-to-acr-prod: - if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' - needs: [ tag, promote-images-prod ] - uses: ./.github/workflows/_push-to-acr.yml - with: - client_id: ${{ vars.AZURE_PROD_CLIENT_ID }} - image_tag: ${{ needs.tag.outputs.build-tag }} - images: neon vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17 - registry_name: ${{ vars.AZURE_PROD_REGISTRY_NAME }} - subscription_id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }} - tenant_id: ${{ vars.AZURE_TENANT_ID }} + image-map: | + { + "docker.io/neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }}": ["docker.io/neondatabase/neon-test-extensions-v16:latest"], + "docker.io/neondatabase/neon-test-extensions-v17:${{ needs.tag.outputs.build-tag }}": ["docker.io/neondatabase/neon-test-extensions-v17:latest"] + } + secrets: + docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} trigger-custom-extensions-build-and-wait: needs: [ check-permissions, tag ] @@ -1084,7 +1020,7 @@ jobs: exit 1 deploy: - needs: [ check-permissions, promote-images-prod, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait, push-to-acr-dev, push-to-acr-prod ] + needs: [ check-permissions, 
push-neon-image-prod, push-compute-image-prod, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait ] # `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `push-to-acr-dev` and `push-to-acr-prod` if: (github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute') && !failure() && !cancelled() permissions: @@ -1337,7 +1273,7 @@ jobs: done pin-build-tools-image: - needs: [ build-build-tools-image, promote-images-prod, build-and-test-locally ] + needs: [ build-build-tools-image, push-compute-image-prod, push-neon-image-prod, build-and-test-locally ] if: github.ref_name == 'main' uses: ./.github/workflows/pin-build-tools-image.yml with: @@ -1362,7 +1298,8 @@ jobs: - check-codestyle-rust - check-dependencies-rust - files-changed - - promote-images-dev + - push-compute-image-dev + - push-neon-image-dev - test-images - trigger-custom-extensions-build-and-wait runs-on: ubuntu-22.04 @@ -1379,6 +1316,7 @@ jobs: || needs.check-codestyle-python.result == 'skipped' || needs.check-codestyle-rust.result == 'skipped' || needs.files-changed.result == 'skipped' - || needs.promote-images-dev.result == 'skipped' + || needs.push-compute-image-dev.result == 'skipped' + || needs.push-neon-image-dev.result == 'skipped' || needs.test-images.result == 'skipped' || needs.trigger-custom-extensions-build-and-wait.result == 'skipped' diff --git a/.github/workflows/trigger-e2e-tests.yml b/.github/workflows/trigger-e2e-tests.yml index 7c7fae7972..27ed1e4cff 100644 --- a/.github/workflows/trigger-e2e-tests.yml +++ b/.github/workflows/trigger-e2e-tests.yml @@ -76,7 +76,7 @@ jobs: GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} TAG: ${{ needs.tag.outputs.build-tag }} steps: - - name: Wait for `promote-images-dev` job to finish + - name: Wait for `push-{neon,compute}-image-dev` job to finish # It's important to have a timeout here, the script in the step can run infinitely timeout-minutes: 60 run: | @@ -87,20 +87,20 @@ jobs: # For PRs we use the run id as the tag BUILD_AND_TEST_RUN_ID=${TAG} while true; do - conclusion=$(gh run --repo ${GITHUB_REPOSITORY} view ${BUILD_AND_TEST_RUN_ID} --json jobs --jq '.jobs[] | select(.name == "promote-images-dev") | .conclusion') - case "$conclusion" in - success) - break - ;; - failure | cancelled | skipped) - echo "The 'promote-images-dev' job didn't succeed: '${conclusion}'. Exiting..." - exit 1 - ;; - *) - echo "The 'promote-images-dev' hasn't succeed yet. Waiting..." - sleep 60 - ;; - esac + gh run --repo ${GITHUB_REPOSITORY} view ${BUILD_AND_TEST_RUN_ID} --json jobs --jq '[.jobs[] | select((.name | startswith("push-neon-image-dev")) or (.name | startswith("push-compute-image-dev"))) | {"name": .name, "conclusion": .conclusion, "url": .url}]' > jobs.json + if [ $(jq '[.[] | select(.conclusion == "success")]' jobs.json) -eq 2 ]; then + break + fi + jq -c '.[]' jobs.json | while read -r job; do + case $(echo $job | jq .conclusion) in + failure | cancelled | skipped) + echo "The '$(echo $job | jq .name)' job didn't succeed: '$(echo $job | jq .conclusion)'. See log in '$(echo $job | jq .url)' Exiting..." + exit 1 + ;; + esac + done + echo "The 'push-{neon,compute}-image-dev' jobs haven't succeeded yet. Waiting..." 
+ sleep 60 done - name: Set e2e-platforms diff --git a/scripts/generate_image_maps.py b/scripts/generate_image_maps.py new file mode 100644 index 0000000000..a2f553d290 --- /dev/null +++ b/scripts/generate_image_maps.py @@ -0,0 +1,58 @@ +import itertools +import json +import os + +build_tag = os.environ["BUILD_TAG"] +branch = os.environ["BRANCH"] +dev_acr = os.environ["DEV_ACR"] +prod_acr = os.environ["PROD_ACR"] + +components = { + "neon": ["neon"], + "compute": [ + "compute-node-v14", + "compute-node-v15", + "compute-node-v16", + "compute-node-v17", + "vm-compute-node-v14", + "vm-compute-node-v15", + "vm-compute-node-v16", + "vm-compute-node-v17", + ], +} + +registries = { + "dev": [ + "docker.io/neondatabase", + "369495373322.dkr.ecr.eu-central-1.amazonaws.com", + f"{dev_acr}.azurecr.io/neondatabase", + ], + "prod": [ + "093970136003.dkr.ecr.eu-central-1.amazonaws.com", + f"{prod_acr}.azurecr.io/neondatabase", + ], +} + +outputs: dict[str, dict[str, list[str]]] = {} + +target_tags = [build_tag, "latest"] if branch == "main" else [build_tag] +target_stages = ["dev", "prod"] if branch.startswith("release") else ["dev"] + +for component_name, component_images in components.items(): + for stage in target_stages: + outputs[f"{component_name}-{stage}"] = dict( + [ + ( + f"docker.io/neondatabase/{component_image}:{build_tag}", + [ + f"{combo[0]}/{component_image}:{combo[1]}" + for combo in itertools.product(registries[stage], target_tags) + ], + ) + for component_image in component_images + ] + ) + +with open(os.environ["GITHUB_OUTPUT"], "a") as f: + for key, value in outputs.items(): + f.write(f"{key}={json.dumps(value)}\n") diff --git a/scripts/push_with_image_map.py b/scripts/push_with_image_map.py new file mode 100644 index 0000000000..c68f6ad407 --- /dev/null +++ b/scripts/push_with_image_map.py @@ -0,0 +1,22 @@ +import json +import os +import subprocess + +image_map = os.getenv("IMAGE_MAP") +if not image_map: + raise ValueError("IMAGE_MAP environment variable is not set") + +try: + parsed_image_map: dict[str, list[str]] = json.loads(image_map) +except json.JSONDecodeError as e: + raise ValueError("Failed to parse IMAGE_MAP as JSON") from e + +for source, targets in parsed_image_map.items(): + for target in targets: + cmd = ["docker", "buildx", "imagetools", "create", "-t", target, source] + print(f"Running: {' '.join(cmd)}") + result = subprocess.run(cmd, text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + + if result.returncode != 0: + print(f"Error: {result.stdout}") + raise RuntimeError(f"Command failed: {' '.join(cmd)}") From 61d24746323e6f938bc74eb427b707e328e47185 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 12 Feb 2025 20:29:17 +0100 Subject: [PATCH 452/583] Also check by the planned gc cutoff for lease creation (#10764) We don't want to allow new leases below the planned gc cutoff either. Other APIs like branch creation or getpage requests already enforce this. 
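For illustration, the check this PR adds amounts to the following (a minimal sketch, not the actual pageserver code: `validate_lease_lsn` is a made-up name, `Lsn` is stubbed as a plain integer, and the lsn-lease-deadline and attach-mode handling are left out):

```rust
// Minimal sketch of the lease validation after this change: a requested LSN
// must not be below the applied GC cutoff (GC already ran past it) nor below
// the planned GC cutoff (GC may run past it at any time). `Lsn` is stubbed
// as a plain u64; the real type lives in `utils::lsn`.
type Lsn = u64;

fn validate_lease_lsn(
    lsn: Lsn,
    applied_gc_cutoff: Lsn,
    planned_gc_cutoff: Lsn,
) -> Result<(), String> {
    if lsn < applied_gc_cutoff {
        return Err(format!(
            "lsn lease requested below the latest gc cutoff: {lsn} < {applied_gc_cutoff}"
        ));
    }
    if lsn < planned_gc_cutoff {
        return Err(format!(
            "lsn lease requested below the planned gc cutoff: {lsn} < {planned_gc_cutoff}"
        ));
    }
    Ok(())
}

fn main() {
    // Above both cutoffs: the lease is granted.
    assert!(validate_lease_lsn(0x40, 0x30, 0x38).is_ok());
    // Between the applied and the planned cutoff: rejected by the new check.
    assert!(validate_lease_lsn(0x35, 0x30, 0x38).is_err());
}
```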
--- pageserver/src/tenant/timeline.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 33ca75de17..aa71ccbbab 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1557,6 +1557,7 @@ impl Timeline { let lsn = xlog_utils::normalize_lsn(lsn, WAL_SEGMENT_SIZE); let mut gc_info = self.gc_info.write().unwrap(); + let planned_cutoff = gc_info.min_cutoff(); let valid_until = SystemTime::now() + length; @@ -1577,7 +1578,7 @@ impl Timeline { existing_lease.clone() } Entry::Vacant(vacant) => { - // Reject already GC-ed LSN (lsn < latest_gc_cutoff) if we are in AttachedSingle and + // Reject already GC-ed LSN if we are in AttachedSingle and // not blocked by the lsn lease deadline. let validate = { let conf = self.tenant_conf.load(); @@ -1588,7 +1589,10 @@ impl Timeline { if init || validate { let latest_gc_cutoff_lsn = self.get_latest_gc_cutoff_lsn(); if lsn < *latest_gc_cutoff_lsn { - bail!("tried to request a page version that was garbage collected. requested at {} gc cutoff {}", lsn, *latest_gc_cutoff_lsn); + bail!("tried to request an lsn lease for an lsn below the latest gc cutoff. requested at {} gc cutoff {}", lsn, *latest_gc_cutoff_lsn); + } + if lsn < planned_cutoff { + bail!("tried to request an lsn lease for an lsn below the planned gc cutoff. requested at {} planned gc cutoff {}", lsn, planned_cutoff); } } From 922f3ee17d6e464b8cb257fa9371e94d6bf30204 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 12 Feb 2025 20:48:11 +0100 Subject: [PATCH 453/583] Compress git history of Azure SDK (#10790) Switch the Azure SDK git fork to one with a compressed git history. This helps with download speed of the git repository. 
closes #10732 --- Cargo.lock | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 30b7130bbf..407c8170bb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -786,7 +786,7 @@ dependencies = [ [[package]] name = "azure_core" version = "0.21.0" -source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#985db729824be324ed11527e45de722250028d9e" +source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#c36ed4c039bb3d59b5a1705f2cc337636c73b541" dependencies = [ "async-trait", "base64 0.22.1", @@ -815,7 +815,7 @@ dependencies = [ [[package]] name = "azure_identity" version = "0.21.0" -source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#985db729824be324ed11527e45de722250028d9e" +source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#c36ed4c039bb3d59b5a1705f2cc337636c73b541" dependencies = [ "async-lock", "async-trait", @@ -834,7 +834,7 @@ dependencies = [ [[package]] name = "azure_storage" version = "0.21.0" -source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#985db729824be324ed11527e45de722250028d9e" +source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#c36ed4c039bb3d59b5a1705f2cc337636c73b541" dependencies = [ "RustyXML", "async-lock", @@ -852,7 +852,7 @@ dependencies = [ [[package]] name = "azure_storage_blobs" version = "0.21.0" -source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#985db729824be324ed11527e45de722250028d9e" +source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#c36ed4c039bb3d59b5a1705f2cc337636c73b541" dependencies = [ "RustyXML", "azure_core", @@ -872,7 +872,7 @@ dependencies = [ [[package]] name = "azure_svc_blobstorage" version = "0.21.0" -source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#985db729824be324ed11527e45de722250028d9e" +source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#c36ed4c039bb3d59b5a1705f2cc337636c73b541" dependencies = [ "azure_core", "bytes", From e38694742cf0c5d2538856d0a969b52c2076a01b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?JC=20Gr=C3=BCnhage?= Date: Wed, 12 Feb 2025 21:26:05 +0100 Subject: [PATCH 454/583] fix(ci): don't try pushing to prod container registries from main (#10795) ## Problem https://github.com/neondatabase/neon/pull/10613 changed how images are pushed, and there was a small mismatch between the github workflow and the script generating what to push where. This resulted in the workflow trying to push images to prod registries from the main branch, even though we don't do that and therefore didn't generate a mapping for those registries in the script that decides what to push where. This misconception happened because promote-images-dev pushed to dev registries, and promote-images-prod pushed to prod registries, but promote-images-prod also updated the latest tag in the dev registries if and only if we are on the main branch. This last bit is why the push--image-prod jobs were trying to run on the main branch. ## Summary of changes Don't try pushing to prod registries from the main branch. 
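For reference, the branch-to-registry mapping that `scripts/generate_image_maps.py` and the workflow `if:` conditions now agree on can be restated as follows (an illustrative sketch only; the function names are made up for this example and the real logic stays in the Python script and the workflow):

```rust
// Illustrative restatement of the branch-to-registry mapping; `target_stages`
// and `target_tags` are example names, not functions in the repository.
fn target_stages(branch: &str) -> Vec<&'static str> {
    // Only release branches push to the prod registries.
    if branch.starts_with("release") {
        vec!["dev", "prod"]
    } else {
        vec!["dev"]
    }
}

fn target_tags(branch: &str, build_tag: &str) -> Vec<String> {
    // `latest` is only added on main, and main only pushes to dev registries.
    if branch == "main" {
        vec![build_tag.to_string(), "latest".to_string()]
    } else {
        vec![build_tag.to_string()]
    }
}

fn main() {
    assert_eq!(target_stages("main"), vec!["dev"]);
    assert_eq!(target_stages("release-compute"), vec!["dev", "prod"]);
    assert_eq!(target_tags("release", "13196061314"), vec!["13196061314"]);
}
```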
--- .github/workflows/build_and_test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index bbb489c152..88cb395958 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -892,7 +892,7 @@ jobs: docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} push-neon-image-prod: - if: github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' + if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' needs: [ generate-image-maps, neon-image, test-images ] uses: ./.github/workflows/_push-to-container-registry.yml with: @@ -909,7 +909,7 @@ jobs: docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} push-compute-image-prod: - if: github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' + if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' needs: [ generate-image-maps, vm-compute-node-image, test-images ] uses: ./.github/workflows/_push-to-container-registry.yml with: From 7b966a2b71914d2344c9eca04c49cc687a714ff9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?JC=20Gr=C3=BCnhage?= Date: Thu, 13 Feb 2025 11:13:26 +0100 Subject: [PATCH 455/583] CI(trigger-e2e-tests): fix checking for successful image pushes (#10803) ## Problem https://github.com/neondatabase/neon/pull/10613 changed how images are pushed, and therefore also how we have to wait for images to be pushed in `trigger-e2e-tests`. The `trigger-e2e-tests` workflow is triggered in three different ways: - When a pull request is pushed to that is already ready to review, here we call the workflow from `build_and_test` - When a pull request is marked ready for review, then the workflow is triggered directly - When a push to `main` or `release(-.*)?` triggers `build_and_test` and that indirectly calls `trigger-e2e-tests`. The second of these paths had a bug, which was not tested in the PR, because this path being different wasn't clear to me. ## Summary of changes Fix the jq statement that caused the bug. 
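Restated outside of jq, the corrected check counts how many of the two dev push jobs have concluded successfully (an illustrative sketch only; the workflow itself keeps using `gh` and `jq`, and the `(name, conclusion)` pairs here stand in for the objects produced by the `--jq` filter over `gh run ... --json jobs`):

```rust
// Illustrative restatement of the corrected success check.
fn both_dev_pushes_succeeded(jobs: &[(&str, &str)]) -> bool {
    let mut successes = 0;
    for (name, conclusion) in jobs {
        let is_dev_push = name.starts_with("push-neon-image-dev")
            || name.starts_with("push-compute-image-dev");
        if is_dev_push && *conclusion == "success" {
            successes += 1;
        }
    }
    // The old jq expression compared the filtered JSON array itself against 2;
    // piping it through `length` makes it an actual count, as done here.
    successes == 2
}

fn main() {
    let jobs = [
        ("push-neon-image-dev", "success"),
        ("push-compute-image-dev", ""),
    ];
    // Only one of the two jobs has succeeded so far, so keep waiting.
    assert!(!both_dev_pushes_succeeded(&jobs));
}
```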
--- .github/workflows/trigger-e2e-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/trigger-e2e-tests.yml b/.github/workflows/trigger-e2e-tests.yml index 27ed1e4cff..be6a7a7901 100644 --- a/.github/workflows/trigger-e2e-tests.yml +++ b/.github/workflows/trigger-e2e-tests.yml @@ -88,7 +88,7 @@ jobs: BUILD_AND_TEST_RUN_ID=${TAG} while true; do gh run --repo ${GITHUB_REPOSITORY} view ${BUILD_AND_TEST_RUN_ID} --json jobs --jq '[.jobs[] | select((.name | startswith("push-neon-image-dev")) or (.name | startswith("push-compute-image-dev"))) | {"name": .name, "conclusion": .conclusion, "url": .url}]' > jobs.json - if [ $(jq '[.[] | select(.conclusion == "success")]' jobs.json) -eq 2 ]; then + if [ $(jq '[.[] | select(.conclusion == "success")] | length' jobs.json) -eq 2 ]; then break fi jq -c '.[]' jobs.json | while read -r job; do From 356cca23a58d50cb6d210c3dde70763ee117afab Mon Sep 17 00:00:00 2001 From: Ivan Efremov Date: Thu, 13 Feb 2025 12:22:13 +0200 Subject: [PATCH 456/583] fix(proxy): Change HSet to HDel for cancellation key metric (#10789) --- proxy/src/cancellation.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index e84f1676e2..1f9c8a48b7 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -501,7 +501,7 @@ impl Session { _guard: Metrics::get() .proxy .cancel_channel_size - .guard(RedisMsgKind::HSet), + .guard(RedisMsgKind::HDel), }; let _ = tx.send_timeout(op, REDIS_SEND_TIMEOUT).await.map_err(|e| { From b8095f84a006c89914cc15b1ab5610d51fb92fc6 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 13 Feb 2025 10:33:47 +0000 Subject: [PATCH 457/583] pageserver: make true GC cutoff visible in admin API, rebrand `latest_gc_cutoff` as `applied_gc_cutoff` (#10707) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem We expose `latest_gc_cutoff` in our API, and callers understandably were using that to validate LSNs for branch creation. However, this is _not_ the true GC cutoff from a user's point of view: it's just the point at which we last actually did GC. The actual cutoff used when validating branch creations and page_service reads is the min() of latest_gc_cutoff and the planned GC lsn in GcInfo. Closes: https://github.com/neondatabase/neon/issues/10639 ## Summary of changes - Expose the more useful min() of GC cutoffs as `gc_cutoff_lsn` in the API, so that the most obviously named field is really the one people should use. - Retain the ability to read the LSN at which GC was actually done, in an `applied_gc_cutoff_lsn` field. - Internally rename `latest_gc_cutoff_lsn` to `applied_gc_cutoff_lsn` ("latest" was a confusing name, as the value in GcInfo is more up to date in terms of what a user experiences) - Temporarily preserve the old `latest_gc_cutoff_lsn` field for compat with control plane until we update it to use the new field. 
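To make the relationship between the two cutoffs concrete, the newly exposed `min_readable_lsn` is simply the later of the planned (PITR) cutoff and the applied cutoff (a minimal sketch of the computation added to `pageserver/src/http/routes.rs`, not the actual code; `Lsn` is stubbed as a plain integer here):

```rust
// The planned (PITR) cutoff and the applied cutoff can pass each other when
// the PITR interval changes, so the end of readable history for clients is
// the later of the two.
type Lsn = u64;

fn min_readable_lsn(planned_gc_cutoff: Lsn, applied_gc_cutoff: Lsn) -> Lsn {
    std::cmp::max(planned_gc_cutoff, applied_gc_cutoff)
}

fn main() {
    // PITR was shortened: GC has only trimmed to 0x30 but plans to trim to
    // 0x50, so branches may only be created at or above 0x50.
    assert_eq!(min_readable_lsn(0x50, 0x30), 0x50);
    // PITR was lengthened: GC had already trimmed to 0x60 before the window
    // grew, so 0x60 stays the oldest usable LSN despite the plan being 0x40.
    assert_eq!(min_readable_lsn(0x40, 0x60), 0x60);
}
```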
--------- Co-authored-by: Arpad Müller --- libs/pageserver_api/src/models.rs | 17 ++++++++ pageserver/src/http/openapi_spec.yml | 5 ++- pageserver/src/http/routes.rs | 12 ++++- pageserver/src/page_service.rs | 12 ++--- pageserver/src/pgdatadir_mapping.rs | 2 +- pageserver/src/tenant.rs | 46 ++++++++++---------- pageserver/src/tenant/metadata.rs | 3 ++ pageserver/src/tenant/size.rs | 2 +- pageserver/src/tenant/timeline.rs | 33 +++++++++----- pageserver/src/tenant/timeline/compaction.rs | 4 +- test_runner/regress/test_import_pgdata.py | 4 +- test_runner/regress/test_readonly_node.py | 2 +- 12 files changed, 92 insertions(+), 50 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 6dbfbec345..426222a531 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -1136,7 +1136,24 @@ pub struct TimelineInfo { pub ancestor_lsn: Option, pub last_record_lsn: Lsn, pub prev_record_lsn: Option, + + /// Legacy field for compat with control plane. Synonym of `min_readable_lsn`. + /// TODO: remove once control plane no longer reads it. pub latest_gc_cutoff_lsn: Lsn, + + /// The LSN up to which GC has advanced: older data may still exist but it is not available for clients. + /// This LSN is not suitable for deciding where to create branches etc: use [`TimelineInfo::min_readable_lsn`] instead, + /// as it is easier to reason about. + pub applied_gc_cutoff_lsn: Lsn, + + /// The upper bound of data which is either already GC'ed, or elegible to be GC'ed at any time based on PITR interval. + /// This LSN represents the "end of history" for this timeline, and callers should use it to figure out the oldest + /// LSN at which it is legal to create a branch or ephemeral endpoint. + /// + /// Note that holders of valid LSN leases may be able to create branches and read pages earlier + /// than this LSN, but new leases may not be taken out earlier than this LSN. + pub min_readable_lsn: Lsn, + pub disk_consistent_lsn: Lsn, /// The LSN that we have succesfully uploaded to remote storage diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 4b976e7f6f..b8ed7aaf26 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -1080,7 +1080,10 @@ components: type: integer state: type: string - latest_gc_cutoff_lsn: + min_readable_lsn: + type: string + format: hex + applied_gc_cutoff_lsn: type: string format: hex diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index bd196621c1..a0c639a16d 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -482,6 +482,11 @@ async fn build_timeline_info_common( let (pitr_history_size, within_ancestor_pitr) = timeline.get_pitr_history_stats(); + let min_readable_lsn = std::cmp::max( + timeline.get_gc_cutoff_lsn(), + *timeline.get_applied_gc_cutoff_lsn(), + ); + let info = TimelineInfo { tenant_id: timeline.tenant_shard_id, timeline_id: timeline.timeline_id, @@ -493,7 +498,12 @@ async fn build_timeline_info_common( initdb_lsn, last_record_lsn, prev_record_lsn: Some(timeline.get_prev_record_lsn()), - latest_gc_cutoff_lsn: *timeline.get_latest_gc_cutoff_lsn(), + // Externally, expose the lowest LSN that can be used to create a branch as the "GC cutoff", although internally + // we distinguish between the "planned" GC cutoff (PITR point) and the "latest" GC cutoff (where we + // actually trimmed data to), which can pass each other when PITR is changed. 
+ latest_gc_cutoff_lsn: min_readable_lsn, + min_readable_lsn, + applied_gc_cutoff_lsn: *timeline.get_applied_gc_cutoff_lsn(), current_logical_size: current_logical_size.size_dont_care_about_accuracy(), current_logical_size_is_accurate: match current_logical_size.accuracy() { tenant::timeline::logical_size::Accuracy::Approximate => false, diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 972dad34d4..025519d0ec 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -914,7 +914,7 @@ impl PageServerHandler { &shard, req.hdr.request_lsn, req.hdr.not_modified_since, - &shard.get_latest_gc_cutoff_lsn(), + &shard.get_applied_gc_cutoff_lsn(), ctx, ) // TODO: if we actually need to wait for lsn here, it delays the entire batch which doesn't need to wait @@ -1810,7 +1810,7 @@ impl PageServerHandler { req: &PagestreamExistsRequest, ctx: &RequestContext, ) -> Result { - let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); + let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( timeline, req.hdr.request_lsn, @@ -1837,7 +1837,7 @@ impl PageServerHandler { req: &PagestreamNblocksRequest, ctx: &RequestContext, ) -> Result { - let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); + let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( timeline, req.hdr.request_lsn, @@ -1864,7 +1864,7 @@ impl PageServerHandler { req: &PagestreamDbSizeRequest, ctx: &RequestContext, ) -> Result { - let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); + let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( timeline, req.hdr.request_lsn, @@ -1954,7 +1954,7 @@ impl PageServerHandler { req: &PagestreamGetSlruSegmentRequest, ctx: &RequestContext, ) -> Result { - let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); + let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( timeline, req.hdr.request_lsn, @@ -2071,7 +2071,7 @@ impl PageServerHandler { //return Err(QueryError::NotFound("timeline is archived".into())) } - let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); + let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn(); if let Some(lsn) = lsn { // Backup was requested at a particular LSN. Wait for it to arrive. info!("waiting for {}", lsn); diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 00f332d797..f2dca8befa 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -611,7 +611,7 @@ impl Timeline { ) -> Result { pausable_failpoint!("find-lsn-for-timestamp-pausable"); - let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn(); + let gc_cutoff_lsn_guard = self.get_applied_gc_cutoff_lsn(); let gc_cutoff_planned = { let gc_info = self.gc_info.read().unwrap(); gc_info.min_cutoff() diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 4c65991e45..605bfac2b3 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -4695,24 +4695,24 @@ impl Tenant { // We check it against both the planned GC cutoff stored in 'gc_info', // and the 'latest_gc_cutoff' of the last GC that was performed. 
The // planned GC cutoff in 'gc_info' is normally larger than - // 'latest_gc_cutoff_lsn', but beware of corner cases like if you just + // 'applied_gc_cutoff_lsn', but beware of corner cases like if you just // changed the GC settings for the tenant to make the PITR window // larger, but some of the data was already removed by an earlier GC // iteration. // check against last actual 'latest_gc_cutoff' first - let latest_gc_cutoff_lsn = src_timeline.get_latest_gc_cutoff_lsn(); + let applied_gc_cutoff_lsn = src_timeline.get_applied_gc_cutoff_lsn(); { let gc_info = src_timeline.gc_info.read().unwrap(); let planned_cutoff = gc_info.min_cutoff(); if gc_info.lsn_covered_by_lease(start_lsn) { - tracing::info!("skipping comparison of {start_lsn} with gc cutoff {} and planned gc cutoff {planned_cutoff} due to lsn lease", *latest_gc_cutoff_lsn); + tracing::info!("skipping comparison of {start_lsn} with gc cutoff {} and planned gc cutoff {planned_cutoff} due to lsn lease", *applied_gc_cutoff_lsn); } else { src_timeline - .check_lsn_is_in_scope(start_lsn, &latest_gc_cutoff_lsn) + .check_lsn_is_in_scope(start_lsn, &applied_gc_cutoff_lsn) .context(format!( "invalid branch start lsn: less than latest GC cutoff {}", - *latest_gc_cutoff_lsn, + *applied_gc_cutoff_lsn, )) .map_err(CreateTimelineError::AncestorLsn)?; @@ -4751,7 +4751,7 @@ impl Tenant { dst_prev, Some(src_id), start_lsn, - *src_timeline.latest_gc_cutoff_lsn.read(), // FIXME: should we hold onto this guard longer? + *src_timeline.applied_gc_cutoff_lsn.read(), // FIXME: should we hold onto this guard longer? src_timeline.initdb_lsn, src_timeline.pg_version, ); @@ -6130,8 +6130,8 @@ mod tests { make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?; repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO)?; - let latest_gc_cutoff_lsn = tline.get_latest_gc_cutoff_lsn(); - assert!(*latest_gc_cutoff_lsn > Lsn(0x25)); + let applied_gc_cutoff_lsn = tline.get_applied_gc_cutoff_lsn(); + assert!(*applied_gc_cutoff_lsn > Lsn(0x25)); match tline.get(*TEST_KEY, Lsn(0x25)) { Ok(_) => panic!("request for page should have failed"), Err(err) => assert!(err.to_string().contains("not found at")), @@ -8427,7 +8427,7 @@ mod tests { .await?; { tline - .latest_gc_cutoff_lsn + .applied_gc_cutoff_lsn .lock_for_write() .store_and_unlock(Lsn(0x30)) .wait() @@ -8535,7 +8535,7 @@ mod tests { // increase GC horizon and compact again { tline - .latest_gc_cutoff_lsn + .applied_gc_cutoff_lsn .lock_for_write() .store_and_unlock(Lsn(0x40)) .wait() @@ -8703,8 +8703,8 @@ mod tests { // Force set disk consistent lsn so we can get the cutoff at `end_lsn`. info!( - "latest_gc_cutoff_lsn: {}", - *timeline.get_latest_gc_cutoff_lsn() + "applied_gc_cutoff_lsn: {}", + *timeline.get_applied_gc_cutoff_lsn() ); timeline.force_set_disk_consistent_lsn(end_lsn); @@ -8730,7 +8730,7 @@ mod tests { // Make lease on a already GC-ed LSN. 
// 0/80 does not have a valid lease + is below latest_gc_cutoff - assert!(Lsn(0x80) < *timeline.get_latest_gc_cutoff_lsn()); + assert!(Lsn(0x80) < *timeline.get_applied_gc_cutoff_lsn()); timeline .init_lsn_lease(Lsn(0x80), timeline.get_lsn_lease_length(), &ctx) .expect_err("lease request on GC-ed LSN should fail"); @@ -8921,7 +8921,7 @@ mod tests { }; { tline - .latest_gc_cutoff_lsn + .applied_gc_cutoff_lsn .lock_for_write() .store_and_unlock(Lsn(0x30)) .wait() @@ -9008,7 +9008,7 @@ mod tests { // increase GC horizon and compact again { tline - .latest_gc_cutoff_lsn + .applied_gc_cutoff_lsn .lock_for_write() .store_and_unlock(Lsn(0x40)) .wait() @@ -9461,7 +9461,7 @@ mod tests { .await?; { tline - .latest_gc_cutoff_lsn + .applied_gc_cutoff_lsn .lock_for_write() .store_and_unlock(Lsn(0x30)) .wait() @@ -9608,7 +9608,7 @@ mod tests { // increase GC horizon and compact again { tline - .latest_gc_cutoff_lsn + .applied_gc_cutoff_lsn .lock_for_write() .store_and_unlock(Lsn(0x38)) .wait() @@ -9709,7 +9709,7 @@ mod tests { .await?; { tline - .latest_gc_cutoff_lsn + .applied_gc_cutoff_lsn .lock_for_write() .store_and_unlock(Lsn(0x30)) .wait() @@ -9960,7 +9960,7 @@ mod tests { { parent_tline - .latest_gc_cutoff_lsn + .applied_gc_cutoff_lsn .lock_for_write() .store_and_unlock(Lsn(0x10)) .wait() @@ -9980,7 +9980,7 @@ mod tests { { branch_tline - .latest_gc_cutoff_lsn + .applied_gc_cutoff_lsn .lock_for_write() .store_and_unlock(Lsn(0x50)) .wait() @@ -10336,7 +10336,7 @@ mod tests { { tline - .latest_gc_cutoff_lsn + .applied_gc_cutoff_lsn .lock_for_write() .store_and_unlock(Lsn(0x30)) .wait() @@ -10721,7 +10721,7 @@ mod tests { .await?; { tline - .latest_gc_cutoff_lsn + .applied_gc_cutoff_lsn .lock_for_write() .store_and_unlock(Lsn(0x30)) .wait() @@ -10972,7 +10972,7 @@ mod tests { .await?; { tline - .latest_gc_cutoff_lsn + .applied_gc_cutoff_lsn .lock_for_write() .store_and_unlock(Lsn(0x30)) .wait() diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index d281eb305f..15c6955260 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -130,7 +130,10 @@ struct TimelineMetadataBodyV2 { prev_record_lsn: Option, ancestor_timeline: Option, ancestor_lsn: Lsn, + + // The LSN at which GC was last executed. Synonym of [`Timeline::applied_gc_cutoff_lsn`]. 
latest_gc_cutoff_lsn: Lsn, + initdb_lsn: Lsn, pg_version: u32, } diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index 6c3276ea3c..1e84a9d9dc 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -394,7 +394,7 @@ pub(super) async fn gather_inputs( ancestor_lsn, last_record: last_record_lsn, // this is not used above, because it might not have updated recently enough - latest_gc_cutoff: *timeline.get_latest_gc_cutoff_lsn(), + latest_gc_cutoff: *timeline.get_applied_gc_cutoff_lsn(), next_pitr_cutoff, retention_param_cutoff, lease_points, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index aa71ccbbab..b211af4eff 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -352,8 +352,11 @@ pub struct Timeline { /// to be notified when layer flushing has finished, subscribe to the layer_flush_done channel layer_flush_done_tx: tokio::sync::watch::Sender<(u64, Result<(), FlushLayerError>)>, - // Needed to ensure that we can't create a branch at a point that was already garbage collected - pub latest_gc_cutoff_lsn: Rcu, + // The LSN at which we have executed GC: whereas [`Self::gc_info`] records the LSN at which + // we _intend_ to GC (i.e. the PITR cutoff), this LSN records where we actually last did it. + // Because PITR interval is mutable, it's possible for this LSN to be earlier or later than + // the planned GC cutoff. + pub applied_gc_cutoff_lsn: Rcu, pub(crate) gc_compaction_layer_update_lock: tokio::sync::RwLock<()>, @@ -1077,9 +1080,15 @@ impl Timeline { (history, gc_info.within_ancestor_pitr) } - /// Lock and get timeline's GC cutoff - pub(crate) fn get_latest_gc_cutoff_lsn(&self) -> RcuReadGuard { - self.latest_gc_cutoff_lsn.read() + /// Read timeline's GC cutoff: this is the LSN at which GC has started to happen + pub(crate) fn get_applied_gc_cutoff_lsn(&self) -> RcuReadGuard { + self.applied_gc_cutoff_lsn.read() + } + + /// Read timeline's planned GC cutoff: this is the logical end of history that users + /// are allowed to read (based on configured PITR), even if physically we have more history. + pub(crate) fn get_gc_cutoff_lsn(&self) -> Lsn { + self.gc_info.read().unwrap().cutoffs.time } /// Look up given page version. @@ -1587,7 +1596,7 @@ impl Timeline { }; if init || validate { - let latest_gc_cutoff_lsn = self.get_latest_gc_cutoff_lsn(); + let latest_gc_cutoff_lsn = self.get_applied_gc_cutoff_lsn(); if lsn < *latest_gc_cutoff_lsn { bail!("tried to request an lsn lease for an lsn below the latest gc cutoff. requested at {} gc cutoff {}", lsn, *latest_gc_cutoff_lsn); } @@ -2659,7 +2668,7 @@ impl Timeline { LastImageLayerCreationStatus::default(), )), - latest_gc_cutoff_lsn: Rcu::new(metadata.latest_gc_cutoff_lsn()), + applied_gc_cutoff_lsn: Rcu::new(metadata.latest_gc_cutoff_lsn()), initdb_lsn: metadata.initdb_lsn(), current_logical_size: if disk_consistent_lsn.is_valid() { @@ -3662,7 +3671,7 @@ impl Timeline { // the timeline, then it will remove layers that are required for fulfilling // the current get request (read-path cannot "look back" and notice the new // image layer). - let _gc_cutoff_holder = timeline.get_latest_gc_cutoff_lsn(); + let _gc_cutoff_holder = timeline.get_applied_gc_cutoff_lsn(); // See `compaction::compact_with_gc` for why we need this. 
let _guard = timeline.gc_compaction_layer_update_lock.read().await; @@ -4349,7 +4358,7 @@ impl Timeline { let update = crate::tenant::metadata::MetadataUpdate::new( disk_consistent_lsn, ondisk_prev_record_lsn, - *self.latest_gc_cutoff_lsn.read(), + *self.applied_gc_cutoff_lsn.read(), ); fail_point!("checkpoint-before-saving-metadata", |x| bail!( @@ -5577,7 +5586,7 @@ impl Timeline { // PITR interval is set & we didn't look up a timestamp successfully. Conservatively assume PITR // cannot advance beyond what was already GC'd, and respect space-based retention GcCutoffs { - time: *self.get_latest_gc_cutoff_lsn(), + time: *self.get_applied_gc_cutoff_lsn(), space: space_cutoff, } } @@ -5698,7 +5707,7 @@ impl Timeline { let mut result: GcResult = GcResult::default(); // Nothing to GC. Return early. - let latest_gc_cutoff = *self.get_latest_gc_cutoff_lsn(); + let latest_gc_cutoff = *self.get_applied_gc_cutoff_lsn(); if latest_gc_cutoff >= new_gc_cutoff { info!( "Nothing to GC: new_gc_cutoff_lsn {new_gc_cutoff}, latest_gc_cutoff_lsn {latest_gc_cutoff}", @@ -5712,7 +5721,7 @@ impl Timeline { // // The GC cutoff should only ever move forwards. let waitlist = { - let write_guard = self.latest_gc_cutoff_lsn.lock_for_write(); + let write_guard = self.applied_gc_cutoff_lsn.lock_for_write(); if *write_guard > new_gc_cutoff { return Err(GcError::BadLsn { why: format!( diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 5b915c50d3..6931f360a4 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -852,7 +852,7 @@ impl Timeline { // // Holding this read guard also blocks [`Self::gc_timeline`] from entering while we // are rewriting layers. - let latest_gc_cutoff = self.get_latest_gc_cutoff_lsn(); + let latest_gc_cutoff = self.get_applied_gc_cutoff_lsn(); tracing::info!( "latest_gc_cutoff: {}, pitr cutoff {}", @@ -2202,7 +2202,7 @@ impl Timeline { // TODO: ensure the child branches will not use anything below the watermark, or consider // them when computing the watermark. - gc_cutoff_lsn.min(*self.get_latest_gc_cutoff_lsn()) + gc_cutoff_lsn.min(*self.get_applied_gc_cutoff_lsn()) } /// Split a gc-compaction job into multiple compaction jobs. The split is based on the key range and the estimated size of the compaction job. 
diff --git a/test_runner/regress/test_import_pgdata.py b/test_runner/regress/test_import_pgdata.py index 6b35f3c6bb..ea86eb62eb 100644 --- a/test_runner/regress/test_import_pgdata.py +++ b/test_runner/regress/test_import_pgdata.py @@ -231,14 +231,14 @@ def test_pgdata_import_smoke( shard_zero_http = shard_zero_ps.http_client() shard_zero_timeline_info = shard_zero_http.timeline_detail(shard_zero["shard_id"], timeline_id) initdb_lsn = Lsn(shard_zero_timeline_info["initdb_lsn"]) - latest_gc_cutoff_lsn = Lsn(shard_zero_timeline_info["latest_gc_cutoff_lsn"]) + min_readable_lsn = Lsn(shard_zero_timeline_info["min_readable_lsn"]) last_record_lsn = Lsn(shard_zero_timeline_info["last_record_lsn"]) disk_consistent_lsn = Lsn(shard_zero_timeline_info["disk_consistent_lsn"]) _remote_consistent_lsn = Lsn(shard_zero_timeline_info["remote_consistent_lsn"]) remote_consistent_lsn_visible = Lsn(shard_zero_timeline_info["remote_consistent_lsn_visible"]) # assert remote_consistent_lsn_visible == remote_consistent_lsn TODO: this fails initially and after restart, presumably because `UploadQueue::clean.1` is still `None` assert remote_consistent_lsn_visible == disk_consistent_lsn - assert initdb_lsn == latest_gc_cutoff_lsn + assert initdb_lsn == min_readable_lsn assert disk_consistent_lsn == initdb_lsn + 8 assert last_record_lsn == disk_consistent_lsn # TODO: assert these values are the same everywhere diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py index c13bea7ee1..fe970a868c 100644 --- a/test_runner/regress/test_readonly_node.py +++ b/test_runner/regress/test_readonly_node.py @@ -287,7 +287,7 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder): offset=offset, ) - # Do some update so we can increment latest_gc_cutoff + # Do some update so we can increment gc_cutoff generate_updates_on_main(env, ep_main, i, end=100) # Wait for the existing lease to expire. From 536bdb32098c0332478ef0fb22ae92c9a4577785 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 13 Feb 2025 12:06:30 +0100 Subject: [PATCH 458/583] storcon: track safekeepers in memory, send heartbeats to them (#10583) In #9011, we want to schedule timelines to safekeepers. In order to do such scheduling, we need information about how utilized a safekeeper is and if it's available or not. Therefore, send constant heartbeats to the safekeepers and try to figure out if they are online or not. Includes some code from #10440. 
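The availability rule the new heartbeater applies to safekeepers can be summarised as follows (a simplified sketch of the behaviour in `storage_controller/src/heartbeater.rs`, not the actual code: `next_state` is a made-up helper, and the utilization payload, mpsc plumbing and cancellation handling are elided):

```rust
use std::time::{Duration, Instant};

#[derive(Clone, Debug, PartialEq)]
enum SafekeeperState {
    Available { last_seen_at: Instant },
    Offline,
}

/// Decide what a heartbeat round does to a safekeeper's recorded state: a
/// successful heartbeat always refreshes `last_seen_at`, while a failed one
/// only flips the node to Offline once it has been unreachable for longer
/// than `max_offline_interval`, so a single missed heartbeat is not enough.
fn next_state(
    prev: &SafekeeperState,
    heartbeat_ok: bool,
    now: Instant,
    max_offline_interval: Duration,
) -> SafekeeperState {
    match (prev, heartbeat_ok) {
        (_, true) => SafekeeperState::Available { last_seen_at: now },
        (SafekeeperState::Available { last_seen_at }, false)
            if now.duration_since(*last_seen_at) < max_offline_interval =>
        {
            prev.clone()
        }
        (_, false) => SafekeeperState::Offline,
    }
}

fn main() {
    let t0 = Instant::now();
    let up = SafekeeperState::Available { last_seen_at: t0 };
    let grace = Duration::from_secs(30);
    // One failed heartbeat shortly after a successful one: still available.
    assert_eq!(next_state(&up, false, t0 + Duration::from_secs(5), grace), up);
    // Unreachable for longer than the grace period: reported as offline.
    assert_eq!(
        next_state(&up, false, t0 + Duration::from_secs(60), grace),
        SafekeeperState::Offline
    );
}
```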
--- Cargo.lock | 2 + safekeeper/client/src/mgmt_api.rs | 10 +- storage_controller/Cargo.toml | 2 + storage_controller/src/heartbeater.rs | 189 +++++++++++++++++--- storage_controller/src/lib.rs | 2 + storage_controller/src/metrics.rs | 12 ++ storage_controller/src/persistence.rs | 32 ++-- storage_controller/src/safekeeper.rs | 139 ++++++++++++++ storage_controller/src/safekeeper_client.rs | 105 +++++++++++ storage_controller/src/service.rs | 185 ++++++++++++++++--- 10 files changed, 613 insertions(+), 65 deletions(-) create mode 100644 storage_controller/src/safekeeper.rs create mode 100644 storage_controller/src/safekeeper_client.rs diff --git a/Cargo.lock b/Cargo.lock index 407c8170bb..b3a88d46ac 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6464,6 +6464,8 @@ dependencies = [ "routerify", "rustls 0.23.18", "rustls-native-certs 0.8.0", + "safekeeper_api", + "safekeeper_client", "scoped-futures", "scopeguard", "serde", diff --git a/safekeeper/client/src/mgmt_api.rs b/safekeeper/client/src/mgmt_api.rs index df049f3eba..d4f47fc96d 100644 --- a/safekeeper/client/src/mgmt_api.rs +++ b/safekeeper/client/src/mgmt_api.rs @@ -5,7 +5,7 @@ use http_utils::error::HttpErrorBody; use reqwest::{IntoUrl, Method, StatusCode}; -use safekeeper_api::models::{TimelineCreateRequest, TimelineStatus}; +use safekeeper_api::models::{SafekeeperUtilization, TimelineCreateRequest, TimelineStatus}; use std::error::Error as _; use utils::{ id::{NodeId, TenantId, TimelineId}, @@ -32,6 +32,9 @@ pub enum Error { /// Status is not ok; parsed error in body as `HttpErrorBody`. #[error("safekeeper API: {1}")] ApiError(StatusCode, String), + + #[error("Cancelled")] + Cancelled, } pub type Result = std::result::Result; @@ -124,9 +127,10 @@ impl Client { self.get(&uri).await } - pub async fn utilization(&self) -> Result { + pub async fn utilization(&self) -> Result { let uri = format!("{}/v1/utilization/", self.mgmt_api_endpoint); - self.get(&uri).await + let resp = self.get(&uri).await?; + resp.json().await.map_err(Error::ReceiveBody) } async fn post( diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index 91d8098cb9..69276bfde4 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -32,6 +32,8 @@ postgres_connection.workspace = true rand.workspace = true reqwest = { workspace = true, features = ["stream"] } routerify.workspace = true +safekeeper_api.workspace = true +safekeeper_client.workspace = true rustls-native-certs.workspace = true serde.workspace = true serde_json.workspace = true diff --git a/storage_controller/src/heartbeater.rs b/storage_controller/src/heartbeater.rs index b7e66d33eb..6f110d3294 100644 --- a/storage_controller/src/heartbeater.rs +++ b/storage_controller/src/heartbeater.rs @@ -1,6 +1,10 @@ use futures::{stream::FuturesUnordered, StreamExt}; +use safekeeper_api::models::SafekeeperUtilization; +use safekeeper_client::mgmt_api; use std::{ collections::HashMap, + fmt::Debug, + future::Future, sync::Arc, time::{Duration, Instant}, }; @@ -9,15 +13,15 @@ use tokio_util::sync::CancellationToken; use pageserver_api::{controller_api::NodeAvailability, models::PageserverUtilization}; use thiserror::Error; -use utils::id::NodeId; +use utils::{id::NodeId, logging::SecretString}; -use crate::node::Node; +use crate::{node::Node, safekeeper::Safekeeper}; -struct HeartbeaterTask { - receiver: tokio::sync::mpsc::UnboundedReceiver, +struct HeartbeaterTask { + receiver: tokio::sync::mpsc::UnboundedReceiver>, cancel: CancellationToken, - state: HashMap, + state: HashMap, 
max_offline_interval: Duration, max_warming_up_interval: Duration, @@ -36,8 +40,17 @@ pub(crate) enum PageserverState { Offline, } +#[derive(Debug, Clone)] +pub(crate) enum SafekeeperState { + Available { + last_seen_at: Instant, + utilization: SafekeeperUtilization, + }, + Offline, +} + #[derive(Debug)] -pub(crate) struct AvailablityDeltas(pub Vec<(NodeId, PageserverState)>); +pub(crate) struct AvailablityDeltas(pub Vec<(NodeId, State)>); #[derive(Debug, Error)] pub(crate) enum HeartbeaterError { @@ -45,23 +58,28 @@ pub(crate) enum HeartbeaterError { Cancel, } -struct HeartbeatRequest { - pageservers: Arc>, - reply: tokio::sync::oneshot::Sender>, +struct HeartbeatRequest { + servers: Arc>, + reply: tokio::sync::oneshot::Sender, HeartbeaterError>>, } -pub(crate) struct Heartbeater { - sender: tokio::sync::mpsc::UnboundedSender, +pub(crate) struct Heartbeater { + sender: tokio::sync::mpsc::UnboundedSender>, } -impl Heartbeater { +#[allow(private_bounds)] +impl Heartbeater +where + HeartbeaterTask: HeartBeat, +{ pub(crate) fn new( jwt_token: Option, max_offline_interval: Duration, max_warming_up_interval: Duration, cancel: CancellationToken, ) -> Self { - let (sender, receiver) = tokio::sync::mpsc::unbounded_channel::(); + let (sender, receiver) = + tokio::sync::mpsc::unbounded_channel::>(); let mut heartbeater = HeartbeaterTask::new( receiver, jwt_token, @@ -76,12 +94,12 @@ impl Heartbeater { pub(crate) async fn heartbeat( &self, - pageservers: Arc>, - ) -> Result { + servers: Arc>, + ) -> Result, HeartbeaterError> { let (sender, receiver) = tokio::sync::oneshot::channel(); self.sender .send(HeartbeatRequest { - pageservers, + servers, reply: sender, }) .map_err(|_| HeartbeaterError::Cancel)?; @@ -93,9 +111,12 @@ impl Heartbeater { } } -impl HeartbeaterTask { +impl HeartbeaterTask +where + HeartbeaterTask: HeartBeat, +{ fn new( - receiver: tokio::sync::mpsc::UnboundedReceiver, + receiver: tokio::sync::mpsc::UnboundedReceiver>, jwt_token: Option, max_offline_interval: Duration, max_warming_up_interval: Duration, @@ -110,14 +131,13 @@ impl HeartbeaterTask { jwt_token, } } - async fn run(&mut self) { loop { tokio::select! 
{ request = self.receiver.recv() => { match request { Some(req) => { - let res = self.heartbeat(req.pageservers).await; + let res = self.heartbeat(req.servers).await; req.reply.send(res).unwrap(); }, None => { return; } @@ -127,11 +147,20 @@ impl HeartbeaterTask { } } } +} +pub(crate) trait HeartBeat { + fn heartbeat( + &mut self, + pageservers: Arc>, + ) -> impl Future, HeartbeaterError>> + Send; +} + +impl HeartBeat for HeartbeaterTask { async fn heartbeat( &mut self, pageservers: Arc>, - ) -> Result { + ) -> Result, HeartbeaterError> { let mut new_state = HashMap::new(); let mut heartbeat_futs = FuturesUnordered::new(); @@ -272,3 +301,121 @@ impl HeartbeaterTask { Ok(AvailablityDeltas(deltas)) } } + +impl HeartBeat for HeartbeaterTask { + async fn heartbeat( + &mut self, + safekeepers: Arc>, + ) -> Result, HeartbeaterError> { + let mut new_state = HashMap::new(); + + let mut heartbeat_futs = FuturesUnordered::new(); + for (node_id, sk) in &*safekeepers { + heartbeat_futs.push({ + let jwt_token = self + .jwt_token + .as_ref() + .map(|t| SecretString::from(t.to_owned())); + let cancel = self.cancel.clone(); + + async move { + let response = sk + .with_client_retries( + |client| async move { client.get_utilization().await }, + &jwt_token, + 3, + 3, + Duration::from_secs(1), + &cancel, + ) + .await; + + let status = match response { + Ok(utilization) => SafekeeperState::Available { + last_seen_at: Instant::now(), + utilization, + }, + Err(mgmt_api::Error::Cancelled) => { + // This indicates cancellation of the request. + // We ignore the node in this case. + return None; + } + Err(_) => SafekeeperState::Offline, + }; + + Some((*node_id, status)) + } + }); + + loop { + let maybe_status = tokio::select! { + next = heartbeat_futs.next() => { + match next { + Some(result) => result, + None => { break; } + } + }, + _ = self.cancel.cancelled() => { return Err(HeartbeaterError::Cancel); } + }; + + if let Some((node_id, status)) = maybe_status { + new_state.insert(node_id, status); + } + } + } + + let mut offline = 0; + for state in new_state.values() { + match state { + SafekeeperState::Offline { .. } => offline += 1, + SafekeeperState::Available { .. } => {} + } + } + + tracing::info!( + "Heartbeat round complete for {} safekeepers, {} offline", + new_state.len(), + offline + ); + + let mut deltas = Vec::new(); + let now = Instant::now(); + for (node_id, sk_state) in new_state.iter_mut() { + use std::collections::hash_map::Entry::*; + let entry = self.state.entry(*node_id); + + let mut needs_update = false; + match entry { + Occupied(ref occ) => match (occ.get(), &sk_state) { + (SafekeeperState::Offline, SafekeeperState::Offline) => {} + (SafekeeperState::Available { last_seen_at, .. }, SafekeeperState::Offline) => { + if now - *last_seen_at >= self.max_offline_interval { + deltas.push((*node_id, sk_state.clone())); + needs_update = true; + } + } + _ => { + deltas.push((*node_id, sk_state.clone())); + needs_update = true; + } + }, + Vacant(_) => { + // This is a new node. Don't generate a delta for it. 
+ deltas.push((*node_id, sk_state.clone())); + } + } + + match entry { + Occupied(mut occ) if needs_update => { + (*occ.get_mut()) = sk_state.clone(); + } + Vacant(vac) => { + vac.insert(sk_state.clone()); + } + _ => {} + } + } + + Ok(AvailablityDeltas(deltas)) + } +} diff --git a/storage_controller/src/lib.rs b/storage_controller/src/lib.rs index f5823935e1..5f2c081927 100644 --- a/storage_controller/src/lib.rs +++ b/storage_controller/src/lib.rs @@ -17,6 +17,8 @@ mod pageserver_client; mod peer_client; pub mod persistence; mod reconciler; +mod safekeeper; +mod safekeeper_client; mod scheduler; mod schema; pub mod service; diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs index 4164e3dc2b..6d67e0d130 100644 --- a/storage_controller/src/metrics.rs +++ b/storage_controller/src/metrics.rs @@ -80,6 +80,11 @@ pub(crate) struct StorageControllerMetricGroup { pub(crate) storage_controller_pageserver_request_error: measured::CounterVec, + /// Count of HTTP requests to the safekeeper that resulted in an error, + /// broken down by the safekeeper node id, request name and method + pub(crate) storage_controller_safekeeper_request_error: + measured::CounterVec, + /// Latency of HTTP requests to the pageserver, broken down by pageserver /// node id, request name and method. This include both successful and unsuccessful /// requests. @@ -87,6 +92,13 @@ pub(crate) struct StorageControllerMetricGroup { pub(crate) storage_controller_pageserver_request_latency: measured::HistogramVec, + /// Latency of HTTP requests to the safekeeper, broken down by safekeeper + /// node id, request name and method. This include both successful and unsuccessful + /// requests. + #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))] + pub(crate) storage_controller_safekeeper_request_latency: + measured::HistogramVec, + /// Count of pass-through HTTP requests to the pageserver that resulted in an error, /// broken down by the pageserver node id, request name and method pub(crate) storage_controller_passthrough_request_error: diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index c4e5b39589..67b60eadf3 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -1185,23 +1185,6 @@ impl Persistence { Ok(safekeepers) } - pub(crate) async fn safekeeper_get( - &self, - id: i64, - ) -> Result { - use crate::schema::safekeepers::dsl::{id as id_column, safekeepers}; - self.with_conn(move |conn| { - Box::pin(async move { - Ok(safekeepers - .filter(id_column.eq(&id)) - .select(SafekeeperPersistence::as_select()) - .get_result(conn) - .await?) 
- }) - }) - .await - } - pub(crate) async fn safekeeper_upsert( &self, record: SafekeeperUpsert, @@ -1554,6 +1537,21 @@ pub(crate) struct SafekeeperPersistence { } impl SafekeeperPersistence { + pub(crate) fn from_upsert( + upsert: SafekeeperUpsert, + scheduling_policy: SkSchedulingPolicy, + ) -> Self { + crate::persistence::SafekeeperPersistence { + id: upsert.id, + region_id: upsert.region_id, + version: upsert.version, + host: upsert.host, + port: upsert.port, + http_port: upsert.http_port, + availability_zone_id: upsert.availability_zone_id, + scheduling_policy: String::from(scheduling_policy), + } + } pub(crate) fn as_describe_response(&self) -> Result { let scheduling_policy = SkSchedulingPolicy::from_str(&self.scheduling_policy).map_err(|e| { diff --git a/storage_controller/src/safekeeper.rs b/storage_controller/src/safekeeper.rs new file mode 100644 index 0000000000..be073d0cb9 --- /dev/null +++ b/storage_controller/src/safekeeper.rs @@ -0,0 +1,139 @@ +use std::{str::FromStr, time::Duration}; + +use pageserver_api::controller_api::{SafekeeperDescribeResponse, SkSchedulingPolicy}; +use reqwest::StatusCode; +use safekeeper_client::mgmt_api; +use tokio_util::sync::CancellationToken; +use utils::{backoff, id::NodeId, logging::SecretString}; + +use crate::{ + heartbeater::SafekeeperState, + persistence::{DatabaseError, SafekeeperPersistence}, + safekeeper_client::SafekeeperClient, +}; + +#[derive(Clone)] +pub struct Safekeeper { + pub(crate) skp: SafekeeperPersistence, + cancel: CancellationToken, + listen_http_addr: String, + listen_http_port: u16, + id: NodeId, + availability: SafekeeperState, +} + +impl Safekeeper { + pub(crate) fn from_persistence(skp: SafekeeperPersistence, cancel: CancellationToken) -> Self { + Self { + cancel, + listen_http_addr: skp.host.clone(), + listen_http_port: skp.http_port as u16, + id: NodeId(skp.id as u64), + skp, + availability: SafekeeperState::Offline, + } + } + pub(crate) fn base_url(&self) -> String { + format!("http://{}:{}", self.listen_http_addr, self.listen_http_port) + } + + pub(crate) fn get_id(&self) -> NodeId { + self.id + } + pub(crate) fn describe_response(&self) -> Result { + self.skp.as_describe_response() + } + pub(crate) fn set_availability(&mut self, availability: SafekeeperState) { + self.availability = availability; + } + /// Perform an operation (which is given a [`SafekeeperClient`]) with retries + pub(crate) async fn with_client_retries( + &self, + mut op: O, + jwt: &Option, + warn_threshold: u32, + max_retries: u32, + timeout: Duration, + cancel: &CancellationToken, + ) -> mgmt_api::Result + where + O: FnMut(SafekeeperClient) -> F, + F: std::future::Future>, + { + fn is_fatal(e: &mgmt_api::Error) -> bool { + use mgmt_api::Error::*; + match e { + ReceiveBody(_) | ReceiveErrorBody(_) => false, + ApiError(StatusCode::SERVICE_UNAVAILABLE, _) + | ApiError(StatusCode::GATEWAY_TIMEOUT, _) + | ApiError(StatusCode::REQUEST_TIMEOUT, _) => false, + ApiError(_, _) => true, + Cancelled => true, + } + } + + backoff::retry( + || { + let http_client = reqwest::ClientBuilder::new() + .timeout(timeout) + .build() + .expect("Failed to construct HTTP client"); + + let client = SafekeeperClient::from_client( + self.get_id(), + http_client, + self.base_url(), + jwt.clone(), + ); + + let node_cancel_fut = self.cancel.cancelled(); + + let op_fut = op(client); + + async { + tokio::select! 
{ + r = op_fut=> {r}, + _ = node_cancel_fut => { + Err(mgmt_api::Error::Cancelled) + }} + } + }, + is_fatal, + warn_threshold, + max_retries, + &format!( + "Call to node {} ({}:{}) management API", + self.id, self.listen_http_addr, self.listen_http_port + ), + cancel, + ) + .await + .unwrap_or(Err(mgmt_api::Error::Cancelled)) + } + + pub(crate) fn update_from_record(&mut self, record: crate::persistence::SafekeeperUpsert) { + let crate::persistence::SafekeeperUpsert { + active: _, + availability_zone_id: _, + host, + http_port, + id, + port: _, + region_id: _, + version: _, + } = record.clone(); + if id != self.id.0 as i64 { + // The way the function is called ensures this. If we regress on that, it's a bug. + panic!( + "id can't be changed via update_from_record function: {id} != {}", + self.id.0 + ); + } + self.skp = crate::persistence::SafekeeperPersistence::from_upsert( + record, + SkSchedulingPolicy::from_str(&self.skp.scheduling_policy).unwrap(), + ); + self.listen_http_port = http_port as u16; + self.listen_http_addr = host; + } +} diff --git a/storage_controller/src/safekeeper_client.rs b/storage_controller/src/safekeeper_client.rs new file mode 100644 index 0000000000..bb494f20fa --- /dev/null +++ b/storage_controller/src/safekeeper_client.rs @@ -0,0 +1,105 @@ +use crate::metrics::PageserverRequestLabelGroup; +use safekeeper_api::models::{SafekeeperUtilization, TimelineCreateRequest, TimelineStatus}; +use safekeeper_client::mgmt_api::{Client, Result}; +use utils::{ + id::{NodeId, TenantId, TimelineId}, + logging::SecretString, +}; + +/// Thin wrapper around [`safekeeper_client::mgmt_api::Client`]. It allows the storage +/// controller to collect metrics in a non-intrusive manner. +/// +/// Analogous to [`crate::pageserver_client::PageserverClient`]. +#[derive(Debug, Clone)] +pub(crate) struct SafekeeperClient { + inner: Client, + node_id_label: String, +} + +macro_rules! 
measured_request { + ($name:literal, $method:expr, $node_id: expr, $invoke:expr) => {{ + let labels = PageserverRequestLabelGroup { + pageserver_id: $node_id, + path: $name, + method: $method, + }; + + let latency = &crate::metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_safekeeper_request_latency; + let _timer_guard = latency.start_timer(labels.clone()); + + let res = $invoke; + + if res.is_err() { + let error_counters = &crate::metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_pageserver_request_error; + error_counters.inc(labels) + } + + res + }}; +} + +impl SafekeeperClient { + #[allow(dead_code)] + pub(crate) fn new( + node_id: NodeId, + mgmt_api_endpoint: String, + jwt: Option, + ) -> Self { + Self { + inner: Client::from_client(reqwest::Client::new(), mgmt_api_endpoint, jwt), + node_id_label: node_id.0.to_string(), + } + } + + pub(crate) fn from_client( + node_id: NodeId, + raw_client: reqwest::Client, + mgmt_api_endpoint: String, + jwt: Option, + ) -> Self { + Self { + inner: Client::from_client(raw_client, mgmt_api_endpoint, jwt), + node_id_label: node_id.0.to_string(), + } + } + + #[allow(dead_code)] + pub(crate) async fn create_timeline( + &self, + req: &TimelineCreateRequest, + ) -> Result { + measured_request!( + "create_timeline", + crate::metrics::Method::Post, + &self.node_id_label, + self.inner.create_timeline(req).await + ) + } + + #[allow(dead_code)] + pub(crate) async fn delete_timeline( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> Result { + measured_request!( + "delete_timeline", + crate::metrics::Method::Delete, + &self.node_id_label, + self.inner.delete_timeline(tenant_id, timeline_id).await + ) + } + + pub(crate) async fn get_utilization(&self) -> Result { + measured_request!( + "utilization", + crate::metrics::Method::Get, + &self.node_id_label, + self.inner.utilization().await + ) + } +} diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 6829663a4c..b9db46fe4a 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -2,6 +2,7 @@ pub mod chaos_injector; mod context_iterator; use hyper::Uri; +use safekeeper_api::models::SafekeeperUtilization; use std::{ borrow::Cow, cmp::Ordering, @@ -20,6 +21,7 @@ use crate::{ }, compute_hook::{self, NotifyError}, drain_utils::{self, TenantShardDrain, TenantShardIterator}, + heartbeater::SafekeeperState, id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, TracingExclusiveGuard}, leadership::Leadership, metrics, @@ -29,6 +31,7 @@ use crate::{ ShardGenerationState, TenantFilter, }, reconciler::{ReconcileError, ReconcileUnits, ReconcilerConfig, ReconcilerConfigBuilder}, + safekeeper::Safekeeper, scheduler::{MaySchedule, ScheduleContext, ScheduleError, ScheduleMode}, tenant_shard::{ MigrateAttachment, ObservedStateDelta, ReconcileNeeded, ReconcilerStatus, @@ -206,6 +209,8 @@ struct ServiceState { nodes: Arc>, + safekeepers: Arc>, + scheduler: Scheduler, /// Ongoing background operation on the cluster if any is running. 
@@ -272,6 +277,7 @@ fn passthrough_api_error(node: &Node, e: mgmt_api::Error) -> ApiError { impl ServiceState { fn new( nodes: HashMap, + safekeepers: HashMap, tenants: BTreeMap, scheduler: Scheduler, delayed_reconcile_rx: tokio::sync::mpsc::Receiver, @@ -283,6 +289,7 @@ impl ServiceState { leadership_status: initial_leadership_status, tenants, nodes: Arc::new(nodes), + safekeepers: Arc::new(safekeepers), scheduler, ongoing_operation: None, delayed_reconcile_rx, @@ -299,6 +306,23 @@ impl ServiceState { (&mut self.nodes, &mut self.tenants, &mut self.scheduler) } + #[allow(clippy::type_complexity)] + fn parts_mut_sk( + &mut self, + ) -> ( + &mut Arc>, + &mut Arc>, + &mut BTreeMap, + &mut Scheduler, + ) { + ( + &mut self.nodes, + &mut self.safekeepers, + &mut self.tenants, + &mut self.scheduler, + ) + } + fn get_leadership_status(&self) -> LeadershipStatus { self.leadership_status } @@ -397,7 +421,8 @@ pub struct Service { compute_hook: Arc, result_tx: tokio::sync::mpsc::UnboundedSender, - heartbeater: Heartbeater, + heartbeater_ps: Heartbeater, + heartbeater_sk: Heartbeater, // Channel for background cleanup from failed operations that require cleanup, such as shard split abort_tx: tokio::sync::mpsc::UnboundedSender, @@ -607,7 +632,8 @@ impl Service { let locked = self.inner.read().unwrap(); locked.nodes.clone() }; - let mut nodes_online = self.initial_heartbeat_round(all_nodes.keys()).await; + let (mut nodes_online, mut sks_online) = + self.initial_heartbeat_round(all_nodes.keys()).await; // List of tenants for which we will attempt to notify compute of their location at startup let mut compute_notifications = Vec::new(); @@ -616,7 +642,7 @@ impl Service { tracing::info!("Populating tenant shards' states from initial pageserver scan..."); let shard_count = { let mut locked = self.inner.write().unwrap(); - let (nodes, tenants, scheduler) = locked.parts_mut(); + let (nodes, safekeepers, tenants, scheduler) = locked.parts_mut_sk(); // Mark nodes online if they responded to us: nodes are offline by default after a restart. let mut new_nodes = (**nodes).clone(); @@ -628,6 +654,17 @@ impl Service { } *nodes = Arc::new(new_nodes); + let mut new_sks = (**safekeepers).clone(); + for (node_id, node) in new_sks.iter_mut() { + if let Some((utilization, last_seen_at)) = sks_online.remove(node_id) { + node.set_availability(SafekeeperState::Available { + utilization, + last_seen_at, + }); + } + } + *safekeepers = Arc::new(new_sks); + for (tenant_shard_id, observed_state) in observed.0 { let Some(tenant_shard) = tenants.get_mut(&tenant_shard_id) else { for node_id in observed_state.locations.keys() { @@ -736,7 +773,10 @@ impl Service { async fn initial_heartbeat_round<'a>( &self, node_ids: impl Iterator, - ) -> HashMap { + ) -> ( + HashMap, + HashMap, + ) { assert!(!self.startup_complete.is_ready()); let all_nodes = { @@ -756,14 +796,20 @@ impl Service { } } + let all_sks = { + let locked = self.inner.read().unwrap(); + locked.safekeepers.clone() + }; + tracing::info!("Sending initial heartbeats..."); - let res = self - .heartbeater + let res_ps = self + .heartbeater_ps .heartbeat(Arc::new(nodes_to_heartbeat)) .await; + let res_sk = self.heartbeater_sk.heartbeat(all_sks).await; let mut online_nodes = HashMap::new(); - if let Ok(deltas) = res { + if let Ok(deltas) = res_ps { for (node_id, status) in deltas.0 { match status { PageserverState::Available { utilization, .. 
} => { @@ -777,7 +823,22 @@ impl Service { } } - online_nodes + let mut online_sks = HashMap::new(); + if let Ok(deltas) = res_sk { + for (node_id, status) in deltas.0 { + match status { + SafekeeperState::Available { + utilization, + last_seen_at, + } => { + online_sks.insert(node_id, (utilization, last_seen_at)); + } + SafekeeperState::Offline => {} + } + } + } + + (online_nodes, online_sks) } /// Used during [`Self::startup_reconcile`]: issue GETs to all nodes concurrently, with a deadline. @@ -984,8 +1045,14 @@ impl Service { locked.nodes.clone() }; - let res = self.heartbeater.heartbeat(nodes).await; - if let Ok(deltas) = res { + let safekeepers = { + let locked = self.inner.read().unwrap(); + locked.safekeepers.clone() + }; + + let res_ps = self.heartbeater_ps.heartbeat(nodes).await; + let res_sk = self.heartbeater_sk.heartbeat(safekeepers).await; + if let Ok(deltas) = res_ps { let mut to_handle = Vec::default(); for (node_id, state) in deltas.0 { @@ -1086,6 +1153,18 @@ impl Service { } } } + if let Ok(deltas) = res_sk { + let mut locked = self.inner.write().unwrap(); + let mut safekeepers = (*locked.safekeepers).clone(); + for (id, state) in deltas.0 { + let Some(sk) = safekeepers.get_mut(&id) else { + tracing::info!("Couldn't update safekeeper safekeeper state for id {id} from heartbeat={state:?}"); + continue; + }; + sk.set_availability(state); + } + locked.safekeepers = Arc::new(safekeepers); + } } } @@ -1311,6 +1390,17 @@ impl Service { .storage_controller_pageserver_nodes .set(nodes.len() as i64); + tracing::info!("Loading safekeepers from database..."); + let safekeepers = persistence + .list_safekeepers() + .await? + .into_iter() + .map(|skp| Safekeeper::from_persistence(skp, CancellationToken::new())) + .collect::>(); + let safekeepers: HashMap = + safekeepers.into_iter().map(|n| (n.get_id(), n)).collect(); + tracing::info!("Loaded {} safekeepers from database.", safekeepers.len()); + tracing::info!("Loading shards from database..."); let mut tenant_shard_persistence = persistence.load_active_tenant_shards().await?; tracing::info!( @@ -1437,7 +1527,14 @@ impl Service { let cancel = CancellationToken::new(); let reconcilers_cancel = cancel.child_token(); - let heartbeater = Heartbeater::new( + let heartbeater_ps = Heartbeater::new( + config.jwt_token.clone(), + config.max_offline_interval, + config.max_warming_up_interval, + cancel.clone(), + ); + + let heartbeater_sk = Heartbeater::new( config.jwt_token.clone(), config.max_offline_interval, config.max_warming_up_interval, @@ -1453,6 +1550,7 @@ impl Service { let this = Arc::new(Self { inner: Arc::new(std::sync::RwLock::new(ServiceState::new( nodes, + safekeepers, tenants, scheduler, delayed_reconcile_rx, @@ -1462,7 +1560,8 @@ impl Service { persistence, compute_hook: Arc::new(ComputeHook::new(config.clone())), result_tx, - heartbeater, + heartbeater_ps, + heartbeater_sk, reconciler_concurrency: Arc::new(tokio::sync::Semaphore::new( config.reconciler_concurrency, )), @@ -7661,29 +7760,54 @@ impl Service { pub(crate) async fn safekeepers_list( &self, ) -> Result, DatabaseError> { - self.persistence - .list_safekeepers() - .await? 
- .into_iter() - .map(|v| v.as_describe_response()) - .collect::, _>>() + let locked = self.inner.read().unwrap(); + let mut list = locked + .safekeepers + .iter() + .map(|sk| sk.1.describe_response()) + .collect::, _>>()?; + list.sort_by_key(|v| v.id); + Ok(list) } pub(crate) async fn get_safekeeper( &self, id: i64, ) -> Result { - self.persistence - .safekeeper_get(id) - .await - .and_then(|v| v.as_describe_response()) + let locked = self.inner.read().unwrap(); + let sk = locked + .safekeepers + .get(&NodeId(id as u64)) + .ok_or(diesel::result::Error::NotFound)?; + sk.describe_response() } pub(crate) async fn upsert_safekeeper( &self, record: crate::persistence::SafekeeperUpsert, ) -> Result<(), DatabaseError> { - self.persistence.safekeeper_upsert(record).await + let node_id = NodeId(record.id as u64); + self.persistence.safekeeper_upsert(record.clone()).await?; + { + let mut locked = self.inner.write().unwrap(); + let mut safekeepers = (*locked.safekeepers).clone(); + match safekeepers.entry(node_id) { + std::collections::hash_map::Entry::Occupied(mut entry) => { + entry.get_mut().update_from_record(record); + } + std::collections::hash_map::Entry::Vacant(entry) => { + entry.insert(Safekeeper::from_persistence( + crate::persistence::SafekeeperPersistence::from_upsert( + record, + SkSchedulingPolicy::Pause, + ), + CancellationToken::new(), + )); + } + } + locked.safekeepers = Arc::new(safekeepers); + } + Ok(()) } pub(crate) async fn set_safekeeper_scheduling_policy( @@ -7693,7 +7817,20 @@ impl Service { ) -> Result<(), DatabaseError> { self.persistence .set_safekeeper_scheduling_policy(id, scheduling_policy) - .await + .await?; + let node_id = NodeId(id as u64); + // After the change has been persisted successfully, update the in-memory state + { + let mut locked = self.inner.write().unwrap(); + let mut safekeepers = (*locked.safekeepers).clone(); + let sk = safekeepers + .get_mut(&node_id) + .ok_or(DatabaseError::Logical("Not found".to_string()))?; + sk.skp.scheduling_policy = String::from(scheduling_policy); + + locked.safekeepers = Arc::new(safekeepers); + } + Ok(()) } pub(crate) async fn update_shards_preferred_azs( From 8fea43a5ba573013bc821eef0446b31a61f43ae9 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 13 Feb 2025 12:48:47 +0000 Subject: [PATCH 459/583] pageserver: make heatmap generation additive (#10597) ## Problem Previously, when cutting over to cold secondary locations, we would clobber the previous, good, heatmap with a cold one. This is because heatmap generation used to include only resident layers. Once this merges, we can add an endpoint which triggers full heatmap hydration on attached locations to heal cold migrations. ## Summary of changes With this patch, heatmap generation becomes additive. If we have a heatmap from when this location was secondary, the new uploaded heatmap will be the result of a reconciliation between the old one and the on disk resident layers. More concretely, when we have the previous heatmap: 1. Filter the previous heatmap and keep layers that are (a) present in the current layer map, (b) visible, (c) not resident. Call this set of layers `visible_non_resident`. 2. From the layer map, select all layers that are resident and visible. Call this set of layers `resident`. 3. The new heatmap is the result of merging the two disjoint sets. 
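A minimal sketch of steps 1-3 (illustrative stand-in types only -- `HeatLayer`, `LayerState` and `reconcile_heatmap` are not the real pageserver types; the actual change is in `Timeline::generate_heatmap` in `pageserver/src/tenant/timeline.rs` below):

```rust
use std::collections::HashMap;
use std::time::Instant;

// Illustrative stand-ins, not the pageserver's real types.
#[derive(Clone, Hash, PartialEq, Eq)]
struct LayerName(String);

#[derive(Clone)]
struct HeatLayer {
    name: LayerName,
    access_time: Instant,
}

struct LayerState {
    visible: bool,
    resident: bool,
    last_evicted_at: Option<Instant>,
}

fn reconcile_heatmap(
    previous: &[HeatLayer],
    layer_map: &HashMap<LayerName, LayerState>,
    previous_read_at: Instant,
) -> Vec<HeatLayer> {
    // Step 1: keep previous-heatmap layers that are still in the layer map,
    // visible, not resident, and not evicted since the heatmap was read.
    let visible_non_resident: Vec<HeatLayer> = previous
        .iter()
        .filter(|hl| {
            layer_map.get(&hl.name).is_some_and(|state| {
                state.visible
                    && !state.resident
                    && state
                        .last_evicted_at
                        .map_or(true, |at| at <= previous_read_at)
            })
        })
        .cloned()
        .collect();

    // Step 2: all layers that are currently resident and visible. The access
    // time would come from the layer's latest activity; Instant::now() is a
    // placeholder here.
    let resident = layer_map
        .iter()
        .filter(|(_, state)| state.resident && state.visible)
        .map(|(name, _)| HeatLayer {
            name: name.clone(),
            access_time: Instant::now(),
        });

    // Step 3: the two sets are disjoint (non-resident vs. resident), so the
    // new heatmap is simply their union.
    visible_non_resident.into_iter().chain(resident).collect()
}
```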
Related https://github.com/neondatabase/neon/issues/10541 --- control_plane/src/storage_controller.rs | 5 +- control_plane/storcon_cli/src/main.rs | 15 +- libs/pageserver_api/src/controller_api.rs | 12 + pageserver/src/tenant.rs | 83 ++++++- pageserver/src/tenant/secondary/heatmap.rs | 19 +- pageserver/src/tenant/storage_layer/layer.rs | 31 ++- pageserver/src/tenant/timeline.rs | 235 +++++++++++++++++- pageserver/src/tenant/timeline/delete.rs | 1 + storage_controller/src/reconciler.rs | 18 +- storage_controller/src/service.rs | 7 +- test_runner/fixtures/neon_fixtures.py | 20 +- .../regress/test_pageserver_secondary.py | 98 +++++++- 12 files changed, 511 insertions(+), 33 deletions(-) diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index 9a2d30c861..0fadb9c5fe 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -838,7 +838,10 @@ impl StorageController { self.dispatch( Method::PUT, format!("control/v1/tenant/{tenant_shard_id}/migrate"), - Some(TenantShardMigrateRequest { node_id }), + Some(TenantShardMigrateRequest { + node_id, + migration_config: None, + }), ) .await } diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 985fe6b3b1..83faf6b4af 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -609,7 +609,10 @@ async fn main() -> anyhow::Result<()> { tenant_shard_id, node, } => { - let req = TenantShardMigrateRequest { node_id: node }; + let req = TenantShardMigrateRequest { + node_id: node, + migration_config: None, + }; storcon_client .dispatch::( @@ -623,7 +626,10 @@ async fn main() -> anyhow::Result<()> { tenant_shard_id, node, } => { - let req = TenantShardMigrateRequest { node_id: node }; + let req = TenantShardMigrateRequest { + node_id: node, + migration_config: None, + }; storcon_client .dispatch::( @@ -1082,7 +1088,10 @@ async fn main() -> anyhow::Result<()> { .dispatch::( Method::PUT, format!("control/v1/tenant/{}/migrate", mv.tenant_shard_id), - Some(TenantShardMigrateRequest { node_id: mv.to }), + Some(TenantShardMigrateRequest { + node_id: mv.to, + migration_config: None, + }), ) .await .map_err(|e| (mv.tenant_shard_id, mv.from, mv.to, e)) diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index 78e080981a..42f6e47e63 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -182,6 +182,18 @@ pub struct TenantDescribeResponseShard { #[derive(Serialize, Deserialize, Debug)] pub struct TenantShardMigrateRequest { pub node_id: NodeId, + #[serde(default)] + pub migration_config: Option, +} + +#[derive(Serialize, Deserialize, Debug)] +pub struct MigrationConfig { + #[serde(default)] + #[serde(with = "humantime_serde")] + pub secondary_warmup_timeout: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub secondary_download_request_timeout: Option, } #[derive(Serialize, Clone, Debug)] diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 605bfac2b3..dec585ff65 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -40,6 +40,8 @@ use remote_timeline_client::manifest::{ use remote_timeline_client::UploadQueueNotReadyError; use remote_timeline_client::FAILED_REMOTE_OP_RETRIES; use remote_timeline_client::FAILED_UPLOAD_WARN_THRESHOLD; +use secondary::heatmap::HeatMapTenant; +use secondary::heatmap::HeatMapTimeline; use std::collections::BTreeMap; use 
std::fmt; use std::future::Future; @@ -55,6 +57,7 @@ use timeline::offload::OffloadError; use timeline::CompactFlags; use timeline::CompactOptions; use timeline::CompactionError; +use timeline::PreviousHeatmap; use timeline::ShutdownMode; use tokio::io::BufReader; use tokio::sync::watch; @@ -262,6 +265,7 @@ struct TimelinePreload { timeline_id: TimelineId, client: RemoteTimelineClient, index_part: Result, + previous_heatmap: Option, } pub(crate) struct TenantPreload { @@ -1128,6 +1132,7 @@ impl Tenant { resources: TimelineResources, mut index_part: IndexPart, metadata: TimelineMetadata, + previous_heatmap: Option, ancestor: Option>, cause: LoadTimelineCause, ctx: &RequestContext, @@ -1158,6 +1163,7 @@ impl Tenant { let timeline = self.create_timeline_struct( timeline_id, &metadata, + previous_heatmap, ancestor.clone(), resources, CreateTimelineCause::Load, @@ -1557,8 +1563,18 @@ impl Tenant { } } + // TODO(vlad): Could go to S3 if the secondary is freezing cold and hasn't even + // pulled the first heatmap. Not entirely necessary since the storage controller + // will kick the secondary in any case and cause a download. + let maybe_heatmap_at = self.read_on_disk_heatmap().await; + let timelines = self - .load_timelines_metadata(remote_timeline_ids, remote_storage, cancel) + .load_timelines_metadata( + remote_timeline_ids, + remote_storage, + maybe_heatmap_at, + cancel, + ) .await?; Ok(TenantPreload { @@ -1571,6 +1587,26 @@ impl Tenant { }) } + async fn read_on_disk_heatmap(&self) -> Option<(HeatMapTenant, std::time::Instant)> { + let on_disk_heatmap_path = self.conf.tenant_heatmap_path(&self.tenant_shard_id); + match tokio::fs::read_to_string(on_disk_heatmap_path).await { + Ok(heatmap) => match serde_json::from_str::(&heatmap) { + Ok(heatmap) => Some((heatmap, std::time::Instant::now())), + Err(err) => { + error!("Failed to deserialize old heatmap: {err}"); + None + } + }, + Err(err) => match err.kind() { + std::io::ErrorKind::NotFound => None, + _ => { + error!("Unexpected IO error reading old heatmap: {err}"); + None + } + }, + } + } + /// /// Background task that downloads all data for a tenant and brings it to Active state. /// @@ -1658,7 +1694,10 @@ impl Tenant { match index_part { MaybeDeletedIndexPart::IndexPart(index_part) => { timeline_ancestors.insert(timeline_id, index_part.metadata.clone()); - remote_index_and_client.insert(timeline_id, (index_part, preload.client)); + remote_index_and_client.insert( + timeline_id, + (index_part, preload.client, preload.previous_heatmap), + ); } MaybeDeletedIndexPart::Deleted(index_part) => { info!( @@ -1677,7 +1716,7 @@ impl Tenant { // layer file. 
let sorted_timelines = tree_sort_timelines(timeline_ancestors, |m| m.ancestor_timeline())?; for (timeline_id, remote_metadata) in sorted_timelines { - let (index_part, remote_client) = remote_index_and_client + let (index_part, remote_client, previous_heatmap) = remote_index_and_client .remove(&timeline_id) .expect("just put it in above"); @@ -1697,6 +1736,7 @@ impl Tenant { timeline_id, index_part, remote_metadata, + previous_heatmap, self.get_timeline_resources_for(remote_client), LoadTimelineCause::Attach, ctx, @@ -1846,11 +1886,13 @@ impl Tenant { } #[instrument(skip_all, fields(timeline_id=%timeline_id))] + #[allow(clippy::too_many_arguments)] async fn load_remote_timeline( self: &Arc, timeline_id: TimelineId, index_part: IndexPart, remote_metadata: TimelineMetadata, + previous_heatmap: Option, resources: TimelineResources, cause: LoadTimelineCause, ctx: &RequestContext, @@ -1880,6 +1922,7 @@ impl Tenant { resources, index_part, remote_metadata, + previous_heatmap, ancestor, cause, ctx, @@ -1891,14 +1934,29 @@ impl Tenant { self: &Arc, timeline_ids: HashSet, remote_storage: &GenericRemoteStorage, + heatmap: Option<(HeatMapTenant, std::time::Instant)>, cancel: CancellationToken, ) -> anyhow::Result> { + let mut timeline_heatmaps = heatmap.map(|h| (h.0.into_timelines_index(), h.1)); + let mut part_downloads = JoinSet::new(); for timeline_id in timeline_ids { let cancel_clone = cancel.clone(); + + let previous_timeline_heatmap = timeline_heatmaps.as_mut().and_then(|hs| { + hs.0.remove(&timeline_id).map(|h| PreviousHeatmap::Active { + heatmap: h, + read_at: hs.1, + }) + }); part_downloads.spawn( - self.load_timeline_metadata(timeline_id, remote_storage.clone(), cancel_clone) - .instrument(info_span!("download_index_part", %timeline_id)), + self.load_timeline_metadata( + timeline_id, + remote_storage.clone(), + previous_timeline_heatmap, + cancel_clone, + ) + .instrument(info_span!("download_index_part", %timeline_id)), ); } @@ -1946,6 +2004,7 @@ impl Tenant { self: &Arc, timeline_id: TimelineId, remote_storage: GenericRemoteStorage, + previous_heatmap: Option, cancel: CancellationToken, ) -> impl Future { let client = self.build_timeline_client(timeline_id, remote_storage); @@ -1961,6 +2020,7 @@ impl Tenant { client, timeline_id, index_part, + previous_heatmap, } } } @@ -2072,7 +2132,12 @@ impl Tenant { })?; let timeline_preload = self - .load_timeline_metadata(timeline_id, self.remote_storage.clone(), cancel.clone()) + .load_timeline_metadata( + timeline_id, + self.remote_storage.clone(), + None, + cancel.clone(), + ) .await; let index_part = match timeline_preload.index_part { @@ -2106,6 +2171,7 @@ impl Tenant { timeline_id, index_part, remote_metadata, + None, timeline_resources, LoadTimelineCause::Unoffload, &ctx, @@ -2821,7 +2887,7 @@ impl Tenant { }; let metadata = index_part.metadata.clone(); self - .load_remote_timeline(timeline_id, index_part, metadata, resources, LoadTimelineCause::ImportPgdata{ + .load_remote_timeline(timeline_id, index_part, metadata, None, resources, LoadTimelineCause::ImportPgdata{ create_guard: timeline_create_guard, activate, }, &ctx) .await? 
.ready_to_activate() @@ -4030,6 +4096,7 @@ impl Tenant { &self, new_timeline_id: TimelineId, new_metadata: &TimelineMetadata, + previous_heatmap: Option, ancestor: Option>, resources: TimelineResources, cause: CreateTimelineCause, @@ -4053,6 +4120,7 @@ impl Tenant { self.conf, Arc::clone(&self.tenant_conf), new_metadata, + previous_heatmap, ancestor, new_timeline_id, self.tenant_shard_id, @@ -5124,6 +5192,7 @@ impl Tenant { .create_timeline_struct( new_timeline_id, new_metadata, + None, ancestor, resources, CreateTimelineCause::Load, diff --git a/pageserver/src/tenant/secondary/heatmap.rs b/pageserver/src/tenant/secondary/heatmap.rs index 4a8e66d38a..0fa10ca294 100644 --- a/pageserver/src/tenant/secondary/heatmap.rs +++ b/pageserver/src/tenant/secondary/heatmap.rs @@ -1,4 +1,4 @@ -use std::time::SystemTime; +use std::{collections::HashMap, time::SystemTime}; use crate::tenant::{remote_timeline_client::index::LayerFileMetadata, storage_layer::LayerName}; @@ -8,7 +8,7 @@ use serde_with::{serde_as, DisplayFromStr, TimestampSeconds}; use utils::{generation::Generation, id::TimelineId}; #[derive(Serialize, Deserialize)] -pub(super) struct HeatMapTenant { +pub(crate) struct HeatMapTenant { /// Generation of the attached location that uploaded the heatmap: this is not required /// for correctness, but acts as a hint to secondary locations in order to detect thrashing /// in the unlikely event that two attached locations are both uploading conflicting heatmaps. @@ -25,8 +25,17 @@ pub(super) struct HeatMapTenant { pub(super) upload_period_ms: Option, } +impl HeatMapTenant { + pub(crate) fn into_timelines_index(self) -> HashMap { + self.timelines + .into_iter() + .map(|htl| (htl.timeline_id, htl)) + .collect() + } +} + #[serde_as] -#[derive(Serialize, Deserialize)] +#[derive(Serialize, Deserialize, Clone)] pub(crate) struct HeatMapTimeline { #[serde_as(as = "DisplayFromStr")] pub(crate) timeline_id: TimelineId, @@ -35,13 +44,13 @@ pub(crate) struct HeatMapTimeline { } #[serde_as] -#[derive(Serialize, Deserialize)] +#[derive(Serialize, Deserialize, Clone)] pub(crate) struct HeatMapLayer { pub(crate) name: LayerName, pub(crate) metadata: LayerFileMetadata, #[serde_as(as = "TimestampSeconds")] - pub(super) access_time: SystemTime, + pub(crate) access_time: SystemTime, // TODO: an actual 'heat' score that would let secondary locations prioritize downloading // the hottest layers, rather than trying to simply mirror whatever layers are on-disk on the primary. } diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 40282defd4..0bf606cf0a 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -136,6 +136,22 @@ pub(crate) fn local_layer_path( } } +pub(crate) enum LastEviction { + Never, + At(std::time::Instant), + Evicting, +} + +impl LastEviction { + pub(crate) fn happened_after(&self, timepoint: std::time::Instant) -> bool { + match self { + LastEviction::Never => false, + LastEviction::At(evicted_at) => evicted_at > &timepoint, + LastEviction::Evicting => true, + } + } +} + impl Layer { /// Creates a layer value for a file we know to not be resident. 
pub(crate) fn for_evicted( @@ -405,6 +421,17 @@ impl Layer { self.0.metadata() } + pub(crate) fn last_evicted_at(&self) -> LastEviction { + match self.0.last_evicted_at.try_lock() { + Ok(lock) => match *lock { + None => LastEviction::Never, + Some(at) => LastEviction::At(at), + }, + Err(std::sync::TryLockError::WouldBlock) => LastEviction::Evicting, + Err(std::sync::TryLockError::Poisoned(p)) => panic!("Lock poisoned: {p}"), + } + } + pub(crate) fn get_timeline_id(&self) -> Option { self.0 .timeline @@ -656,7 +683,9 @@ struct LayerInner { /// When the Layer was last evicted but has not been downloaded since. /// - /// This is used solely for updating metrics. See [`LayerImplMetrics::redownload_after`]. + /// This is used for skipping evicted layers from the previous heatmap (see + /// `[Timeline::generate_heatmap]`) and for updating metrics + /// (see [`LayerImplMetrics::redownload_after`]). last_evicted_at: std::sync::Mutex>, #[cfg(test)] diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index b211af4eff..782b7d88b0 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -150,16 +150,15 @@ use super::{ config::TenantConf, storage_layer::LayerVisibilityHint, upload_queue::NotInitialized, MaybeOffloaded, }; -use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf}; +use super::{ + debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf, HeatMapTimeline, +}; use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe}; use super::{ remote_timeline_client::RemoteTimelineClient, remote_timeline_client::WaitCompletionError, storage_layer::ReadableLayer, }; -use super::{ - secondary::heatmap::{HeatMapLayer, HeatMapTimeline}, - GcError, -}; +use super::{secondary::heatmap::HeatMapLayer, GcError}; #[cfg(test)] use pageserver_api::value::Value; @@ -465,6 +464,16 @@ pub struct Timeline { /// If Some, collects GetPage metadata for an ongoing PageTrace. pub(crate) page_trace: ArcSwapOption>, + + previous_heatmap: ArcSwapOption, +} + +pub(crate) enum PreviousHeatmap { + Active { + heatmap: HeatMapTimeline, + read_at: std::time::Instant, + }, + Obsolete, } pub type TimelineDeleteProgress = Arc>; @@ -2568,6 +2577,7 @@ impl Timeline { conf: &'static PageServerConf, tenant_conf: Arc>, metadata: &TimelineMetadata, + previous_heatmap: Option, ancestor: Option>, timeline_id: TimelineId, tenant_shard_id: TenantShardId, @@ -2730,6 +2740,8 @@ impl Timeline { create_idempotency, page_trace: Default::default(), + + previous_heatmap: ArcSwapOption::from_pointee(previous_heatmap), }; result.repartition_threshold = @@ -3468,12 +3480,52 @@ impl Timeline { let guard = self.layers.read().await; + // Firstly, if there's any heatmap left over from when this location + // was a secondary, take that into account. Keep layers that are: + // * present in the layer map + // * visible + // * non-resident + // * not evicted since we read the heatmap + // + // Without this, a new cold, attached location would clobber the previous + // heatamp. 
+ let previous_heatmap = self.previous_heatmap.load(); + let visible_non_resident = match previous_heatmap.as_deref() { + Some(PreviousHeatmap::Active { heatmap, read_at }) => { + Some(heatmap.layers.iter().filter_map(|hl| { + let desc: PersistentLayerDesc = hl.name.clone().into(); + let layer = guard.try_get_from_key(&desc.key())?; + + if layer.visibility() == LayerVisibilityHint::Covered { + return None; + } + + if layer.is_likely_resident() { + return None; + } + + if layer.last_evicted_at().happened_after(*read_at) { + return None; + } + + Some((desc, hl.metadata.clone(), hl.access_time)) + })) + } + Some(PreviousHeatmap::Obsolete) => None, + None => None, + }; + + // Secondly, all currently visible, resident layers are included. let resident = guard.likely_resident_layers().filter_map(|layer| { match layer.visibility() { LayerVisibilityHint::Visible => { // Layer is visible to one or more read LSNs: elegible for inclusion in layer map let last_activity_ts = layer.latest_activity(); - Some((layer.layer_desc(), layer.metadata(), last_activity_ts)) + Some(( + layer.layer_desc().clone(), + layer.metadata(), + last_activity_ts, + )) } LayerVisibilityHint::Covered => { // Layer is resident but unlikely to be read: not elegible for inclusion in heatmap. @@ -3482,7 +3534,18 @@ impl Timeline { } }); - let mut layers = resident.collect::>(); + let mut layers = match visible_non_resident { + Some(non_resident) => { + let mut non_resident = non_resident.peekable(); + if non_resident.peek().is_none() { + self.previous_heatmap + .store(Some(PreviousHeatmap::Obsolete.into())); + } + + non_resident.chain(resident).collect::>() + } + None => resident.collect::>(), + }; // Sort layers in order of which to download first. For a large set of layers to download, we // want to prioritize those layers which are most likely to still be in the resident many minutes @@ -6661,18 +6724,32 @@ fn is_send() { #[cfg(test)] mod tests { + use std::sync::Arc; + use pageserver_api::key::Key; use pageserver_api::value::Value; + use tracing::Instrument; use utils::{id::TimelineId, lsn::Lsn}; use crate::tenant::{ harness::{test_img, TenantHarness}, layer_map::LayerMap, - storage_layer::{Layer, LayerName}, + storage_layer::{Layer, LayerName, LayerVisibilityHint}, timeline::{DeltaLayerTestDesc, EvictionError}, - Timeline, + PreviousHeatmap, Timeline, }; + use super::HeatMapTimeline; + + fn assert_heatmaps_have_same_layers(lhs: &HeatMapTimeline, rhs: &HeatMapTimeline) { + assert_eq!(lhs.layers.len(), rhs.layers.len()); + let lhs_rhs = lhs.layers.iter().zip(rhs.layers.iter()); + for (l, r) in lhs_rhs { + assert_eq!(l.name, r.name); + assert_eq!(l.metadata, r.metadata); + } + } + #[tokio::test] async fn test_heatmap_generation() { let harness = TenantHarness::create("heatmap_generation").await.unwrap(); @@ -6746,7 +6823,7 @@ mod tests { assert_eq!(heatmap.layers.last().unwrap().name, l0_delta.layer_name()); let mut last_lsn = Lsn::MAX; - for layer in heatmap.layers { + for layer in &heatmap.layers { // Covered layer should be omitted assert!(layer.name != covered_delta.layer_name()); @@ -6761,6 +6838,144 @@ mod tests { last_lsn = layer_lsn; } } + + // Evict all the layers and stash the old heatmap in the timeline. + // This simulates a migration to a cold secondary location. 
+ + let guard = timeline.layers.read().await; + let mut all_layers = Vec::new(); + let forever = std::time::Duration::from_secs(120); + for layer in guard.likely_resident_layers() { + all_layers.push(layer.clone()); + layer.evict_and_wait(forever).await.unwrap(); + } + drop(guard); + + timeline + .previous_heatmap + .store(Some(Arc::new(PreviousHeatmap::Active { + heatmap: heatmap.clone(), + read_at: std::time::Instant::now(), + }))); + + // Generate a new heatmap and assert that it contains the same layers as the old one. + let post_migration_heatmap = timeline.generate_heatmap().await.unwrap(); + assert_heatmaps_have_same_layers(&heatmap, &post_migration_heatmap); + + // Download each layer one by one. Generate the heatmap at each step and check + // that it's stable. + for layer in all_layers { + if layer.visibility() == LayerVisibilityHint::Covered { + continue; + } + + eprintln!("Downloading {layer} and re-generating heatmap"); + + let _resident = layer + .download_and_keep_resident() + .instrument(tracing::info_span!( + parent: None, + "download_layer", + tenant_id = %timeline.tenant_shard_id.tenant_id, + shard_id = %timeline.tenant_shard_id.shard_slug(), + timeline_id = %timeline.timeline_id + )) + .await + .unwrap(); + + let post_download_heatmap = timeline.generate_heatmap().await.unwrap(); + assert_heatmaps_have_same_layers(&heatmap, &post_download_heatmap); + } + + // Everything from the post-migration heatmap is now resident. + // Check that we drop it from memory. + assert!(matches!( + timeline.previous_heatmap.load().as_deref(), + Some(PreviousHeatmap::Obsolete) + )); + } + + #[tokio::test] + async fn test_previous_heatmap_obsoletion() { + let harness = TenantHarness::create("heatmap_previous_heatmap_obsoletion") + .await + .unwrap(); + + let l0_delta = DeltaLayerTestDesc::new( + Lsn(0x20)..Lsn(0x30), + Key::from_hex("000000000000000000000000000000000000").unwrap() + ..Key::from_hex("FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF").unwrap(), + vec![( + Key::from_hex("720000000033333333444444445500000000").unwrap(), + Lsn(0x25), + Value::Image(test_img("foo")), + )], + ); + + let image_layer = ( + Lsn(0x40), + vec![( + Key::from_hex("620000000033333333444444445500000000").unwrap(), + test_img("bar"), + )], + ); + + let delta_layers = vec![l0_delta]; + let image_layers = vec![image_layer]; + + let (tenant, ctx) = harness.load().await; + let timeline = tenant + .create_test_timeline_with_layers( + TimelineId::generate(), + Lsn(0x10), + 14, + &ctx, + delta_layers, + image_layers, + Lsn(0x100), + ) + .await + .unwrap(); + + // Layer visibility is an input to heatmap generation, so refresh it first + timeline.update_layer_visibility().await.unwrap(); + + let heatmap = timeline + .generate_heatmap() + .await + .expect("Infallible while timeline is not shut down"); + + // Both layers should be in the heatmap + assert!(!heatmap.layers.is_empty()); + + // Now simulate a migration. + timeline + .previous_heatmap + .store(Some(Arc::new(PreviousHeatmap::Active { + heatmap: heatmap.clone(), + read_at: std::time::Instant::now(), + }))); + + // Evict all the layers in the previous heatmap + let guard = timeline.layers.read().await; + let forever = std::time::Duration::from_secs(120); + for layer in guard.likely_resident_layers() { + layer.evict_and_wait(forever).await.unwrap(); + } + drop(guard); + + // Generate a new heatmap and check that the previous heatmap + // has been marked obsolete. 
+ let post_eviction_heatmap = timeline + .generate_heatmap() + .await + .expect("Infallible while timeline is not shut down"); + + assert!(post_eviction_heatmap.layers.is_empty()); + assert!(matches!( + timeline.previous_heatmap.load().as_deref(), + Some(PreviousHeatmap::Obsolete) + )); } #[tokio::test] diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 93b7efedb8..841b2fa1c7 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -294,6 +294,7 @@ impl DeleteTimelineFlow { timeline_id, local_metadata, None, // Ancestor is not needed for deletion. + None, // Previous heatmap is not needed for deletion tenant.get_timeline_resources_for(remote_client), // Important. We dont pass ancestor above because it can be missing. // Thus we need to skip the validation here. diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 58bc0ba1cd..8c7e9b1726 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -1,7 +1,7 @@ use crate::pageserver_client::PageserverClient; use crate::persistence::Persistence; use crate::{compute_hook, service}; -use pageserver_api::controller_api::{AvailabilityZone, PlacementPolicy}; +use pageserver_api::controller_api::{AvailabilityZone, MigrationConfig, PlacementPolicy}; use pageserver_api::models::{ LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig, TenantWaitLsnRequest, }; @@ -162,6 +162,22 @@ impl ReconcilerConfig { } } +impl From<&MigrationConfig> for ReconcilerConfig { + fn from(value: &MigrationConfig) -> Self { + let mut builder = ReconcilerConfigBuilder::new(); + + if let Some(timeout) = value.secondary_warmup_timeout { + builder = builder.secondary_warmup_timeout(timeout) + } + + if let Some(timeout) = value.secondary_download_request_timeout { + builder = builder.secondary_download_request_timeout(timeout) + } + + builder.build() + } +} + /// RAII resource units granted to a Reconciler, which it should keep alive until it finishes doing I/O pub(crate) struct ReconcileUnits { _sem_units: tokio::sync::OwnedSemaphorePermit, diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index b9db46fe4a..c1da9374e4 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -5213,7 +5213,12 @@ impl Service { shard.sequence = shard.sequence.next(); } - self.maybe_reconcile_shard(shard, nodes) + let reconciler_config = match migrate_req.migration_config { + Some(cfg) => (&cfg).into(), + None => ReconcilerConfig::default(), + }; + + self.maybe_configured_reconcile_shard(shard, nodes, reconciler_config) }; if let Some(waiter) = waiter { diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 2fa82754ef..b7afbec403 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3,6 +3,7 @@ from __future__ import annotations import abc import asyncio import concurrent.futures +import dataclasses import filecmp import json import os @@ -1675,6 +1676,12 @@ class StorageControllerLeadershipStatus(StrEnum): CANDIDATE = "candidate" +@dataclass +class StorageControllerMigrationConfig: + secondary_warmup_timeout: str | None + secondary_download_request_timeout: str | None + + class NeonStorageController(MetricsGetter, LogUtils): def __init__(self, env: NeonEnv, port: int, auth_enabled: bool): self.env = env @@ -2068,11 +2075,20 @@ class 
NeonStorageController(MetricsGetter, LogUtils): shards: list[TenantShardId] = body["new_shards"] return shards - def tenant_shard_migrate(self, tenant_shard_id: TenantShardId, dest_ps_id: int): + def tenant_shard_migrate( + self, + tenant_shard_id: TenantShardId, + dest_ps_id: int, + config: StorageControllerMigrationConfig | None = None, + ): + payload = {"tenant_shard_id": str(tenant_shard_id), "node_id": dest_ps_id} + if config is not None: + payload["migration_config"] = dataclasses.asdict(config) + self.request( "PUT", f"{self.api}/control/v1/tenant/{tenant_shard_id}/migrate", - json={"tenant_shard_id": str(tenant_shard_id), "node_id": dest_ps_id}, + json=payload, headers=self.headers(TokenScope.ADMIN), ) log.info(f"Migrated tenant {tenant_shard_id} to pageserver {dest_ps_id}") diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 590093d23c..8a91a255d8 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -10,14 +10,18 @@ from typing import TYPE_CHECKING import pytest from fixtures.common_types import TenantId, TenantShardId, TimelineId from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + NeonPageserver, + StorageControllerMigrationConfig, +) from fixtures.pageserver.common_types import parse_layer_file_name from fixtures.pageserver.utils import ( assert_prefix_empty, wait_for_upload_queue_empty, ) from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, S3Storage, s3_storage -from fixtures.utils import skip_in_debug_build, wait_until +from fixtures.utils import run_only_on_default_postgres, skip_in_debug_build, wait_until from fixtures.workload import Workload from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response @@ -889,3 +893,93 @@ def test_slow_secondary_downloads(neon_env_builder: NeonEnvBuilder, via_controll assert progress_3["heatmap_mtime"] is not None assert progress_3["layers_total"] == progress_3["layers_downloaded"] assert progress_3["bytes_total"] == progress_3["bytes_downloaded"] + + +@skip_in_debug_build("only run with release build") +@run_only_on_default_postgres("PG version is not interesting here") +def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_pageservers = 2 + neon_env_builder.enable_pageserver_remote_storage( + remote_storage_kind=RemoteStorageKind.MOCK_S3, + ) + + env = neon_env_builder.init_configs() + env.start() + + assert isinstance(env.pageserver_remote_storage, S3Storage) # Satisfy linter + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + env.create_tenant(tenant_id, timeline_id, conf=TENANT_CONF, placement_policy='{"Attached":1}') + + env.storage_controller.reconcile_until_idle() + + attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"] + ps_attached = env.get_pageserver(attached_to_id) + ps_secondary = next(p for p in env.pageservers if p != ps_attached) + + # Generate a bunch of small layers (we will apply a slowdown failpoint that works on a per-layer basis) + workload = Workload(env, tenant_id, timeline_id) + workload.init() + workload.write_rows(128, upload=True) + workload.write_rows(128, upload=True) + workload.write_rows(128, upload=True) + workload.write_rows(128, upload=True) + workload.stop() + + # Expect lots of layers + assert 
len(ps_attached.list_layers(tenant_id, timeline_id)) > 10 + + # Simulate large data by making layer downloads artifically slow + for ps in env.pageservers: + ps.http_client().configure_failpoints([("secondary-layer-download-sleep", "return(1000)")]) + + # Upload a heatmap, so that secondaries have something to download + ps_attached.http_client().tenant_heatmap_upload(tenant_id) + heatmap_before_migration = env.pageserver_remote_storage.heatmap_content(tenant_id) + + # This has no chance to succeed: we have lots of layers and each one takes at least 1000ms. + # However, it pulls the heatmap, which will be important later. + http_client = env.storage_controller.pageserver_api() + (status, progress) = http_client.tenant_secondary_download(tenant_id, wait_ms=4000) + assert status == 202 + assert progress["heatmap_mtime"] is not None + assert progress["layers_downloaded"] > 0 + assert progress["bytes_downloaded"] > 0 + assert progress["layers_total"] > progress["layers_downloaded"] + assert progress["bytes_total"] > progress["bytes_downloaded"] + + env.storage_controller.allowed_errors.extend( + [ + ".*Timed out.*downloading layers.*", + ] + ) + + # Use a custom configuration that gives up earlier than usual. + # We can't hydrate everything anyway because of the failpoints. + config = StorageControllerMigrationConfig( + secondary_warmup_timeout="5s", secondary_download_request_timeout="2s" + ) + env.storage_controller.tenant_shard_migrate( + TenantShardId(tenant_id, shard_number=0, shard_count=0), ps_secondary.id, config + ) + + env.storage_controller.reconcile_until_idle() + assert env.storage_controller.locate(tenant_id)[0]["node_id"] == ps_secondary.id + + ps_secondary.http_client().tenant_heatmap_upload(tenant_id) + heatmap_after_migration = env.pageserver_remote_storage.heatmap_content(tenant_id) + + assert len(heatmap_before_migration["timelines"][0]["layers"]) > 0 + + # The new layer map should contain all the layers in the pre-migration one + # and a new in memory layer + assert len(heatmap_before_migration["timelines"][0]["layers"]) + 1 == len( + heatmap_after_migration["timelines"][0]["layers"] + ) + + log.info( + f'Heatmap size after cold migration is {len(heatmap_after_migration["timelines"][0]["layers"])}' + ) + + # TODO: Once we have an endpoint for rescuing the cold location, exercise it here. From e37ba8642d3b9f94cb40f54ac69f0f9dad0a65ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?JC=20Gr=C3=BCnhage?= Date: Thu, 13 Feb 2025 14:08:46 +0100 Subject: [PATCH 460/583] Integrate cargo-chef into Dockerfile (#10782) ## Problem The build of the neon container image is not caching any part of the rust build, making it fairly slow. ## Summary of changes Cache dependency building using cargo-chef. --- Dockerfile | 16 +++++++++++++++- build-tools.Dockerfile | 2 ++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index b399bcf7e4..83ad86badb 100644 --- a/Dockerfile +++ b/Dockerfile @@ -50,6 +50,14 @@ RUN set -e \ && rm -rf pg_install/build \ && tar -C pg_install -czf /home/nonroot/postgres_install.tar.gz . +# Prepare cargo-chef recipe +FROM $REPOSITORY/$IMAGE:$TAG AS plan +WORKDIR /home/nonroot + +COPY --chown=nonroot . . 
+ +RUN cargo chef prepare --recipe-path recipe.json + # Build neon binaries FROM $REPOSITORY/$IMAGE:$TAG AS build WORKDIR /home/nonroot @@ -63,9 +71,15 @@ COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_i COPY --from=pg-build /home/nonroot/pg_install/v17/include/postgresql/server pg_install/v17/include/postgresql/server COPY --from=pg-build /home/nonroot/pg_install/v16/lib pg_install/v16/lib COPY --from=pg-build /home/nonroot/pg_install/v17/lib pg_install/v17/lib +COPY --from=plan /home/nonroot/recipe.json recipe.json + +ARG ADDITIONAL_RUSTFLAGS="" + +RUN set -e \ + && RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo chef cook --locked --release --recipe-path recipe.json + COPY --chown=nonroot . . -ARG ADDITIONAL_RUSTFLAGS RUN set -e \ && RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo build \ --bin pg_sni_router \ diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile index fa72ca1bc2..317eded26e 100644 --- a/build-tools.Dockerfile +++ b/build-tools.Dockerfile @@ -300,6 +300,7 @@ ARG CARGO_HAKARI_VERSION=0.9.33 ARG CARGO_DENY_VERSION=0.16.2 ARG CARGO_HACK_VERSION=0.6.33 ARG CARGO_NEXTEST_VERSION=0.9.85 +ARG CARGO_CHEF_VERSION=0.1.71 ARG CARGO_DIESEL_CLI_VERSION=2.2.6 RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \ chmod +x rustup-init && \ @@ -314,6 +315,7 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux cargo install cargo-deny --locked --version ${CARGO_DENY_VERSION} && \ cargo install cargo-hack --version ${CARGO_HACK_VERSION} && \ cargo install cargo-nextest --version ${CARGO_NEXTEST_VERSION} && \ + cargo install cargo-chef --locked --version ${CARGO_CHEF_VERSION} && \ cargo install diesel_cli --version ${CARGO_DIESEL_CLI_VERSION} \ --features postgres-bundled --no-default-features && \ rm -rf /home/nonroot/.cargo/registry && \ From 8c2f85b20922c9c32d255da6b0b362b7b323eb82 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Thu, 13 Feb 2025 14:28:05 +0100 Subject: [PATCH 461/583] chore(compute): Postgres 17.3, 16.7, 15.11 and 14.16 (#10771) ## Summary of changes Bump all minor versions. The only non-trivial conflict was between - https://github.com/postgres/postgres/commit/0350b876b074dc307b82ba18cd3c7cad46066baf - and https://github.com/neondatabase/postgres/commit/bd09a752f4c2556ba2722510e7196136cc266c43 It seems that just adding this extra argument is enough. I also got conflict with https://github.com/postgres/postgres/commit/c1c9df3159cfa91416bebe56ae50bc32d8a4e10b but for some reason only in PG 15. 
Yet, that was a trivial one around ```c if (XLogCtl) LWLockRelease(ControlFileLock); /* durable_rename already emitted log message */ return false; ``` in `xlog.c` ## Postgres PRs - https://github.com/neondatabase/postgres/pull/580 - https://github.com/neondatabase/postgres/pull/579 - https://github.com/neondatabase/postgres/pull/577 - https://github.com/neondatabase/postgres/pull/578 --- pgxn/neon/pagestore_smgr.c | 6 +++--- pgxn/neon_walredo/inmem_smgr.c | 4 ++-- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/postgres-v17 | 2 +- vendor/revisions.json | 16 ++++++++-------- 7 files changed, 17 insertions(+), 17 deletions(-) diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 8051970176..f1087a8ccb 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -3765,7 +3765,7 @@ neon_dbsize(Oid dbNode) * neon_truncate() -- Truncate relation to specified number of blocks. */ static void -neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) +neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber old_blocks, BlockNumber nblocks) { XLogRecPtr lsn; @@ -3780,7 +3780,7 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) case RELPERSISTENCE_TEMP: case RELPERSISTENCE_UNLOGGED: - mdtruncate(reln, forknum, nblocks); + mdtruncate(reln, forknum, old_blocks, nblocks); return; default: @@ -3818,7 +3818,7 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) - mdtruncate(reln, forknum, nblocks); + mdtruncate(reln, forknum, old_blocks, nblocks); #endif } diff --git a/pgxn/neon_walredo/inmem_smgr.c b/pgxn/neon_walredo/inmem_smgr.c index a45e8f5c4a..74cd5ac601 100644 --- a/pgxn/neon_walredo/inmem_smgr.c +++ b/pgxn/neon_walredo/inmem_smgr.c @@ -96,7 +96,7 @@ static void inmem_writeback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks); static BlockNumber inmem_nblocks(SMgrRelation reln, ForkNumber forknum); static void inmem_truncate(SMgrRelation reln, ForkNumber forknum, - BlockNumber nblocks); + BlockNumber old_blocks, BlockNumber nblocks); static void inmem_immedsync(SMgrRelation reln, ForkNumber forknum); #if PG_MAJORVERSION_NUM >= 17 static void inmem_registersync(SMgrRelation reln, ForkNumber forknum); @@ -345,7 +345,7 @@ inmem_nblocks(SMgrRelation reln, ForkNumber forknum) * inmem_truncate() -- Truncate relation to specified number of blocks. 
*/ static void -inmem_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) +inmem_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber old_blocks, BlockNumber nblocks) { } diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index c0aedfd3ca..62a86dfc91 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit c0aedfd3cac447510a2db843b561f0c52901b679 +Subproject commit 62a86dfc91e0c35a72f2ea5e99e6969b830c0c26 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 355a7c69d3..80ed91ce25 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 355a7c69d3f907f3612eb406cc7b9c2f55d59b59 +Subproject commit 80ed91ce255c765d25be0bb4a02c942fe6311fbf diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 13cf5d06c9..999cf81b10 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 13cf5d06c98a8e9b0590ce6cdfd193a08d0a7792 +Subproject commit 999cf81b101ead40e597d5cd729458d8200f4537 diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index 4c45d78ad5..4d3a722312 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit 4c45d78ad587e4bcb4a5a7ef6931b88c6a3d575d +Subproject commit 4d3a722312b496ff7378156caa6d41c2e70c30e4 diff --git a/vendor/revisions.json b/vendor/revisions.json index 5f60e1d690..888f09124e 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,18 +1,18 @@ { "v17": [ - "17.2", - "4c45d78ad587e4bcb4a5a7ef6931b88c6a3d575d" + "17.3", + "4d3a722312b496ff7378156caa6d41c2e70c30e4" ], "v16": [ - "16.6", - "13cf5d06c98a8e9b0590ce6cdfd193a08d0a7792" + "16.7", + "999cf81b101ead40e597d5cd729458d8200f4537" ], "v15": [ - "15.10", - "355a7c69d3f907f3612eb406cc7b9c2f55d59b59" + "15.11", + "80ed91ce255c765d25be0bb4a02c942fe6311fbf" ], "v14": [ - "14.15", - "c0aedfd3cac447510a2db843b561f0c52901b679" + "14.16", + "62a86dfc91e0c35a72f2ea5e99e6969b830c0c26" ] } From ae463f366b34b8dbd3edb508a0bbaea4ab79d17b Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 13 Feb 2025 16:15:04 +0000 Subject: [PATCH 462/583] tests: broaden allow-list for #10720 workaround (#10807) ## Problem In #10752 I used an overly-strict regex that only ignored error on a particular key. 
## Summary of changes - Drop key from regex so it matches all such errors --- test_runner/regress/test_sharding.py | 2 +- test_runner/regress/test_storage_scrubber.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 8910873690..f58bbcd3c0 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -1821,7 +1821,7 @@ def test_sharding_gc( # TODO: remove when https://github.com/neondatabase/neon/issues/10720 is fixed ps.allowed_errors.extend( [ - ".*could not find data for key 020000000000000000000000000000000000.*", + ".*could not find data for key.*", ".*could not ingest record.*", ] ) diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index b8253fb125..d44c176b35 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -318,7 +318,7 @@ def test_scrubber_physical_gc_ancestors(neon_env_builder: NeonEnvBuilder, shard_ # TODO: remove when https://github.com/neondatabase/neon/issues/10720 is fixed ps.allowed_errors.extend( [ - ".*could not find data for key 020000000000000000000000000000000000.*", + ".*could not find data for key.*", ".*could not ingest record.*", ] ) From a4d0a3459143744b7da07ab6c585520ddee5b9e3 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 13 Feb 2025 16:23:51 +0000 Subject: [PATCH 463/583] tests: flush in test_isolation (#10658) ## Problem This test occasionally fails while the test teardown tries to do a graceful shutdown, because the test has quickly written lots of data into the pageserver. Closes: #10654 ## Summary of changes - Call `post_checks` at the end of `test_isolation`, as we already do for test_pg_regress -- this improves our detection of issues, and as a nice side effect flushes the pageserver. - Ignore pg_notify files when validating state at end of test, these are not expected to be the same --- test_runner/fixtures/neon_fixtures.py | 7 ++++++- test_runner/regress/test_pg_regress.py | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index b7afbec403..469bc8a1e5 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -4988,8 +4988,13 @@ def check_restored_datadir_content( restored_files = list_files_to_compare(restored_dir_path) + # pg_notify files are always ignored + pgdata_files = [f for f in pgdata_files if not f.startswith("pg_notify")] + restored_files = [f for f in restored_files if not f.startswith("pg_notify")] + + # pg_xact and pg_multixact files are optional in basebackup: depending on our configuration they + # may be omitted and loaded on demand. if pgdata_files != restored_files: - # filter pg_xact and multixact files which are downloaded on demand pgdata_files = [ f for f in pgdata_files diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index c5ae669dce..411888efbc 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -261,7 +261,7 @@ def test_isolation( pg_bin.run(pg_isolation_regress_command, env=env_vars, cwd=runpath) # This fails with a mismatch on `pg_multixact/offsets/0000` - # post_checks(env, test_output_dir, DBNAME, endpoint) + post_checks(env, test_output_dir, DBNAME, endpoint) # Run extra Neon-specific pg_regress-based tests. 
The tests and their From b6f972ed83c594ed2df5fc670d246744bd2b1d11 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Thu, 13 Feb 2025 11:33:27 -0600 Subject: [PATCH 464/583] Increase the extension server request timeout to 1 minute (#10800) pg_search is 46ish MB. All other remote extensions are around hundeds of KB. 3 seconds is not long enough to download the tarball if the S3 gateway cache doesn't already contain a copy. According to our setup, the cache is limited to 10 GB in size and anything that has not been accessed for an hour is purged. This is really bad for scaling to 0, even more so if you're the only project actively using the extension in a production Kubernetes cluster. Signed-off-by: Tristan Partin --- pgxn/neon/extension_server.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pgxn/neon/extension_server.c b/pgxn/neon/extension_server.c index e38af08f89..6e558c433a 100644 --- a/pgxn/neon/extension_server.c +++ b/pgxn/neon/extension_server.c @@ -14,7 +14,7 @@ #include "utils/guc.h" -#include "extension_server.h" +#include "extension_server.h" #include "neon_utils.h" static int extension_server_port = 0; @@ -45,7 +45,7 @@ neon_download_extension_file_http(const char *filename, bool is_library) handle = alloc_curl_handle(); curl_easy_setopt(handle, CURLOPT_CUSTOMREQUEST, "POST"); - curl_easy_setopt(handle, CURLOPT_TIMEOUT, 3L /* seconds */ ); + curl_easy_setopt(handle, CURLOPT_TIMEOUT, 60L /* seconds */ ); } compute_ctl_url = psprintf("http://localhost:%d/extension_server/%s%s", From 0cf9157adc141d487b5f7cb28afb9b6e3c1e8dee Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Thu, 13 Feb 2025 12:04:36 -0600 Subject: [PATCH 465/583] Handle new compute_ctl_config parameter in compute spec requests (#10746) There is now a compute_ctl_config field in the response that currently only contains a JSON Web Key set. compute_ctl currently doesn't do anything with the keys, but will in the future. The reasoning for the new field is due to the nature of empty computes. When an empty compute is created, it does not have a tenant. A compute spec is the primary means of communicating the details of an attached tenant. In the empty compute state, there is no spec. Instead we wait for the control plane to pass us one via /configure. If we were to include the jwks field in the compute spec, we would have a partial compute spec, which doesn't logically make sense. Instead, we can have two means of passing settings to the compute: - spec: tenant specific config details - compute_ctl_config: compute specific settings For instance, the JSON Web Key set passed to the compute is independent of any tenant. It is a setting of the compute whether it is attached or not. 
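
As a rough illustration of the two channels described above, the `/configure` request body would carry the spec and the compute-wide settings side by side. The JSON below is only a sketch assuming the default serde field names of the structs introduced in this patch, with an empty JWK set:

```rust
// Sketch only (not part of this patch): the kind of JSON body that would now be
// sent to compute_ctl's /configure endpoint. The spec contents are elided because
// they are tenant-specific; serde_json is an assumed dependency here.
use serde_json::json;

fn main() {
    let body = json!({
        // tenant-specific config details (ComputeSpec), as before
        "spec": {},
        // compute-specific settings, meaningful even for an empty compute
        "compute_ctl_config": {
            "jwks": { "keys": [] }
        }
    });
    println!("{body}");
}
```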
Signed-off-by: Tristan Partin --- Cargo.lock | 2 ++ compute_tools/Cargo.toml | 1 + compute_tools/src/bin/compute_ctl.rs | 12 +++++++++--- compute_tools/src/spec.rs | 21 ++++++++++++--------- control_plane/src/endpoint.rs | 13 +++++++++---- libs/compute_api/Cargo.toml | 1 + libs/compute_api/src/requests.rs | 6 ++++-- libs/compute_api/src/responses.rs | 19 +++++++++++++++++-- 8 files changed, 55 insertions(+), 20 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b3a88d46ac..86d9603d36 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1293,6 +1293,7 @@ version = "0.1.0" dependencies = [ "anyhow", "chrono", + "jsonwebtoken", "regex", "remote_storage", "serde", @@ -1320,6 +1321,7 @@ dependencies = [ "flate2", "futures", "http 1.1.0", + "jsonwebtoken", "metrics", "nix 0.27.1", "notify", diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index b04f364cbb..b8828fa49f 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -24,6 +24,7 @@ fail.workspace = true flate2.workspace = true futures.workspace = true http.workspace = true +jsonwebtoken.workspace = true metrics.workspace = true nix.workspace = true notify.workspace = true diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index df47adda6c..a8803ec793 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -55,7 +55,7 @@ use signal_hook::{consts::SIGINT, iterator::Signals}; use tracing::{error, info, warn}; use url::Url; -use compute_api::responses::ComputeStatus; +use compute_api::responses::{ComputeCtlConfig, ComputeStatus}; use compute_api::spec::ComputeSpec; use compute_tools::compute::{ @@ -281,6 +281,7 @@ fn try_spec_from_cli(cli: &Cli) -> Result { info!("got spec from cli argument {}", spec_json); return Ok(CliSpecParams { spec: Some(serde_json::from_str(spec_json)?), + compute_ctl_config: ComputeCtlConfig::default(), live_config_allowed: false, }); } @@ -290,6 +291,7 @@ fn try_spec_from_cli(cli: &Cli) -> Result { let file = File::open(Path::new(spec_path))?; return Ok(CliSpecParams { spec: Some(serde_json::from_reader(file)?), + compute_ctl_config: ComputeCtlConfig::default(), live_config_allowed: true, }); } @@ -299,8 +301,9 @@ fn try_spec_from_cli(cli: &Cli) -> Result { }; match get_spec_from_control_plane(cli.control_plane_uri.as_ref().unwrap(), &cli.compute_id) { - Ok(spec) => Ok(CliSpecParams { - spec, + Ok(resp) => Ok(CliSpecParams { + spec: resp.0, + compute_ctl_config: resp.1, live_config_allowed: true, }), Err(e) => { @@ -317,6 +320,8 @@ fn try_spec_from_cli(cli: &Cli) -> Result { struct CliSpecParams { /// If a spec was provided via CLI or file, the [`ComputeSpec`] spec: Option, + #[allow(dead_code)] + compute_ctl_config: ComputeCtlConfig, live_config_allowed: bool, } @@ -326,6 +331,7 @@ fn wait_spec( CliSpecParams { spec, live_config_allowed, + compute_ctl_config: _, }: CliSpecParams, ) -> Result> { let mut new_state = ComputeState::new(); diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 73950cd95a..6f28bd9733 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -11,7 +11,9 @@ use crate::migration::MigrationRunner; use crate::params::PG_HBA_ALL_MD5; use crate::pg_helpers::*; -use compute_api::responses::{ControlPlaneComputeStatus, ControlPlaneSpecResponse}; +use compute_api::responses::{ + ComputeCtlConfig, ControlPlaneComputeStatus, ControlPlaneSpecResponse, +}; use compute_api::spec::ComputeSpec; // Do control plane request and return response if any. 
In case of error it @@ -73,14 +75,13 @@ fn do_control_plane_request( pub fn get_spec_from_control_plane( base_uri: &str, compute_id: &str, -) -> Result> { +) -> Result<(Option, ComputeCtlConfig)> { let cp_uri = format!("{base_uri}/compute/api/v2/computes/{compute_id}/spec"); let jwt: String = match std::env::var("NEON_CONTROL_PLANE_TOKEN") { Ok(v) => v, Err(_) => "".to_string(), }; let mut attempt = 1; - let mut spec: Result> = Ok(None); info!("getting spec from control plane: {}", cp_uri); @@ -90,7 +91,7 @@ pub fn get_spec_from_control_plane( // - no spec for compute yet (Empty state) -> return Ok(None) // - got spec -> return Ok(Some(spec)) while attempt < 4 { - spec = match do_control_plane_request(&cp_uri, &jwt) { + let result = match do_control_plane_request(&cp_uri, &jwt) { Ok(spec_resp) => { CPLANE_REQUESTS_TOTAL .with_label_values(&[ @@ -99,10 +100,10 @@ pub fn get_spec_from_control_plane( ]) .inc(); match spec_resp.status { - ControlPlaneComputeStatus::Empty => Ok(None), + ControlPlaneComputeStatus::Empty => Ok((None, spec_resp.compute_ctl_config)), ControlPlaneComputeStatus::Attached => { if let Some(spec) = spec_resp.spec { - Ok(Some(spec)) + Ok((Some(spec), spec_resp.compute_ctl_config)) } else { bail!("compute is attached, but spec is empty") } @@ -121,10 +122,10 @@ pub fn get_spec_from_control_plane( } }; - if let Err(e) = &spec { + if let Err(e) = &result { error!("attempt {} to get spec failed with: {}", attempt, e); } else { - return spec; + return result; } attempt += 1; @@ -132,7 +133,9 @@ pub fn get_spec_from_control_plane( } // All attempts failed, return error. - spec + Err(anyhow::anyhow!( + "Exhausted all attempts to retrieve the spec from the control plane" + )) } /// Check `pg_hba.conf` and update if needed to allow external connections. diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 3b2634204c..c3c8229c38 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -48,6 +48,8 @@ use std::sync::Arc; use std::time::Duration; use anyhow::{anyhow, bail, Context, Result}; +use compute_api::requests::ConfigurationRequest; +use compute_api::responses::ComputeCtlConfig; use compute_api::spec::Database; use compute_api::spec::PgIdent; use compute_api::spec::RemoteExtSpec; @@ -880,10 +882,13 @@ impl Endpoint { self.external_http_address.port() )) .header(CONTENT_TYPE.as_str(), "application/json") - .body(format!( - "{{\"spec\":{}}}", - serde_json::to_string_pretty(&spec)? - )) + .body( + serde_json::to_string(&ConfigurationRequest { + spec, + compute_ctl_config: ComputeCtlConfig::default(), + }) + .unwrap(), + ) .send() .await?; diff --git a/libs/compute_api/Cargo.toml b/libs/compute_api/Cargo.toml index c0ec40a6c2..c11a1b6688 100644 --- a/libs/compute_api/Cargo.toml +++ b/libs/compute_api/Cargo.toml @@ -7,6 +7,7 @@ license.workspace = true [dependencies] anyhow.workspace = true chrono.workspace = true +jsonwebtoken.workspace = true serde.workspace = true serde_json.workspace = true regex.workspace = true diff --git a/libs/compute_api/src/requests.rs b/libs/compute_api/src/requests.rs index fc3757d981..0c256cae2e 100644 --- a/libs/compute_api/src/requests.rs +++ b/libs/compute_api/src/requests.rs @@ -1,18 +1,20 @@ //! Structs representing the JSON formats used in the compute_ctl's HTTP API. 
use crate::{ privilege::Privilege, + responses::ComputeCtlConfig, spec::{ComputeSpec, ExtVersion, PgIdent}, }; -use serde::Deserialize; +use serde::{Deserialize, Serialize}; /// Request of the /configure API /// /// We now pass only `spec` in the configuration request, but later we can /// extend it and something like `restart: bool` or something else. So put /// `spec` into a struct initially to be more flexible in the future. -#[derive(Deserialize, Debug)] +#[derive(Debug, Deserialize, Serialize)] pub struct ConfigurationRequest { pub spec: ComputeSpec, + pub compute_ctl_config: ComputeCtlConfig, } #[derive(Deserialize, Debug)] diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index 5286e0e61d..a6248019d9 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -3,6 +3,7 @@ use std::fmt::Display; use chrono::{DateTime, Utc}; +use jsonwebtoken::jwk::JwkSet; use serde::{Deserialize, Serialize, Serializer}; use crate::{ @@ -135,13 +136,27 @@ pub struct CatalogObjects { pub databases: Vec, } +#[derive(Debug, Deserialize, Serialize)] +pub struct ComputeCtlConfig { + pub jwks: JwkSet, +} + +impl Default for ComputeCtlConfig { + fn default() -> Self { + Self { + jwks: JwkSet { + keys: Vec::default(), + }, + } + } +} + /// Response of the `/computes/{compute_id}/spec` control-plane API. -/// This is not actually a compute API response, so consider moving -/// to a different place. #[derive(Deserialize, Debug)] pub struct ControlPlaneSpecResponse { pub spec: Option, pub status: ControlPlaneComputeStatus, + pub compute_ctl_config: ComputeCtlConfig, } #[derive(Deserialize, Clone, Copy, Debug, PartialEq, Eq)] From 98e18e9a543f6ab073d3e3a100fe795f6fc08576 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 13 Feb 2025 21:05:15 +0300 Subject: [PATCH 466/583] Add s3 storage to test_s3_wal_replay (#10809) ## Problem The test is flaky: WAL in remote storage appears to be corrupted. One of hypotheses so far is that corruption is the result of local fs implementation being non atomic, and safekeepers may concurrently PUT the same segment. That's dubious though because by looking at local_fs impl I'd expect then early EOF on segment read rather then observed zeros in test failures, but other directions seem even less probable. ## Summary of changes Let's add s3 backend as well and see if it is also flaky. Also add some more logging around segments uploads. ref https://github.com/neondatabase/neon/issues/10761 --- safekeeper/src/wal_backup.rs | 18 ++++++++++++++---- test_runner/regress/test_wal_acceptor.py | 8 ++++++-- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 8517fa0344..2f6b91cf47 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -310,9 +310,12 @@ impl WalBackupTask { retry_attempt = 0; } Err(e) => { + // We might have managed to upload some segment even though + // some later in the range failed, so log backup_lsn + // separately. 
error!( - "failed while offloading range {}-{}: {:?}", - backup_lsn, commit_lsn, e + "failed while offloading range {}-{}, backup_lsn {}: {:?}", + backup_lsn, commit_lsn, backup_lsn, e ); retry_attempt = retry_attempt.saturating_add(1); @@ -338,6 +341,13 @@ async fn backup_lsn_range( let start_lsn = *backup_lsn; let segments = get_segments(start_lsn, end_lsn, wal_seg_size); + info!( + "offloading segnos {:?} of range [{}-{})", + segments.iter().map(|&s| s.seg_no).collect::>(), + start_lsn, + end_lsn, + ); + // Pool of concurrent upload tasks. We use `FuturesOrdered` to // preserve order of uploads, and update `backup_lsn` only after // all previous uploads are finished. @@ -374,10 +384,10 @@ async fn backup_lsn_range( } info!( - "offloaded segnos {:?} up to {}, previous backup_lsn {}", + "offloaded segnos {:?} of range [{}-{})", segments.iter().map(|&s| s.seg_no).collect::>(), - end_lsn, start_lsn, + end_lsn, ); Ok(()) } diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 2b6a267bdf..21b2ad479c 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -566,10 +566,14 @@ def test_wal_backup(neon_env_builder: NeonEnvBuilder): assert_prefix_empty(neon_env_builder.safekeepers_remote_storage, prefix) -def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder): +# This test is flaky, probably because PUTs of local fs storage are not atomic. +# Let's keep both remote storage kinds for a while to see if this is the case. +# https://github.com/neondatabase/neon/issues/10761 +@pytest.mark.parametrize("remote_storage_kind", [s3_storage(), RemoteStorageKind.LOCAL_FS]) +def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind): neon_env_builder.num_safekeepers = 3 - neon_env_builder.enable_safekeeper_remote_storage(default_remote_storage()) + neon_env_builder.enable_safekeeper_remote_storage(remote_storage_kind) env = neon_env_builder.init_start() tenant_id = env.initial_tenant From 7ac7755dad6034f3132470664e8c6d7f787ca0b5 Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Thu, 13 Feb 2025 20:04:08 +0100 Subject: [PATCH 467/583] Add tests for pgtap (#10589) ## Problem We do not test `pgtap` which is shipped with Neon ## Summary of changes Test and binaries for `pgtap` are added. 
--- compute/compute-node.Dockerfile | 2 +- docker-compose/docker_compose_test.sh | 2 +- .../ext-src/pgtap-src/test-upgrade.patch | 15 +++++++++++++++ docker-compose/ext-src/pgtap-src/test-upgrade.sh | 6 ++++++ docker-compose/test_extensions_upgrade.sh | 3 ++- 5 files changed, 25 insertions(+), 3 deletions(-) create mode 100644 docker-compose/ext-src/pgtap-src/test-upgrade.patch create mode 100755 docker-compose/ext-src/pgtap-src/test-upgrade.sh diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 6814aadcb9..30348c2b90 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1750,7 +1750,7 @@ COPY --from=pg_graphql-src /ext-src/ /ext-src/ COPY --from=hypopg-src /ext-src/ /ext-src/ COPY --from=pg_hashids-src /ext-src/ /ext-src/ COPY --from=rum-src /ext-src/ /ext-src/ -#COPY --from=pgtap-src /ext-src/ /ext-src/ +COPY --from=pgtap-src /ext-src/ /ext-src/ COPY --from=ip4r-src /ext-src/ /ext-src/ COPY --from=prefix-src /ext-src/ /ext-src/ COPY --from=hll-src /ext-src/ /ext-src/ diff --git a/docker-compose/docker_compose_test.sh b/docker-compose/docker_compose_test.sh index c4ff86ab66..dd520d4986 100755 --- a/docker-compose/docker_compose_test.sh +++ b/docker-compose/docker_compose_test.sh @@ -71,7 +71,7 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do cat ../compute/patches/contrib_pg${pg_version}.patch | docker exec -i $TEST_CONTAINER_NAME bash -c "(cd /postgres && patch -p1)" # We are running tests now rm -f testout.txt testout_contrib.txt - docker exec -e USE_PGXS=1 -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,kq_imcx-src,wal2json_2_5-src \ + docker exec -e USE_PGXS=1 -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pg_tiktoken-src,pg_jsonschema-src,kq_imcx-src,wal2json_2_5-src \ $TEST_CONTAINER_NAME /run-tests.sh /ext-src | tee testout.txt && EXT_SUCCESS=1 || EXT_SUCCESS=0 docker exec -e SKIP=start-scripts,postgres_fdw,ltree_plpython,jsonb_plpython,jsonb_plperl,hstore_plpython,hstore_plperl,dblink,bool_plperl \ $TEST_CONTAINER_NAME /run-tests.sh /postgres/contrib | tee testout_contrib.txt && CONTRIB_SUCCESS=1 || CONTRIB_SUCCESS=0 diff --git a/docker-compose/ext-src/pgtap-src/test-upgrade.patch b/docker-compose/ext-src/pgtap-src/test-upgrade.patch new file mode 100644 index 0000000000..16089b2902 --- /dev/null +++ b/docker-compose/ext-src/pgtap-src/test-upgrade.patch @@ -0,0 +1,15 @@ +diff --git a/test/schedule/create.sql b/test/schedule/create.sql +index ba355ed..7e250f5 100644 +--- a/test/schedule/create.sql ++++ b/test/schedule/create.sql +@@ -1,3 +1,2 @@ + \unset ECHO + \i test/psql.sql +-CREATE EXTENSION pgtap; +diff --git a/test/schedule/main.sch b/test/schedule/main.sch +index a8a5fbc..0463fc4 100644 +--- a/test/schedule/main.sch ++++ b/test/schedule/main.sch +@@ -1,2 +1 @@ +-test: build + test: create diff --git a/docker-compose/ext-src/pgtap-src/test-upgrade.sh b/docker-compose/ext-src/pgtap-src/test-upgrade.sh new file mode 100755 index 0000000000..a8c43dd010 --- /dev/null +++ b/docker-compose/ext-src/pgtap-src/test-upgrade.sh @@ -0,0 +1,6 @@ +#!/bin/sh +set -ex +cd "$(dirname ${0})" +patch -p1 Date: Thu, 13 Feb 2025 14:38:02 -0500 Subject: [PATCH 468/583] fix(pageserver): ensure all basebackup client errors are caught (#10793) ## Problem We didn't catch all client errors causing alerts. ## Summary of changes Client errors should be wrapped with ClientError so that it doesn't fire alerts. 
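
Most of the diff below is mechanical `.map_err` plumbing. Distilled into a standalone sketch (the enum mirrors the real one, but the `thiserror`/`anyhow` wiring here is illustrative rather than the pageserver's actual code), the pattern is: client-side I/O failures carry a static call-site tag and are classified as disconnects, so they can be logged without firing server-error alerts.

```rust
// Sketch of the classification idea; thiserror and anyhow are assumed dependencies.
use std::io;

#[derive(Debug, thiserror::Error)]
enum BasebackupError {
    #[error("basebackup pageserver error {0:#}")]
    Server(#[from] anyhow::Error),
    #[error("basebackup client error {0:#} when {1}")]
    Client(#[source] io::Error, &'static str),
}

// Client errors are the caller's problem (disconnects etc.) and should not alert.
fn is_client_error(err: &BasebackupError) -> bool {
    matches!(err, BasebackupError::Client(..))
}

fn main() {
    let err = BasebackupError::Client(
        io::Error::new(io::ErrorKind::BrokenPipe, "peer went away"),
        "send_tarball,flush",
    );
    assert!(is_client_error(&err));
    println!("{err}");
}
```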
Signed-off-by: Alex Chi Z --- pageserver/src/basebackup.rs | 57 ++++++++++++++++++---------------- pageserver/src/page_service.rs | 13 +++++--- 2 files changed, 38 insertions(+), 32 deletions(-) diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index a6087920fd..25078b57c8 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -42,8 +42,8 @@ use utils::lsn::Lsn; pub enum BasebackupError { #[error("basebackup pageserver error {0:#}")] Server(#[from] anyhow::Error), - #[error("basebackup client error {0:#}")] - Client(#[source] io::Error), + #[error("basebackup client error {0:#} when {1}")] + Client(#[source] io::Error, &'static str), } /// Create basebackup with non-rel data in it. @@ -234,7 +234,7 @@ where self.ar .append(&header, self.buf.as_slice()) .await - .map_err(BasebackupError::Client)?; + .map_err(|e| BasebackupError::Client(e, "flush"))?; self.total_blocks += nblocks; debug!("Added to basebackup slru {} relsize {}", segname, nblocks); @@ -273,9 +273,9 @@ where for dir in subdirs.iter() { let header = new_tar_header_dir(dir)?; self.ar - .append(&header, &mut io::empty()) + .append(&header, io::empty()) .await - .context("could not add directory to basebackup tarball")?; + .map_err(|e| BasebackupError::Client(e, "send_tarball"))?; } // Send config files. @@ -286,13 +286,13 @@ where self.ar .append(&header, data) .await - .context("could not add config file to basebackup tarball")?; + .map_err(|e| BasebackupError::Client(e, "send_tarball,pg_hba.conf"))?; } else { let header = new_tar_header(filepath, 0)?; self.ar - .append(&header, &mut io::empty()) + .append(&header, io::empty()) .await - .context("could not add config file to basebackup tarball")?; + .map_err(|e| BasebackupError::Client(e, "send_tarball,add_config_file"))?; } } if !lazy_slru_download { @@ -406,7 +406,7 @@ where self.ar .append(&header, &*content) .await - .context("could not add aux file to basebackup tarball")?; + .map_err(|e| BasebackupError::Client(e, "send_tarball,add_aux_file"))?; } if min_restart_lsn != Lsn::MAX { @@ -419,7 +419,7 @@ where self.ar .append(&header, &data[..]) .await - .context("could not add restart.lsn file to basebackup tarball")?; + .map_err(|e| BasebackupError::Client(e, "send_tarball,restart.lsn"))?; } for xid in self .timeline @@ -451,9 +451,9 @@ where let crc32 = crc32c::crc32c(&content); content.extend_from_slice(&crc32.to_le_bytes()); let header = new_tar_header("pg_logical/replorigin_checkpoint", content.len() as u64)?; - self.ar.append(&header, &*content).await.context( - "could not add pg_logical/replorigin_checkpoint file to basebackup tarball", - )?; + self.ar.append(&header, &*content).await.map_err(|e| { + BasebackupError::Client(e, "send_tarball,pg_logical/replorigin_checkpoint") + })?; } fail_point!("basebackup-before-control-file", |_| { @@ -464,7 +464,10 @@ where // Generate pg_control and bootstrap WAL segment. 
self.add_pgcontrol_file().await?; - self.ar.finish().await.map_err(BasebackupError::Client)?; + self.ar + .finish() + .await + .map_err(|e| BasebackupError::Client(e, "send_tarball,finish"))?; debug!("all tarred up!"); Ok(()) } @@ -482,9 +485,9 @@ where let file_name = dst.to_segfile_name(0); let header = new_tar_header(&file_name, 0)?; self.ar - .append(&header, &mut io::empty()) + .append(&header, io::empty()) .await - .map_err(BasebackupError::Client)?; + .map_err(|e| BasebackupError::Client(e, "add_rel,empty"))?; return Ok(()); } @@ -515,7 +518,7 @@ where self.ar .append(&header, segment_data.as_slice()) .await - .map_err(BasebackupError::Client)?; + .map_err(|e| BasebackupError::Client(e, "add_rel,segment"))?; seg += 1; startblk = endblk; @@ -566,7 +569,7 @@ where self.ar .append(&header, pg_version_str.as_bytes()) .await - .map_err(BasebackupError::Client)?; + .map_err(|e| BasebackupError::Client(e, "add_dbdir,PG_VERSION"))?; info!("timeline.pg_version {}", self.timeline.pg_version); @@ -576,7 +579,7 @@ where self.ar .append(&header, &img[..]) .await - .map_err(BasebackupError::Client)?; + .map_err(|e| BasebackupError::Client(e, "add_dbdir,global/pg_filenode.map"))?; } else { warn!("global/pg_filenode.map is missing"); } @@ -612,9 +615,9 @@ where let path = format!("base/{}", dbnode); let header = new_tar_header_dir(&path)?; self.ar - .append(&header, &mut io::empty()) + .append(&header, io::empty()) .await - .map_err(BasebackupError::Client)?; + .map_err(|e| BasebackupError::Client(e, "add_dbdir,base"))?; if let Some(img) = relmap_img { let dst_path = format!("base/{}/PG_VERSION", dbnode); @@ -627,14 +630,14 @@ where self.ar .append(&header, pg_version_str.as_bytes()) .await - .map_err(BasebackupError::Client)?; + .map_err(|e| BasebackupError::Client(e, "add_dbdir,base/PG_VERSION"))?; let relmap_path = format!("base/{}/pg_filenode.map", dbnode); let header = new_tar_header(&relmap_path, img.len() as u64)?; self.ar .append(&header, &img[..]) .await - .map_err(BasebackupError::Client)?; + .map_err(|e| BasebackupError::Client(e, "add_dbdir,base/pg_filenode.map"))?; } }; Ok(()) @@ -663,7 +666,7 @@ where self.ar .append(&header, &buf[..]) .await - .map_err(BasebackupError::Client)?; + .map_err(|e| BasebackupError::Client(e, "add_twophase_file"))?; Ok(()) } @@ -693,7 +696,7 @@ where zenith_signal.as_bytes(), ) .await - .map_err(BasebackupError::Client)?; + .map_err(|e| BasebackupError::Client(e, "add_pgcontrol_file,zenith.signal"))?; let checkpoint_bytes = self .timeline @@ -718,7 +721,7 @@ where self.ar .append(&header, &pg_control_bytes[..]) .await - .map_err(BasebackupError::Client)?; + .map_err(|e| BasebackupError::Client(e, "add_pgcontrol_file,pg_control"))?; //send wal segment let segno = self.lsn.segment_number(WAL_SEGMENT_SIZE); @@ -742,7 +745,7 @@ where self.ar .append(&header, &wal_seg[..]) .await - .map_err(BasebackupError::Client)?; + .map_err(|e| BasebackupError::Client(e, "add_pgcontrol_file,wal_segment"))?; Ok(()) } } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 025519d0ec..bc0ed4198b 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -2050,7 +2050,8 @@ impl PageServerHandler { { fn map_basebackup_error(err: BasebackupError) -> QueryError { match err { - BasebackupError::Client(e) => QueryError::Disconnected(ConnectionError::Io(e)), + // TODO: passthrough the error site to the final error message? 
+ BasebackupError::Client(e, _) => QueryError::Disconnected(ConnectionError::Io(e)), BasebackupError::Server(e) => QueryError::Other(e), } } @@ -2151,10 +2152,12 @@ impl PageServerHandler { .await .map_err(map_basebackup_error)?; } - writer - .flush() - .await - .map_err(|e| map_basebackup_error(BasebackupError::Client(e)))?; + writer.flush().await.map_err(|e| { + map_basebackup_error(BasebackupError::Client( + e, + "handle_basebackup_request,flush", + )) + })?; } pgb.write_message_noflush(&BeMessage::CopyDone) From 487f3202feb740fe71d8e4bf539befa676e5372e Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 13 Feb 2025 21:53:39 +0100 Subject: [PATCH 469/583] pageserver read path: abort on fatal IO errors from disk / filesystem (#10786) Before this PR, an IO error returned from the kernel, e.g., due to a bad disk, would get bubbled up, all the way to a user-visible query failing. This is against the IO error handling policy where we have established and is hence being rectified in this PR. [[(internal Policy document link)]](https://github.com/neondatabase/docs/blob/bef44149f746d6705c709b6d9c5e342c0ecac49c/src/storage/handling_io_and_logical_errors.md#L33-L35) The practice on the write path seems to be that we call `maybe_fatal_err()` or `fatal_err()` fairly high up the stack. That is, regardless of whether std::fs, tokio::fs, or VirtualFile is used to perform the IO. For the read path, I choose a centralized approach in this PR by checking for errors as close to the kernel interface as possible. I believe this is better for long-term consistency. To mitigate the problem of missing context if we abort so far down in the stack, the `on_fatal_io_error` now captures and logs a backtrace. I grepped the pageserver code base for `fs::read` to convince myself that all non-VirtualFile reads already handle IO errors according to policy. Refs - fixes https://github.com/neondatabase/neon/issues/10454 --- pageserver/src/virtual_file.rs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 9d539198c7..c966ad813f 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -496,7 +496,8 @@ pub(crate) fn is_fatal_io_error(e: &std::io::Error) -> bool { /// bad storage or bad configuration, and we can't fix that from inside /// a running process. pub(crate) fn on_fatal_io_error(e: &std::io::Error, context: &str) -> ! 
{ - tracing::error!("Fatal I/O error: {e}: {context})"); + let backtrace = std::backtrace::Backtrace::force_capture(); + tracing::error!("Fatal I/O error: {e}: {context})\n{backtrace}"); std::process::abort(); } @@ -947,13 +948,18 @@ impl VirtualFileInner { where Buf: tokio_epoll_uring::IoBufMut + Send, { - let file_guard = match self.lock_file().await { + let file_guard = match self + .lock_file() + .await + .maybe_fatal_err("lock_file inside VirtualFileInner::read_at") + { Ok(file_guard) => file_guard, Err(e) => return (buf, Err(e)), }; observe_duration!(StorageIoOperation::Read, { let ((_file_guard, buf), res) = io_engine::get().read_at(file_guard, offset, buf).await; + let res = res.maybe_fatal_err("io_engine read_at inside VirtualFileInner::read_at"); if let Ok(size) = res { STORAGE_IO_SIZE .with_label_values(&[ From 5008324460b8408a93c57466742f5396bef980a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 13 Feb 2025 21:55:53 +0100 Subject: [PATCH 470/583] Fix utilization URL and ensure heartbeats work (#10811) There was a typo in the name of the utilization endpoint URL, fix it. Also, ensure that the heartbeat mechanism actually works. Related: #10583, #10429 Part of #9011 --- safekeeper/src/http/routes.rs | 2 +- test_runner/regress/test_storage_controller.py | 15 ++++++++++++--- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index a64bf1ddd8..41e30d838a 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -626,7 +626,7 @@ pub fn make_router( failpoints_handler(r, cancel).await }) }) - .get("/v1/uzilization", |r| request_span(r, utilization_handler)) + .get("/v1/utilization", |r| request_span(r, utilization_handler)) .delete("/v1/tenant/:tenant_id", |r| { request_span(r, tenant_delete_handler) }) diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 2750826aec..88d30308f7 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -3189,15 +3189,17 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder): assert len(target.get_safekeepers()) == 0 + sk_0 = env.safekeepers[0] + body = { "active": True, "id": fake_id, "created_at": "2023-10-25T09:11:25Z", "updated_at": "2024-08-28T11:32:43Z", "region_id": "aws-us-east-2", - "host": "safekeeper-333.us-east-2.aws.neon.build", - "port": 6401, - "http_port": 7676, + "host": "localhost", + "port": sk_0.port.pg, + "http_port": sk_0.port.http, "version": 5957, "availability_zone_id": "us-east-2b", } @@ -3243,6 +3245,13 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder): # Ensure idempotency target.safekeeper_scheduling_policy(inserted["id"], "Decomissioned") + def storcon_heartbeat(): + assert env.storage_controller.log_contains( + "Heartbeat round complete for 1 safekeepers, 0 offline" + ) + + wait_until(storcon_heartbeat) + def eq_safekeeper_records(a: dict[str, Any], b: dict[str, Any]) -> bool: compared = [dict(a), dict(b)] From 3e8bf2159d3da2ddfe431ebbb58e680134dc7656 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 13 Feb 2025 22:03:47 +0000 Subject: [PATCH 471/583] CI(build-and-test): run `benchmarks` after `deploy` job (#10791) ## Problem `benchmarks` is a long-running and non-blocking job. 
If, on Staging, a deploy-blocking job fails, restarting it requires cancelling any running `benchmarks` jobs, which is a waste of CI resources and requires a couple of extra clicks for a human to do. Ref: https://neondb.slack.com/archives/C059ZC138NR/p1739292995400899 ## Summary of changes - Run `benchmarks` after `deploy` job - Handle `benchmarks` run in PRs with `run-benchmarks` label but without `deploy` job. --- .github/workflows/build_and_test.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 88cb395958..bc773600ea 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -263,8 +263,9 @@ jobs: echo "json=$(jq --compact-output '.' /tmp/benchmark_durations.json)" >> $GITHUB_OUTPUT benchmarks: - if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') - needs: [ check-permissions, build-and-test-locally, build-build-tools-image, get-benchmarks-durations ] + # `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `deploy` in PRs + if: github.ref_name == 'main' || (contains(github.event.pull_request.labels.*.name, 'run-benchmarks') && !failure() && !cancelled()) + needs: [ check-permissions, build-build-tools-image, get-benchmarks-durations, deploy ] permissions: id-token: write # aws-actions/configure-aws-credentials statuses: write From 8bdb1828c8feea6f115cb63dd1c184ceec3ceffd Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 14 Feb 2025 10:19:56 +0200 Subject: [PATCH 472/583] Perform seqscan to fill LFC chunks with data so that on-disk file size included size of table (#10775) ## Problem See https://github.com/neondatabase/neon/issues/10755 Random access pattern of pgbench leaves sparse chunks, which makes the on-disk size of file.cache unpredictable. ## Summary of changes Perform seqscan to fill LFC chunks with data so that on-disk file size included size of table. --------- Co-authored-by: Konstantin Knizhnik --- test_runner/regress/test_lfc_resize.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test_runner/regress/test_lfc_resize.py b/test_runner/regress/test_lfc_resize.py index 8762e6525b..ea7d38a3d9 100644 --- a/test_runner/regress/test_lfc_resize.py +++ b/test_runner/regress/test_lfc_resize.py @@ -72,6 +72,11 @@ def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin): thread.join() + # Fill LFC: seqscan should fetch the whole table in cache. + # It is needed for further correct evaluation of LFC file size + # (a sparse chunk of LFC takes less than 1 MB on disk). + cur.execute("select sum(abalance) from pgbench_accounts") + # Before shrinking the cache, check that it really is large now (lfc_file_size, lfc_file_blocks) = get_lfc_size() assert int(lfc_file_blocks) > 128 * 1024 From 996f0a3753fd7626d935cb327abe57056a60a06c Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 14 Feb 2025 09:57:19 +0000 Subject: [PATCH 473/583] storcon: fix eliding parameters from proxied URL labels (#10817) ## Problem We had code for stripping IDs out of proxied paths to reduce cardinality of metrics, but it was only stripping out tenant IDs, and leaving in timeline IDs and query parameters (e.g. LSN in lsn->timestamp lookups). ## Summary of changes - Use a more general regex approach. There is still some risk that a future pageserver API might include a parameter in `/the/path/`, but we control that API and it is not often extended. 
We will also alert on metrics cardinality in staging so that if we made that mistake we would notice. --- Cargo.lock | 1 + storage_controller/Cargo.toml | 1 + storage_controller/src/http.rs | 29 +++++++++++++++++++++++++---- 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 86d9603d36..74922d71c9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6462,6 +6462,7 @@ dependencies = [ "pageserver_client", "postgres_connection", "rand 0.8.5", + "regex", "reqwest", "routerify", "rustls 0.23.18", diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index 69276bfde4..a93bbdeaaf 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -34,6 +34,7 @@ reqwest = { workspace = true, features = ["stream"] } routerify.workspace = true safekeeper_api.workspace = true safekeeper_client.workspace = true +regex.workspace = true rustls-native-certs.workspace = true serde.workspace = true serde_json.workspace = true diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 1a56116cad..e3e35a6303 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -516,6 +516,17 @@ async fn handle_tenant_timeline_block_unblock_gc( json_response(StatusCode::OK, ()) } +// For metric labels where we would like to include the approximate path, but exclude high-cardinality fields like query parameters +// and tenant/timeline IDs. Since we are proxying to arbitrary paths, we don't have routing templates to +// compare to, so we can just filter out our well known ID format with regexes. +fn path_without_ids(path: &str) -> String { + static ID_REGEX: std::sync::OnceLock = std::sync::OnceLock::new(); + ID_REGEX + .get_or_init(|| regex::Regex::new(r"([0-9a-fA-F]{32}(-[0-9]{4})?|\?.*)").unwrap()) + .replace_all(path, "") + .to_string() +} + async fn handle_tenant_timeline_passthrough( service: Arc, req: Request, @@ -551,10 +562,7 @@ async fn handle_tenant_timeline_passthrough( .metrics_group .storage_controller_passthrough_request_latency; - // This is a bit awkward. We remove the param from the request - // and join the words by '_' to get a label for the request. - let just_path = path.replace(&tenant_shard_str, ""); - let path_label = just_path + let path_label = path_without_ids(&path) .split('/') .filter(|token| !token.is_empty()) .collect::>() @@ -2089,3 +2097,16 @@ pub fn make_router( ) }) } + +#[cfg(test)] +mod test { + + use super::path_without_ids; + + #[test] + fn test_path_without_ids() { + assert_eq!(path_without_ids("/v1/tenant/1a2b3344556677881122334455667788/timeline/AA223344556677881122334455667788"), "/v1/tenant//timeline/"); + assert_eq!(path_without_ids("/v1/tenant/1a2b3344556677881122334455667788-0108/timeline/AA223344556677881122334455667788"), "/v1/tenant//timeline/"); + assert_eq!(path_without_ids("/v1/tenant/1a2b3344556677881122334455667788-0108/timeline/AA223344556677881122334455667788?parameter=foo"), "/v1/tenant//timeline/"); + } +} From 878c1c7110348ef0352f4c0cd282746cd62f0fea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 14 Feb 2025 11:21:50 +0100 Subject: [PATCH 474/583] offload_timeline: check if the timeline is archived on HasChildren error (#10776) PR #10305 makes sure that there is no *actual* race, i.e. we will never attempt to offload a timeline that has just been unarchived, or similar. However, if a timeline has been unarchived and has children that are unarchived too, we will get an error log line. 
Such races can occur as in compaction we check if the timeline can be offloaded way before we attempt to offload it: the result might change in the meantime. This patch checks if the delete guard can't be obtained because the timeline has unarchived children, and if yes, it does another check for whether the timeline has become unarchived or not. If it is unarchived, it just prints an info log msg and integrates itself into the error suppression logic of the compaction calling into it. If you squint at it really closely, there is still a possible race in which we print an error log, but this one is unlikely because the timeline and its children need to be archived right after the check for whether the timeline has any unarchived children, and right before the check whether the timeline is archived. Archival involves a network operation while nothing between these two checks does that, so it's very unlikely to happen in real life. https://github.com/neondatabase/cloud/issues/23979#issuecomment-2651265729 --- pageserver/src/tenant/timeline/offload.rs | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/pageserver/src/tenant/timeline/offload.rs b/pageserver/src/tenant/timeline/offload.rs index 3b5bf8290c..93e5a1100d 100644 --- a/pageserver/src/tenant/timeline/offload.rs +++ b/pageserver/src/tenant/timeline/offload.rs @@ -7,7 +7,9 @@ use super::Timeline; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::remote_timeline_client::ShutdownIfArchivedError; use crate::tenant::timeline::delete::{make_timeline_delete_guard, TimelineDeleteGuardKind}; -use crate::tenant::{OffloadedTimeline, Tenant, TenantManifestError, TimelineOrOffloaded}; +use crate::tenant::{ + DeleteTimelineError, OffloadedTimeline, Tenant, TenantManifestError, TimelineOrOffloaded, +}; #[derive(thiserror::Error, Debug)] pub(crate) enum OffloadError { @@ -37,12 +39,25 @@ pub(crate) async fn offload_timeline( debug_assert_current_span_has_tenant_and_timeline_id(); tracing::info!("offloading archived timeline"); - let (timeline, guard) = make_timeline_delete_guard( + let delete_guard_res = make_timeline_delete_guard( tenant, timeline.timeline_id, TimelineDeleteGuardKind::Offload, - ) - .map_err(|e| OffloadError::Other(anyhow::anyhow!(e)))?; + ); + if let Err(DeleteTimelineError::HasChildren(children)) = delete_guard_res { + let is_archived = timeline.is_archived(); + if is_archived == Some(true) { + tracing::error!("timeline is archived but has non-archived children: {children:?}"); + return Err(OffloadError::NotArchived); + } + tracing::info!( + ?is_archived, + "timeline is not archived and has unarchived children" + ); + return Err(OffloadError::NotArchived); + }; + let (timeline, guard) = + delete_guard_res.map_err(|e| OffloadError::Other(anyhow::anyhow!(e)))?; let TimelineOrOffloaded::Timeline(timeline) = timeline else { tracing::error!("timeline already offloaded, but given timeline object"); From 646e011c4db9fee802386382fadb0060cbbf77d6 Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Fri, 14 Feb 2025 12:41:57 +0100 Subject: [PATCH 475/583] Tests the test-upgrade scripts themselves (#10664) ## Problem We run the compatibility tests only if we are upgrading the extension. An accidental code change may break the test itself, so we have to check this code as well. ## Summary of changes The test is scheduled once a day to save time and resources. 
--------- Co-authored-by: Alexander Bayandin --- .../force-test-extensions-upgrade.yml | 76 +++++++++++++++++++ docker-compose/test_extensions_upgrade.sh | 14 +++- 2 files changed, 87 insertions(+), 3 deletions(-) create mode 100644 .github/workflows/force-test-extensions-upgrade.yml diff --git a/.github/workflows/force-test-extensions-upgrade.yml b/.github/workflows/force-test-extensions-upgrade.yml new file mode 100644 index 0000000000..71c5158ef6 --- /dev/null +++ b/.github/workflows/force-test-extensions-upgrade.yml @@ -0,0 +1,76 @@ +name: Force Test Upgrading of Extension +on: + schedule: + # * is a special character in YAML so you have to quote this string + # ┌───────────── minute (0 - 59) + # │ ┌───────────── hour (0 - 23) + # │ │ ┌───────────── day of the month (1 - 31) + # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) + # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) + - cron: '45 2 * * *' # run once a day, timezone is utc + workflow_dispatch: # adds ability to run this manually + +defaults: + run: + shell: bash -euxo pipefail {0} + +concurrency: + # Allow only one workflow + group: ${{ github.workflow }} + cancel-in-progress: true + +permissions: + id-token: write # aws-actions/configure-aws-credentials + statuses: write + contents: read + +jobs: + regress: + strategy: + fail-fast: false + matrix: + pg-version: [16, 17] + + runs-on: small + + steps: + - uses: actions/checkout@v4 + with: + submodules: false + + - name: Get the last compute release tag + id: get-last-compute-release-tag + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + tag=$(gh api -q '[.[].tag_name | select(startswith("release-compute"))][0]'\ + -H "Accept: application/vnd.github+json" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + "/repos/${GITHUB_REPOSITORY}/releases") + echo tag=${tag} >> ${GITHUB_OUTPUT} + + - name: Test extension upgrade + timeout-minutes: 20 + env: + NEWTAG: latest + OLDTAG: ${{ steps.get-last-compute-release-tag.outputs.tag }} + PG_VERSION: ${{ matrix.pg-version }} + FORCE_ALL_UPGRADE_TESTS: true + run: ./docker-compose/test_extensions_upgrade.sh + + - name: Print logs and clean up + if: always() + run: | + docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml logs || true + docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml down + + - name: Post to the Slack channel + if: ${{ github.event.schedule && failure() }} + uses: slackapi/slack-github-action@v1 + with: + channel-id: ${{ vars.SLACK_ON_CALL_QA_STAGING_STREAM }} + slack-message: | + Test upgrading of extensions: ${{ job.status }} + <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} diff --git a/docker-compose/test_extensions_upgrade.sh b/docker-compose/test_extensions_upgrade.sh index 082b804a87..775acada1f 100755 --- a/docker-compose/test_extensions_upgrade.sh +++ b/docker-compose/test_extensions_upgrade.sh @@ -11,6 +11,7 @@ if [ -z ${OLDTAG+x} ] || [ -z ${NEWTAG+x} ] || [ -z "${OLDTAG}" ] || [ -z "${NEW exit 1 fi export PG_VERSION=${PG_VERSION:-16} +export PG_TEST_VERSION=${PG_VERSION} function wait_for_ready { TIME=0 while ! 
docker compose logs compute_is_ready | grep -q "accepting connections" && [ ${TIME} -le 300 ] ; do @@ -59,8 +60,12 @@ docker compose cp ext-src neon-test-extensions:/ docker compose exec neon-test-extensions psql -c "DROP DATABASE IF EXISTS contrib_regression" docker compose exec neon-test-extensions psql -c "CREATE DATABASE contrib_regression" create_extensions "${EXTNAMES}" -query="select pge.extname from pg_extension pge join (select key as extname, value as extversion from json_each_text('${new_vers}')) x on pge.extname=x.extname and pge.extversion <> x.extversion" -exts=$(docker compose exec neon-test-extensions psql -Aqt -d contrib_regression -c "$query") +if [ "${FORCE_ALL_UPGRADE_TESTS:-false}" = true ]; then + exts="${EXTNAMES}" +else + query="select pge.extname from pg_extension pge join (select key as extname, value as extversion from json_each_text('${new_vers}')) x on pge.extname=x.extname and pge.extversion <> x.extversion" + exts=$(docker compose exec neon-test-extensions psql -Aqt -d contrib_regression -c "$query") +fi if [ -z "${exts}" ]; then echo "No extensions were upgraded" else @@ -88,7 +93,10 @@ else exit 1 fi docker compose exec neon-test-extensions psql -d contrib_regression -c "\dx ${ext}" - docker compose exec neon-test-extensions sh -c /ext-src/${EXTDIR}/test-upgrade.sh + if ! docker compose exec neon-test-extensions sh -c /ext-src/${EXTDIR}/test-upgrade.sh; then + docker compose exec neon-test-extensions cat /ext-src/${EXTDIR}/regression.diffs + exit 1 + fi docker compose exec neon-test-extensions psql -d contrib_regression -c "alter extension ${ext} update" docker compose exec neon-test-extensions psql -d contrib_regression -c "\dx ${ext}" done From da7496e1eef145253419ae699744353c79008047 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Fri, 14 Feb 2025 13:34:09 +0100 Subject: [PATCH 476/583] proxy: Post-refactor + future clippy lint cleanup (#10824) * Clean up deps and code after logging and binary refactor * Also include future clippy lint cleanup --- Cargo.lock | 10 -------- proxy/Cargo.toml | 4 ---- proxy/src/auth/backend/console_redirect.rs | 5 ++-- proxy/src/auth/backend/jwt.rs | 8 +++---- proxy/src/binary/local_proxy.rs | 28 +++++++++++----------- proxy/src/binary/pg_sni_router.rs | 13 +++++----- proxy/src/binary/proxy.rs | 24 ++++++++++--------- proxy/src/cache/endpoints.rs | 2 +- proxy/src/compute.rs | 4 ++-- proxy/src/console_redirect_proxy.rs | 2 +- proxy/src/control_plane/mod.rs | 3 +-- proxy/src/logging.rs | 3 +-- proxy/src/protocol2.rs | 4 ++-- proxy/src/proxy/connect_compute.rs | 2 +- proxy/src/proxy/mod.rs | 2 +- proxy/src/redis/notifications.rs | 4 ++-- proxy/src/serverless/backend.rs | 2 +- 17 files changed, 53 insertions(+), 67 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 74922d71c9..287201b4e0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1029,12 +1029,6 @@ dependencies = [ "generic-array", ] -[[package]] -name = "boxcar" -version = "0.2.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2721c3c5a6f0e7f7e607125d963fedeb765f545f67adc9d71ed934693881eb42" - [[package]] name = "bstr" version = "1.5.0" @@ -4929,7 +4923,6 @@ dependencies = [ "aws-sdk-iam", "aws-sigv4", "base64 0.13.1", - "boxcar", "bstr", "bytes", "camino", @@ -4981,7 +4974,6 @@ dependencies = [ "postgres-protocol2", "postgres_backend", "pq_proto", - "prometheus", "rand 0.8.5", "rand_distr", "rcgen", @@ -5006,7 +4998,6 @@ dependencies = [ "smallvec", "smol_str", "socket2", - "strum", "strum_macros", "subtle", "thiserror 1.0.69", @@ 
-5021,7 +5012,6 @@ dependencies = [ "tracing", "tracing-log", "tracing-opentelemetry", - "tracing-serde", "tracing-subscriber", "tracing-utils", "try-lock", diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 3aa6ac3a76..6a381bf094 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -19,7 +19,6 @@ aws-config.workspace = true aws-sdk-iam.workspace = true aws-sigv4.workspace = true base64.workspace = true -boxcar = "0.2.8" bstr.workspace = true bytes = { workspace = true, features = ["serde"] } camino.workspace = true @@ -63,7 +62,6 @@ postgres_backend.workspace = true postgres-client = { package = "tokio-postgres2", path = "../libs/proxy/tokio-postgres2" } postgres-protocol = { package = "postgres-protocol2", path = "../libs/proxy/postgres-protocol2" } pq_proto.workspace = true -prometheus.workspace = true rand.workspace = true regex.workspace = true remote_storage = { version = "0.1", path = "../libs/remote_storage/" } @@ -81,7 +79,6 @@ sha2 = { workspace = true, features = ["asm", "oid"] } smol_str.workspace = true smallvec.workspace = true socket2.workspace = true -strum.workspace = true strum_macros.workspace = true subtle.workspace = true thiserror.workspace = true @@ -95,7 +92,6 @@ tracing-subscriber.workspace = true tracing-utils.workspace = true tracing.workspace = true tracing-log.workspace = true -tracing-serde.workspace = true tracing-opentelemetry.workspace = true try-lock.workspace = true typed-json.workspace = true diff --git a/proxy/src/auth/backend/console_redirect.rs b/proxy/src/auth/backend/console_redirect.rs index 9be29c38c9..7503b4eac9 100644 --- a/proxy/src/auth/backend/console_redirect.rs +++ b/proxy/src/auth/backend/console_redirect.rs @@ -140,9 +140,8 @@ async fn authenticate( let (psql_session_id, waiter) = loop { let psql_session_id = new_psql_session_id(); - match control_plane::mgmt::get_waiter(&psql_session_id) { - Ok(waiter) => break (psql_session_id, waiter), - Err(_e) => continue, + if let Ok(waiter) = control_plane::mgmt::get_waiter(&psql_session_id) { + break (psql_session_id, waiter); } }; diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index e05a693cee..5d032c0deb 100644 --- a/proxy/src/auth/backend/jwt.rs +++ b/proxy/src/auth/backend/jwt.rs @@ -220,11 +220,11 @@ async fn fetch_jwks( } impl JwkCacheEntryLock { - async fn acquire_permit<'a>(self: &'a Arc) -> JwkRenewalPermit<'a> { + async fn acquire_permit(self: &Arc) -> JwkRenewalPermit<'_> { JwkRenewalPermit::acquire_permit(self).await } - fn try_acquire_permit<'a>(self: &'a Arc) -> Option> { + fn try_acquire_permit(self: &Arc) -> Option> { JwkRenewalPermit::try_acquire_permit(self) } @@ -393,7 +393,7 @@ impl JwkCacheEntryLock { verify_rsa_signature(header_payload.as_bytes(), &sig, key, &header.algorithm)?; } key => return Err(JwtError::UnsupportedKeyType(key.into())), - }; + } tracing::debug!(?payload, "JWT signature valid with claims"); @@ -510,7 +510,7 @@ fn verify_rsa_signature( key.verify(data, &sig)?; } _ => return Err(JwtError::InvalidRsaSigningAlgorithm), - }; + } Ok(()) } diff --git a/proxy/src/binary/local_proxy.rs b/proxy/src/binary/local_proxy.rs index e0d8515375..4ab11f828c 100644 --- a/proxy/src/binary/local_proxy.rs +++ b/proxy/src/binary/local_proxy.rs @@ -4,6 +4,20 @@ use std::str::FromStr; use std::sync::Arc; use std::time::Duration; +use anyhow::{bail, ensure, Context}; +use camino::{Utf8Path, Utf8PathBuf}; +use clap::Parser; +use compute_api::spec::LocalProxySpec; +use futures::future::Either; +use thiserror::Error; +use tokio::net::TcpListener; +use 
tokio::sync::Notify; +use tokio::task::JoinSet; +use tokio_util::sync::CancellationToken; +use tracing::{debug, error, info, warn}; +use utils::sentry_init::init_sentry; +use utils::{pid_file, project_build_tag, project_git_version}; + use crate::auth::backend::jwt::JwkCache; use crate::auth::backend::local::{LocalBackend, JWKS_ROLE_MAP}; use crate::auth::{self}; @@ -25,24 +39,10 @@ use crate::serverless::{self, GlobalConnPoolOptions}; use crate::tls::client_config::compute_client_config_with_root_certs; use crate::types::RoleName; use crate::url::ApiUrl; -use anyhow::{bail, ensure, Context}; -use camino::{Utf8Path, Utf8PathBuf}; -use compute_api::spec::LocalProxySpec; -use futures::future::Either; project_git_version!(GIT_VERSION); project_build_tag!(BUILD_TAG); -use clap::Parser; -use thiserror::Error; -use tokio::net::TcpListener; -use tokio::sync::Notify; -use tokio::task::JoinSet; -use tokio_util::sync::CancellationToken; -use tracing::{debug, error, info, warn}; -use utils::sentry_init::init_sentry; -use utils::{pid_file, project_build_tag, project_git_version}; - /// Neon proxy/router #[derive(Parser)] #[command(version = GIT_VERSION, about)] diff --git a/proxy/src/binary/pg_sni_router.rs b/proxy/src/binary/pg_sni_router.rs index 235e9674c6..94e771a61c 100644 --- a/proxy/src/binary/pg_sni_router.rs +++ b/proxy/src/binary/pg_sni_router.rs @@ -5,12 +5,6 @@ /// the outside. Similar to an ingress controller for HTTPS. use std::{net::SocketAddr, sync::Arc}; -use crate::context::RequestContext; -use crate::metrics::{Metrics, ThreadPoolMetrics}; -use crate::protocol2::ConnectionInfo; -use crate::proxy::{copy_bidirectional_client_compute, run_until_cancelled, ErrorSource}; -use crate::stream::{PqStream, Stream}; -use crate::tls::TlsServerEndPoint; use anyhow::{anyhow, bail, ensure, Context}; use clap::Arg; use futures::future::Either; @@ -25,6 +19,13 @@ use tracing::{error, info, Instrument}; use utils::project_git_version; use utils::sentry_init::init_sentry; +use crate::context::RequestContext; +use crate::metrics::{Metrics, ThreadPoolMetrics}; +use crate::protocol2::ConnectionInfo; +use crate::proxy::{copy_bidirectional_client_compute, run_until_cancelled, ErrorSource}; +use crate::stream::{PqStream, Stream}; +use crate::tls::TlsServerEndPoint; + project_git_version!(GIT_VERSION); fn cli() -> clap::Command { diff --git a/proxy/src/binary/proxy.rs b/proxy/src/binary/proxy.rs index e38c49ca10..b72799df54 100644 --- a/proxy/src/binary/proxy.rs +++ b/proxy/src/binary/proxy.rs @@ -3,6 +3,16 @@ use std::pin::pin; use std::sync::Arc; use std::time::Duration; +use anyhow::bail; +use futures::future::Either; +use remote_storage::RemoteStorageConfig; +use tokio::net::TcpListener; +use tokio::task::JoinSet; +use tokio_util::sync::CancellationToken; +use tracing::{info, warn, Instrument}; +use utils::sentry_init::init_sentry; +use utils::{project_build_tag, project_git_version}; + use crate::auth::backend::jwt::JwkCache; use crate::auth::backend::{AuthRateLimiter, ConsoleRedirectBackend, MaybeOwned}; use crate::cancellation::{handle_cancel_messages, CancellationHandler}; @@ -24,15 +34,6 @@ use crate::serverless::cancel_set::CancelSet; use crate::serverless::GlobalConnPoolOptions; use crate::tls::client_config::compute_client_config_with_root_certs; use crate::{auth, control_plane, http, serverless, usage_metrics}; -use anyhow::bail; -use futures::future::Either; -use remote_storage::RemoteStorageConfig; -use tokio::net::TcpListener; -use tokio::task::JoinSet; -use 
tokio_util::sync::CancellationToken; -use tracing::{info, warn, Instrument}; -use utils::sentry_init::init_sentry; -use utils::{project_build_tag, project_git_version}; project_git_version!(GIT_VERSION); project_build_tag!(BUILD_TAG); @@ -303,7 +304,7 @@ pub async fn run() -> anyhow::Result<()> { match auth_backend { Either::Left(auth_backend) => info!("Authentication backend: {auth_backend}"), Either::Right(auth_backend) => info!("Authentication backend: {auth_backend:?}"), - }; + } info!("Using region: {}", args.aws_region); // TODO: untangle the config args @@ -803,9 +804,10 @@ fn build_auth_backend( mod tests { use std::time::Duration; - use crate::rate_limiter::RateBucketInfo; use clap::Parser; + use crate::rate_limiter::RateBucketInfo; + #[test] fn parse_endpoint_rps_limit() { let config = super::ProxyCliArgs::parse_from([ diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs index b5c42cd23d..8ec1a4648b 100644 --- a/proxy/src/cache/endpoints.rs +++ b/proxy/src/cache/endpoints.rs @@ -242,7 +242,7 @@ impl EndpointsCache { }); tracing::error!("error parsing value {value:?}: {err:?}"); } - }; + } } if total.is_power_of_two() { tracing::debug!("endpoints read {}", total); diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index d71465765f..5447a4a4c0 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -137,8 +137,8 @@ impl ConnCfg { match k { // Only set `user` if it's not present in the config. // Console redirect auth flow takes username from the console's response. - "user" if self.user_is_set() => continue, - "database" if self.db_is_set() => continue, + "user" if self.user_is_set() => {} + "database" if self.db_is_set() => {} "options" => { if let Some(options) = filtered_options(v) { self.set_param(k, &options); diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index c4548a7ddd..1044f5f8e2 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -82,7 +82,7 @@ pub async fn task_main( error!("per-client task finished with an error: failed to set socket option: {e:#}"); return; } - }; + } let ctx = RequestContext::new( session_id, diff --git a/proxy/src/control_plane/mod.rs b/proxy/src/control_plane/mod.rs index f92e4f3f60..89ec4f9b33 100644 --- a/proxy/src/control_plane/mod.rs +++ b/proxy/src/control_plane/mod.rs @@ -19,8 +19,7 @@ use crate::cache::{Cached, TimedLru}; use crate::config::ComputeConfig; use crate::context::RequestContext; use crate::control_plane::messages::{ControlPlaneErrorMessage, MetricsAuxInfo}; -use crate::intern::AccountIdInt; -use crate::intern::ProjectIdInt; +use crate::intern::{AccountIdInt, ProjectIdInt}; use crate::types::{EndpointCacheKey, EndpointId}; use crate::{compute, scram}; diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs index 97c9f5a59c..fbd4811b54 100644 --- a/proxy/src/logging.rs +++ b/proxy/src/logging.rs @@ -7,9 +7,8 @@ use chrono::{DateTime, Utc}; use opentelemetry::trace::TraceContextExt; use scopeguard::defer; use serde::ser::{SerializeMap, Serializer}; -use tracing::span; use tracing::subscriber::Interest; -use tracing::{callsite, Event, Metadata, Span, Subscriber}; +use tracing::{callsite, span, Event, Metadata, Span, Subscriber}; use tracing_opentelemetry::OpenTelemetrySpanExt; use tracing_subscriber::filter::{EnvFilter, LevelFilter}; use tracing_subscriber::fmt::format::{Format, Full}; diff --git a/proxy/src/protocol2.rs b/proxy/src/protocol2.rs index 0dc97b7097..74a15d9bf4 100644 --- a/proxy/src/protocol2.rs +++ 
b/proxy/src/protocol2.rs @@ -119,7 +119,7 @@ pub(crate) async fn read_proxy_protocol( // if no more bytes available then exit if bytes_read == 0 { return Ok((ChainRW { inner: read, buf }, ConnectHeader::Missing)); - }; + } // check if we have enough bytes to continue if let Some(header) = buf.try_get::() { @@ -169,7 +169,7 @@ fn process_proxy_payload( header.version_and_command ), )), - }; + } let size_err = "invalid proxy protocol length. payload not large enough to fit requested IP addresses"; diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index dd145e6bb2..26fb1754bf 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -198,7 +198,7 @@ where warn!(error = ?e, num_retries, retriable = true, COULD_NOT_CONNECT); } - }; + } let wait_duration = retry_after(num_retries, compute.retry); num_retries += 1; diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index 8a407c8119..2a406fcb34 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -118,7 +118,7 @@ pub async fn task_main( error!("per-client task finished with an error: failed to set socket option: {e:#}"); return; } - }; + } let ctx = RequestContext::new( session_id, diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index 1a7024588a..5f9f2509e2 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -169,7 +169,7 @@ impl MessageHandler { }); tracing::error!("broken message: {e}"); } - }; + } return Ok(()); } Ok(msg) => msg, @@ -180,7 +180,7 @@ impl MessageHandler { match serde_json::from_str::(&payload) { Ok(header) => tracing::error!(topic = header.topic, "broken message: {e}"), Err(_) => tracing::error!("broken message: {e}"), - }; + } return Ok(()); } }; diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index edc2935618..6a59d413c4 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -651,7 +651,7 @@ async fn connect_http2( e, ))); } - }; + } }; let (client, connection) = hyper::client::conn::http2::Builder::new(TokioExecutor::new()) From a82a6631fdfb4471aeb090c8cee9e0e53b4f96ad Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 14 Feb 2025 13:25:43 +0000 Subject: [PATCH 477/583] storage controller: prioritize reconciles for user-facing operations (#10822) ## Problem Some situations may produce a large number of pending reconciles. If we experience an issue where reconciles are processed more slowly than expected, that can prevent us responding promptly to user requests like tenant/timeline CRUD. This is a cleaner implementation of the hotfix in https://github.com/neondatabase/neon/pull/10815 ## Summary of changes - Introduce a second semaphore for high priority tasks, with configurable units (default 256). The intent is that in practical situations these user-facing requests should never have to wait. - Use the high priority semaphore for: tenant/timeline CRUD, and shard splitting operations. Use normal priority for everything else. 
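
For illustration only, here is a minimal standalone sketch of the permit-acquisition fallback described above. The names (`Priority`, `try_acquire`) are hypothetical; the actual change lives in `Service::get_reconciler_units` in the diff below. The key behaviour is that high-priority work may steal units from the normal-priority semaphore when its own semaphore is exhausted, while normal-priority work never touches the high-priority pool.

```
use std::sync::Arc;
use tokio::sync::{OwnedSemaphorePermit, Semaphore, TryAcquireError};

enum Priority {
    Normal,
    High,
}

/// Sketch: acquire a reconcile permit from the pool matching `priority`,
/// letting high-priority callers fall back to the normal pool if needed.
fn try_acquire(
    normal: &Arc<Semaphore>,
    high: &Arc<Semaphore>,
    priority: Priority,
) -> Result<OwnedSemaphorePermit, TryAcquireError> {
    match priority {
        Priority::Normal => normal.clone().try_acquire_owned(),
        Priority::High => match high.clone().try_acquire_owned() {
            Ok(permit) => Ok(permit),
            // High-priority tasks steal from the normal-priority pool rather
            // than waiting behind background work.
            Err(TryAcquireError::NoPermits) => normal.clone().try_acquire_owned(),
            // Any other error (e.g. a closed semaphore) is propagated unchanged.
            Err(e) => Err(e),
        },
    }
}
```

As in the real patch, only `NoPermits` triggers the fallback; other errors propagate so shutdown behaviour is unaffected.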
--- storage_controller/src/main.rs | 12 +- storage_controller/src/reconciler.rs | 33 ++++- storage_controller/src/service.rs | 124 ++++++++++++++---- .../src/service/chaos_injector.rs | 6 +- 4 files changed, 143 insertions(+), 32 deletions(-) diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 07279a67ff..ea6bc38e89 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -12,7 +12,8 @@ use storage_controller::persistence::Persistence; use storage_controller::service::chaos_injector::ChaosInjector; use storage_controller::service::{ Config, Service, HEARTBEAT_INTERVAL_DEFAULT, LONG_RECONCILE_THRESHOLD_DEFAULT, - MAX_OFFLINE_INTERVAL_DEFAULT, MAX_WARMING_UP_INTERVAL_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT, + MAX_OFFLINE_INTERVAL_DEFAULT, MAX_WARMING_UP_INTERVAL_DEFAULT, + PRIORITY_RECONCILER_CONCURRENCY_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT, }; use tokio::signal::unix::SignalKind; use tokio_util::sync::CancellationToken; @@ -75,10 +76,14 @@ struct Cli { #[arg(long)] split_threshold: Option, - /// Maximum number of reconcilers that may run in parallel + /// Maximum number of normal-priority reconcilers that may run in parallel #[arg(long)] reconciler_concurrency: Option, + /// Maximum number of high-priority reconcilers that may run in parallel + #[arg(long)] + priority_reconciler_concurrency: Option, + /// How long to wait for the initial database connection to be available. #[arg(long, default_value = "5s")] db_connect_timeout: humantime::Duration, @@ -289,6 +294,9 @@ async fn async_main() -> anyhow::Result<()> { reconciler_concurrency: args .reconciler_concurrency .unwrap_or(RECONCILER_CONCURRENCY_DEFAULT), + priority_reconciler_concurrency: args + .priority_reconciler_concurrency + .unwrap_or(PRIORITY_RECONCILER_CONCURRENCY_DEFAULT), split_threshold: args.split_threshold, neon_local_repo_dir: args.neon_local_repo_dir, max_secondary_lag_bytes: args.max_secondary_lag_bytes, diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 8c7e9b1726..48f0804926 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -91,9 +91,10 @@ pub(crate) struct ReconcilerConfigBuilder { } impl ReconcilerConfigBuilder { - pub(crate) fn new() -> Self { + /// Priority is special: you must pick one thoughtfully, do not just use 'normal' as the default + pub(crate) fn new(priority: ReconcilerPriority) -> Self { Self { - config: ReconcilerConfig::default(), + config: ReconcilerConfig::new(priority), } } @@ -129,8 +130,18 @@ impl ReconcilerConfigBuilder { } } -#[derive(Default, Debug, Copy, Clone)] +// Higher priorities are used for user-facing tasks, so that a long backlog of housekeeping work (e.g. reconciling on startup, rescheduling +// things on node changes) does not starve user-facing tasks. +#[derive(Debug, Copy, Clone)] +pub(crate) enum ReconcilerPriority { + Normal, + High, +} + +#[derive(Debug, Copy, Clone)] pub(crate) struct ReconcilerConfig { + pub(crate) priority: ReconcilerPriority, + // During live migration give up on warming-up the secondary // after this timeout. secondary_warmup_timeout: Option, @@ -145,6 +156,18 @@ pub(crate) struct ReconcilerConfig { } impl ReconcilerConfig { + /// Configs are always constructed with an explicit priority, to force callers to think about whether + /// the operation they're scheduling is high-priority or not. 
Normal priority is not a safe default, because + /// scheduling something user-facing at normal priority can result in it getting starved out by background work. + pub(crate) fn new(priority: ReconcilerPriority) -> Self { + Self { + priority, + secondary_warmup_timeout: None, + secondary_download_request_timeout: None, + tenant_creation_hint: false, + } + } + pub(crate) fn get_secondary_warmup_timeout(&self) -> Duration { const SECONDARY_WARMUP_TIMEOUT_DEFAULT: Duration = Duration::from_secs(300); self.secondary_warmup_timeout @@ -164,7 +187,9 @@ impl ReconcilerConfig { impl From<&MigrationConfig> for ReconcilerConfig { fn from(value: &MigrationConfig) -> Self { - let mut builder = ReconcilerConfigBuilder::new(); + // Run reconciler at high priority because MigrationConfig comes from human requests that should + // be presumed urgent. + let mut builder = ReconcilerConfigBuilder::new(ReconcilerPriority::High); if let Some(timeout) = value.secondary_warmup_timeout { builder = builder.secondary_warmup_timeout(timeout) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index c1da9374e4..d5713d49ee 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -30,7 +30,10 @@ use crate::{ AbortShardSplitStatus, ControllerPersistence, DatabaseResult, MetadataHealthPersistence, ShardGenerationState, TenantFilter, }, - reconciler::{ReconcileError, ReconcileUnits, ReconcilerConfig, ReconcilerConfigBuilder}, + reconciler::{ + ReconcileError, ReconcileUnits, ReconcilerConfig, ReconcilerConfigBuilder, + ReconcilerPriority, + }, safekeeper::Safekeeper, scheduler::{MaySchedule, ScheduleContext, ScheduleError, ScheduleMode}, tenant_shard::{ @@ -79,7 +82,7 @@ use pageserver_api::{ }, }; use pageserver_client::{mgmt_api, BlockUnblock}; -use tokio::sync::mpsc::error::TrySendError; +use tokio::sync::{mpsc::error::TrySendError, TryAcquireError}; use tokio_util::sync::CancellationToken; use utils::{ completion::Barrier, @@ -195,6 +198,7 @@ pub(crate) enum LeadershipStatus { } pub const RECONCILER_CONCURRENCY_DEFAULT: usize = 128; +pub const PRIORITY_RECONCILER_CONCURRENCY_DEFAULT: usize = 256; // Depth of the channel used to enqueue shards for reconciliation when they can't do it immediately. // This channel is finite-size to avoid using excessive memory if we get into a state where reconciles are finishing more slowly @@ -366,9 +370,12 @@ pub struct Config { /// and/or upon handling the re-attach request from a node. pub max_warming_up_interval: Duration, - /// How many Reconcilers may be spawned concurrently + /// How many normal-priority Reconcilers may be spawned concurrently pub reconciler_concurrency: usize, + /// How many high-priority Reconcilers may be spawned concurrently + pub priority_reconciler_concurrency: usize, + /// How large must a shard grow in bytes before we split it? /// None disables auto-splitting. pub split_threshold: Option, @@ -436,9 +443,14 @@ pub struct Service { // that transition it to/from Active. node_op_locks: IdLockMap, - // Limit how many Reconcilers we will spawn concurrently + // Limit how many Reconcilers we will spawn concurrently for normal-priority tasks such as background reconciliations + // and reconciliation on startup. reconciler_concurrency: Arc, + // Limit how many Reconcilers we will spawn concurrently for high-priority tasks such as tenant/timeline CRUD, which + // a human user might be waiting for. 
+ priority_reconciler_concurrency: Arc, + /// Queue of tenants who are waiting for concurrency limits to permit them to reconcile /// Send into this queue to promptly attempt to reconcile this shard next time units are available. /// @@ -1263,12 +1275,15 @@ impl Service { } // Maybe some other work can proceed now that this job finished. + // + // Only bother with this if we have some semaphore units available in the normal-priority semaphore (these + // reconciles are scheduled at `[ReconcilerPriority::Normal]`). if self.reconciler_concurrency.available_permits() > 0 { while let Ok(tenant_shard_id) = locked.delayed_reconcile_rx.try_recv() { let (nodes, tenants, _scheduler) = locked.parts_mut(); if let Some(shard) = tenants.get_mut(&tenant_shard_id) { shard.delayed_reconcile = false; - self.maybe_reconcile_shard(shard, nodes); + self.maybe_reconcile_shard(shard, nodes, ReconcilerPriority::Normal); } if self.reconciler_concurrency.available_permits() == 0 { @@ -1565,6 +1580,9 @@ impl Service { reconciler_concurrency: Arc::new(tokio::sync::Semaphore::new( config.reconciler_concurrency, )), + priority_reconciler_concurrency: Arc::new(tokio::sync::Semaphore::new( + config.priority_reconciler_concurrency, + )), delayed_reconcile_tx, abort_tx, startup_complete: startup_complete.clone(), @@ -2337,7 +2355,7 @@ impl Service { let waiters = { let mut locked = self.inner.write().unwrap(); let (nodes, tenants, _scheduler) = locked.parts_mut(); - let config = ReconcilerConfigBuilder::new() + let config = ReconcilerConfigBuilder::new(ReconcilerPriority::High) .tenant_creation_hint(true) .build(); tenants @@ -2812,7 +2830,8 @@ impl Service { shard.schedule(scheduler, &mut schedule_context)?; - let maybe_waiter = self.maybe_reconcile_shard(shard, nodes); + let maybe_waiter = + self.maybe_reconcile_shard(shard, nodes, ReconcilerPriority::High); if let Some(waiter) = maybe_waiter { waiters.push(waiter); } @@ -2933,7 +2952,9 @@ impl Service { let (nodes, tenants, _scheduler) = locked.parts_mut(); for (_shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) { shard.config = config.clone(); - if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) { + if let Some(waiter) = + self.maybe_reconcile_shard(shard, nodes, ReconcilerPriority::High) + { waiters.push(waiter); } } @@ -3215,7 +3236,9 @@ impl Service { debug_assert!(shard.intent.get_attached().is_none()); debug_assert!(shard.intent.get_secondary().is_empty()); - if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) { + if let Some(waiter) = + self.maybe_reconcile_shard(shard, nodes, ReconcilerPriority::High) + { detach_waiters.push(waiter); } } @@ -3367,7 +3390,7 @@ impl Service { // In case scheduling is being switched back on, try it now. 
shard.schedule(scheduler, &mut schedule_context).ok(); - self.maybe_reconcile_shard(shard, nodes); + self.maybe_reconcile_shard(shard, nodes, ReconcilerPriority::High); } Ok(()) @@ -4416,7 +4439,7 @@ impl Service { tracing::warn!("Failed to schedule {tenant_shard_id} during shard abort: {e}") } - self.maybe_reconcile_shard(shard, nodes); + self.maybe_reconcile_shard(shard, nodes, ReconcilerPriority::High); } // We don't expect any new_shard_count shards to exist here, but drop them just in case @@ -4582,7 +4605,11 @@ impl Service { tracing::warn!("Failed to schedule child shard {child}: {e}"); } // In the background, attach secondary locations for the new shards - if let Some(waiter) = self.maybe_reconcile_shard(&mut child_state, nodes) { + if let Some(waiter) = self.maybe_reconcile_shard( + &mut child_state, + nodes, + ReconcilerPriority::High, + ) { waiters.push(waiter); } @@ -4947,7 +4974,9 @@ impl Service { shard.intent.clear_secondary(scheduler); // Run Reconciler to execute detach fo secondary locations. - if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) { + if let Some(waiter) = + self.maybe_reconcile_shard(shard, nodes, ReconcilerPriority::High) + { waiters.push(waiter); } } @@ -5215,7 +5244,7 @@ impl Service { let reconciler_config = match migrate_req.migration_config { Some(cfg) => (&cfg).into(), - None => ReconcilerConfig::default(), + None => ReconcilerConfig::new(ReconcilerPriority::High), }; self.maybe_configured_reconcile_shard(shard, nodes, reconciler_config) @@ -5281,7 +5310,7 @@ impl Service { ); } - self.maybe_reconcile_shard(shard, nodes) + self.maybe_reconcile_shard(shard, nodes, ReconcilerPriority::High) }; if let Some(waiter) = waiter { @@ -5693,7 +5722,7 @@ impl Service { ) } - self.maybe_reconcile_shard(shard, nodes); + self.maybe_reconcile_shard(shard, nodes, ReconcilerPriority::Normal); } // Here we remove an existing observed location for the node we're removing, and it will @@ -6062,7 +6091,14 @@ impl Service { tracing::warn!(%tenant_shard_id, "Scheduling error when marking pageserver {} offline: {e}", node_id); } Ok(()) => { - if self.maybe_reconcile_shard(tenant_shard, nodes).is_some() { + if self + .maybe_reconcile_shard( + tenant_shard, + nodes, + ReconcilerPriority::Normal, + ) + .is_some() + { tenants_affected += 1; }; } @@ -6093,7 +6129,11 @@ impl Service { if let Some(observed_loc) = tenant_shard.observed.locations.get_mut(&node_id) { if observed_loc.conf.is_none() { - self.maybe_reconcile_shard(tenant_shard, nodes); + self.maybe_reconcile_shard( + tenant_shard, + nodes, + ReconcilerPriority::Normal, + ); } } } @@ -6457,8 +6497,36 @@ impl Service { &self, shard: &mut TenantShard, nodes: &Arc>, + priority: ReconcilerPriority, ) -> Option { - self.maybe_configured_reconcile_shard(shard, nodes, ReconcilerConfig::default()) + self.maybe_configured_reconcile_shard(shard, nodes, ReconcilerConfig::new(priority)) + } + + /// Before constructing a Reconciler, acquire semaphore units from the appropriate concurrency limit (depends on priority) + fn get_reconciler_units( + &self, + priority: ReconcilerPriority, + ) -> Result { + let units = match priority { + ReconcilerPriority::Normal => self.reconciler_concurrency.clone().try_acquire_owned(), + ReconcilerPriority::High => { + match self + .priority_reconciler_concurrency + .clone() + .try_acquire_owned() + { + Ok(u) => Ok(u), + Err(TryAcquireError::NoPermits) => { + // If the high priority semaphore is exhausted, then high priority tasks may steal units from + // the normal priority semaphore. 
+ self.reconciler_concurrency.clone().try_acquire_owned() + } + Err(e) => Err(e), + } + } + }; + + units.map(ReconcileUnits::new) } /// Wrap [`TenantShard`] reconciliation methods with acquisition of [`Gate`] and [`ReconcileUnits`], @@ -6478,8 +6546,8 @@ impl Service { } }; - let units = match self.reconciler_concurrency.clone().try_acquire_owned() { - Ok(u) => ReconcileUnits::new(u), + let units = match self.get_reconciler_units(reconciler_config.priority) { + Ok(u) => u, Err(_) => { tracing::info!(tenant_id=%shard.tenant_shard_id.tenant_id, shard_id=%shard.tenant_shard_id.shard_slug(), "Concurrency limited: enqueued for reconcile later"); @@ -6572,7 +6640,10 @@ impl Service { // Eventual consistency: if an earlier reconcile job failed, and the shard is still // dirty, spawn another rone - if self.maybe_reconcile_shard(shard, &pageservers).is_some() { + if self + .maybe_reconcile_shard(shard, &pageservers, ReconcilerPriority::Normal) + .is_some() + { reconciles_spawned += 1; } else if shard.delayed_reconcile { // Shard wanted to reconcile but for some reason couldn't. @@ -6658,7 +6729,10 @@ impl Service { tracing::info!(tenant_shard_id=%tenant_shard_id, "Applying optimization: {optimization:?}"); if shard.apply_optimization(scheduler, optimization) { optimizations_applied += 1; - if self.maybe_reconcile_shard(shard, nodes).is_some() { + if self + .maybe_reconcile_shard(shard, nodes, ReconcilerPriority::Normal) + .is_some() + { reconciles_spawned += 1; } } @@ -7208,7 +7282,7 @@ impl Service { // to not stall the operation when a cold secondary is encountered. const SECONDARY_WARMUP_TIMEOUT: Duration = Duration::from_secs(20); const SECONDARY_DOWNLOAD_REQUEST_TIMEOUT: Duration = Duration::from_secs(5); - let reconciler_config = ReconcilerConfigBuilder::new() + let reconciler_config = ReconcilerConfigBuilder::new(ReconcilerPriority::Normal) .secondary_warmup_timeout(SECONDARY_WARMUP_TIMEOUT) .secondary_download_request_timeout(SECONDARY_DOWNLOAD_REQUEST_TIMEOUT) .build(); @@ -7541,7 +7615,7 @@ impl Service { ) -> Result<(), OperationError> { const SECONDARY_WARMUP_TIMEOUT: Duration = Duration::from_secs(20); const SECONDARY_DOWNLOAD_REQUEST_TIMEOUT: Duration = Duration::from_secs(5); - let reconciler_config = ReconcilerConfigBuilder::new() + let reconciler_config = ReconcilerConfigBuilder::new(ReconcilerPriority::Normal) .secondary_warmup_timeout(SECONDARY_WARMUP_TIMEOUT) .secondary_download_request_timeout(SECONDARY_DOWNLOAD_REQUEST_TIMEOUT) .build(); diff --git a/storage_controller/src/service/chaos_injector.rs b/storage_controller/src/service/chaos_injector.rs index 91d7183fde..aa0ee0df5a 100644 --- a/storage_controller/src/service/chaos_injector.rs +++ b/storage_controller/src/service/chaos_injector.rs @@ -88,7 +88,11 @@ impl ChaosInjector { shard.intent.demote_attached(scheduler, old_location); shard.intent.promote_attached(scheduler, new_location); - self.service.maybe_reconcile_shard(shard, nodes); + self.service.maybe_reconcile_shard( + shard, + nodes, + crate::reconciler::ReconcilerPriority::Normal, + ); } async fn inject_chaos(&mut self) { From fac5db3c8de25b6f44b267365926fd122c901a44 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 14 Feb 2025 15:37:03 +0100 Subject: [PATCH 478/583] page_service: emit periodic log message while response flush is slow (#10813) The logic might seem a bit intricate / over-optimized, but I recently spent time benchmarking this code path in the context of a nightly pagebench regression (https://github.com/neondatabase/cloud/issues/21759) and 
I want to avoid regressing it any further. Ideally would also log the socket send & recv queue length like we do on the compute side in - https://github.com/neondatabase/neon/pull/10673 But that is proving difficult due to the Rust abstractions that wrap the socket fd. Work in progress on that is happening in - https://github.com/neondatabase/neon/pull/10823 Regarding production impact, I am worried at a theoretical level that the additional logging may cause a downward spiral in the case where a pageserver is slow to flush because there is not enough CPU. The logging would consume more CPU and thereby slow down flushes even more. However, I don't think this matters practically speaking. # Refs - context: https://neondb.slack.com/archives/C08DE6Q9C3B/p1739464533762049?thread_ts=1739462628.361019&cid=C08DE6Q9C3B - fixes https://github.com/neondatabase/neon/issues/10668 - part of https://github.com/neondatabase/cloud/issues/23515 # Testing Tested locally by running ``` ./target/debug/pagebench get-page-latest-lsn --num-clients=1000 --queue-depth=1000 ``` in one terminal, waiting a bit, then ``` pkill -STOP pagebench ``` then wait for slow logs to show up in `pageserver.log`. To see that the completion log message is logged, run ``` pkill -CONT pagebench ``` --- pageserver/src/metrics.rs | 42 +++++++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 13 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 983a3079e4..6a5dc3e749 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1439,27 +1439,43 @@ impl Drop for SmgrOpTimer { } impl SmgrOpFlushInProgress { - pub(crate) async fn measure(self, mut started_at: Instant, mut fut: Fut) -> O + pub(crate) async fn measure(self, started_at: Instant, mut fut: Fut) -> O where Fut: std::future::Future, { let mut fut = std::pin::pin!(fut); - // Whenever observe_guard gets called, or dropped, - // it adds the time elapsed since its last call to metrics. - // Last call is tracked in `now`. + let mut logged = false; + let mut last_counter_increment_at = started_at; let mut observe_guard = scopeguard::guard( - || { + |is_timeout| { let now = Instant::now(); - let elapsed = now - started_at; - self.global_micros - .inc_by(u64::try_from(elapsed.as_micros()).unwrap()); - self.per_timeline_micros - .inc_by(u64::try_from(elapsed.as_micros()).unwrap()); - started_at = now; + + // Increment counter + { + let elapsed_since_last_observe = now - last_counter_increment_at; + self.global_micros + .inc_by(u64::try_from(elapsed_since_last_observe.as_micros()).unwrap()); + self.per_timeline_micros + .inc_by(u64::try_from(elapsed_since_last_observe.as_micros()).unwrap()); + last_counter_increment_at = now; + } + + // Log something on every timeout, and on completion but only if we hit a timeout. 
+ if is_timeout || logged { + logged = true; + let elapsed_total = now - started_at; + let msg = if is_timeout { + "slow flush ongoing" + } else { + "slow flush completed or cancelled" + }; + let elapsed_total_secs = format!("{:.6}", elapsed_total.as_secs_f64()); + tracing::info!(elapsed_total_secs, msg); + } }, |mut observe| { - observe(); + observe(false); }, ); @@ -1467,7 +1483,7 @@ impl SmgrOpFlushInProgress { match tokio::time::timeout(Duration::from_secs(10), &mut fut).await { Ok(v) => return v, Err(_timeout) => { - (*observe_guard)(); + (*observe_guard)(true); } } } From 3d7a32f6196e87b00491fcdc4887ec9ed1bd1640 Mon Sep 17 00:00:00 2001 From: Gleb Novikov Date: Fri, 14 Feb 2025 16:10:06 +0000 Subject: [PATCH 479/583] fast import: allow restore to provided connection string (#10407) Within https://github.com/neondatabase/cloud/issues/22089 we decided that would be nice to start with import that runs dump-restore into a running compute (more on this [here](https://www.notion.so/neondatabase/2024-Jan-13-Migration-Assistant-Next-Steps-Proposal-Revised-17af189e004780228bdbcad13eeda93f?pvs=4#17af189e004780de816ccd9c13afd953)) We could do it by writing another tool or by extending existing `fast_import.rs`, we chose the latter. In this PR, I have added optional `restore_connection_string` as a cli arg and as a part of the json spec. If specified, the script will not run postgres and will just perform restore into provided connection string. TODO: - [x] fast_import.rs: - [x] cli arg in the fast_import.rs - [x] encoded connstring in json spec - [x] simplify `fn main` a little, take out too verbose stuff to some functions - [ ] ~~allow streaming from dump stdout to restore stdin~~ will do in a separate PR - [ ] ~~address https://github.com/neondatabase/neon/pull/10251#pullrequestreview-2551877845~~ will do in a separate PR - [x] tests: - [x] restore with cli arg in the fast_import.rs - [x] restore with encoded connstring in json spec in s3 - [ ] ~~test with custom dbname~~ will do in a separate PR - [ ] ~~test with s3 + pageserver + fast import binary~~ https://github.com/neondatabase/neon/pull/10487 - [ ] ~~https://github.com/neondatabase/neon/pull/10271#discussion_r1923715493~~ will do in a separate PR neondatabase/cloud#22775 --------- Co-authored-by: Eduard Dykman --- compute_tools/src/bin/fast_import.rs | 656 ++++++++++++++-------- poetry.lock | 15 +- pyproject.toml | 2 +- test_runner/fixtures/fast_import.py | 62 +- test_runner/fixtures/neon_fixtures.py | 27 + test_runner/regress/test_import_pgdata.py | 346 +++++++++++- 6 files changed, 866 insertions(+), 242 deletions(-) diff --git a/compute_tools/src/bin/fast_import.rs b/compute_tools/src/bin/fast_import.rs index 27cf1c2317..dad15d67b7 100644 --- a/compute_tools/src/bin/fast_import.rs +++ b/compute_tools/src/bin/fast_import.rs @@ -25,10 +25,10 @@ //! docker push localhost:3030/localregistry/compute-node-v14:latest //! 
``` -use anyhow::Context; +use anyhow::{bail, Context}; use aws_config::BehaviorVersion; use camino::{Utf8Path, Utf8PathBuf}; -use clap::Parser; +use clap::{Parser, Subcommand}; use compute_tools::extension_server::{get_pg_version, PostgresMajorVersion}; use nix::unistd::Pid; use tracing::{error, info, info_span, warn, Instrument}; @@ -44,32 +44,59 @@ mod s3_uri; const PG_WAIT_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(600); const PG_WAIT_RETRY_INTERVAL: std::time::Duration = std::time::Duration::from_millis(300); +#[derive(Subcommand, Debug)] +enum Command { + /// Runs local postgres (neon binary), restores into it, + /// uploads pgdata to s3 to be consumed by pageservers + Pgdata { + /// Raw connection string to the source database. Used only in tests, + /// real scenario uses encrypted connection string in spec.json from s3. + #[clap(long)] + source_connection_string: Option, + /// If specified, will not shut down the local postgres after the import. Used in local testing + #[clap(short, long)] + interactive: bool, + /// Port to run postgres on. Default is 5432. + #[clap(long, default_value_t = 5432)] + pg_port: u16, // port to run postgres on, 5432 is default + + /// Number of CPUs in the system. This is used to configure # of + /// parallel worker processes, for index creation. + #[clap(long, env = "NEON_IMPORTER_NUM_CPUS")] + num_cpus: Option, + + /// Amount of RAM in the system. This is used to configure shared_buffers + /// and maintenance_work_mem. + #[clap(long, env = "NEON_IMPORTER_MEMORY_MB")] + memory_mb: Option, + }, + + /// Runs pg_dump-pg_restore from source to destination without running local postgres. + DumpRestore { + /// Raw connection string to the source database. Used only in tests, + /// real scenario uses encrypted connection string in spec.json from s3. + #[clap(long)] + source_connection_string: Option, + /// Raw connection string to the destination database. Used only in tests, + /// real scenario uses encrypted connection string in spec.json from s3. + #[clap(long)] + destination_connection_string: Option, + }, +} + #[derive(clap::Parser)] struct Args { - #[clap(long)] + #[clap(long, env = "NEON_IMPORTER_WORKDIR")] working_directory: Utf8PathBuf, #[clap(long, env = "NEON_IMPORTER_S3_PREFIX")] s3_prefix: Option, - #[clap(long)] - source_connection_string: Option, - #[clap(short, long)] - interactive: bool, - #[clap(long)] + #[clap(long, env = "NEON_IMPORTER_PG_BIN_DIR")] pg_bin_dir: Utf8PathBuf, - #[clap(long)] + #[clap(long, env = "NEON_IMPORTER_PG_LIB_DIR")] pg_lib_dir: Utf8PathBuf, - #[clap(long)] - pg_port: Option, // port to run postgres on, 5432 is default - /// Number of CPUs in the system. This is used to configure # of - /// parallel worker processes, for index creation. - #[clap(long, env = "NEON_IMPORTER_NUM_CPUS")] - num_cpus: Option, - - /// Amount of RAM in the system. This is used to configure shared_buffers - /// and maintenance_work_mem. 
- #[clap(long, env = "NEON_IMPORTER_MEMORY_MB")] - memory_mb: Option, + #[clap(subcommand)] + command: Command, } #[serde_with::serde_as] @@ -78,6 +105,8 @@ struct Spec { encryption_secret: EncryptionSecret, #[serde_as(as = "serde_with::base64::Base64")] source_connstring_ciphertext_base64: Vec, + #[serde_as(as = "Option")] + destination_connstring_ciphertext_base64: Option>, } #[derive(serde::Deserialize)] @@ -93,192 +122,151 @@ const DEFAULT_LOCALE: &str = if cfg!(target_os = "macos") { "C.UTF-8" }; -#[tokio::main] -pub(crate) async fn main() -> anyhow::Result<()> { - utils::logging::init( - utils::logging::LogFormat::Plain, - utils::logging::TracingErrorLayerEnablement::EnableWithRustLogFilter, - utils::logging::Output::Stdout, - )?; - - info!("starting"); - - let args = Args::parse(); - - // Validate arguments - if args.s3_prefix.is_none() && args.source_connection_string.is_none() { - anyhow::bail!("either s3_prefix or source_connection_string must be specified"); - } - if args.s3_prefix.is_some() && args.source_connection_string.is_some() { - anyhow::bail!("only one of s3_prefix or source_connection_string can be specified"); - } - - let working_directory = args.working_directory; - let pg_bin_dir = args.pg_bin_dir; - let pg_lib_dir = args.pg_lib_dir; - let pg_port = args.pg_port.unwrap_or_else(|| { - info!("pg_port not specified, using default 5432"); - 5432 - }); - - // Initialize AWS clients only if s3_prefix is specified - let (aws_config, kms_client) = if args.s3_prefix.is_some() { - let config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await; - let kms = aws_sdk_kms::Client::new(&config); - (Some(config), Some(kms)) - } else { - (None, None) - }; - - // Get source connection string either from S3 spec or direct argument - let source_connection_string = if let Some(s3_prefix) = &args.s3_prefix { - let spec: Spec = { - let spec_key = s3_prefix.append("/spec.json"); - let s3_client = aws_sdk_s3::Client::new(aws_config.as_ref().unwrap()); - let object = s3_client - .get_object() - .bucket(&spec_key.bucket) - .key(spec_key.key) - .send() - .await - .context("get spec from s3")? - .body - .collect() - .await - .context("download spec body")?; - serde_json::from_slice(&object.into_bytes()).context("parse spec as json")? - }; - - match spec.encryption_secret { - EncryptionSecret::KMS { key_id } => { - let mut output = kms_client - .unwrap() - .decrypt() - .key_id(key_id) - .ciphertext_blob(aws_sdk_s3::primitives::Blob::new( - spec.source_connstring_ciphertext_base64, - )) - .send() - .await - .context("decrypt source connection string")?; - let plaintext = output - .plaintext - .take() - .context("get plaintext source connection string")?; - String::from_utf8(plaintext.into_inner()) - .context("parse source connection string as utf8")? - } - } - } else { - args.source_connection_string.unwrap() - }; - - match tokio::fs::create_dir(&working_directory).await { - Ok(()) => {} - Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => { - if !is_directory_empty(&working_directory) - .await - .context("check if working directory is empty")? 
- { - anyhow::bail!("working directory is not empty"); - } else { - // ok - } - } - Err(e) => return Err(anyhow::Error::new(e).context("create working directory")), - } - - let pgdata_dir = working_directory.join("pgdata"); - tokio::fs::create_dir(&pgdata_dir) +async fn decode_connstring( + kms_client: &aws_sdk_kms::Client, + key_id: &String, + connstring_ciphertext_base64: Vec, +) -> Result { + let mut output = kms_client + .decrypt() + .key_id(key_id) + .ciphertext_blob(aws_sdk_s3::primitives::Blob::new( + connstring_ciphertext_base64, + )) + .send() .await - .context("create pgdata directory")?; + .context("decrypt connection string")?; - let pgbin = pg_bin_dir.join("postgres"); - let pg_version = match get_pg_version(pgbin.as_ref()) { - PostgresMajorVersion::V14 => 14, - PostgresMajorVersion::V15 => 15, - PostgresMajorVersion::V16 => 16, - PostgresMajorVersion::V17 => 17, - }; - let superuser = "cloud_admin"; // XXX: this shouldn't be hard-coded - postgres_initdb::do_run_initdb(postgres_initdb::RunInitdbArgs { - superuser, - locale: DEFAULT_LOCALE, // XXX: this shouldn't be hard-coded, - pg_version, - initdb_bin: pg_bin_dir.join("initdb").as_ref(), - library_search_path: &pg_lib_dir, // TODO: is this right? Prob works in compute image, not sure about neon_local. - pgdata: &pgdata_dir, - }) - .await - .context("initdb")?; + let plaintext = output + .plaintext + .take() + .context("get plaintext connection string")?; - // If the caller didn't specify CPU / RAM to use for sizing, default to - // number of CPUs in the system, and pretty arbitrarily, 256 MB of RAM. - let nproc = args.num_cpus.unwrap_or_else(num_cpus::get); - let memory_mb = args.memory_mb.unwrap_or(256); + String::from_utf8(plaintext.into_inner()).context("parse connection string as utf8") +} - // Somewhat arbitrarily, use 10 % of memory for shared buffer cache, 70% for - // maintenance_work_mem (i.e. for sorting during index creation), and leave the rest - // available for misc other stuff that PostgreSQL uses memory for. 
- let shared_buffers_mb = ((memory_mb as f32) * 0.10) as usize; - let maintenance_work_mem_mb = ((memory_mb as f32) * 0.70) as usize; +struct PostgresProcess { + pgdata_dir: Utf8PathBuf, + pg_bin_dir: Utf8PathBuf, + pgbin: Utf8PathBuf, + pg_lib_dir: Utf8PathBuf, + postgres_proc: Option, +} - // - // Launch postgres process - // - let mut postgres_proc = tokio::process::Command::new(pgbin) - .arg("-D") - .arg(&pgdata_dir) - .args(["-p", &format!("{pg_port}")]) - .args(["-c", "wal_level=minimal"]) - .args(["-c", &format!("shared_buffers={shared_buffers_mb}MB")]) - .args(["-c", "max_wal_senders=0"]) - .args(["-c", "fsync=off"]) - .args(["-c", "full_page_writes=off"]) - .args(["-c", "synchronous_commit=off"]) - .args([ - "-c", - &format!("maintenance_work_mem={maintenance_work_mem_mb}MB"), - ]) - .args(["-c", &format!("max_parallel_maintenance_workers={nproc}")]) - .args(["-c", &format!("max_parallel_workers={nproc}")]) - .args(["-c", &format!("max_parallel_workers_per_gather={nproc}")]) - .args(["-c", &format!("max_worker_processes={nproc}")]) - .args([ - "-c", - &format!( - "effective_io_concurrency={}", - if cfg!(target_os = "macos") { 0 } else { 100 } - ), - ]) - .env_clear() - .env("LD_LIBRARY_PATH", &pg_lib_dir) - .env( - "ASAN_OPTIONS", - std::env::var("ASAN_OPTIONS").unwrap_or_default(), +impl PostgresProcess { + fn new(pgdata_dir: Utf8PathBuf, pg_bin_dir: Utf8PathBuf, pg_lib_dir: Utf8PathBuf) -> Self { + Self { + pgdata_dir, + pgbin: pg_bin_dir.join("postgres"), + pg_bin_dir, + pg_lib_dir, + postgres_proc: None, + } + } + + async fn prepare(&self, initdb_user: &str) -> Result<(), anyhow::Error> { + tokio::fs::create_dir(&self.pgdata_dir) + .await + .context("create pgdata directory")?; + + let pg_version = match get_pg_version(self.pgbin.as_ref()) { + PostgresMajorVersion::V14 => 14, + PostgresMajorVersion::V15 => 15, + PostgresMajorVersion::V16 => 16, + PostgresMajorVersion::V17 => 17, + }; + postgres_initdb::do_run_initdb(postgres_initdb::RunInitdbArgs { + superuser: initdb_user, + locale: DEFAULT_LOCALE, // XXX: this shouldn't be hard-coded, + pg_version, + initdb_bin: self.pg_bin_dir.join("initdb").as_ref(), + library_search_path: &self.pg_lib_dir, // TODO: is this right? Prob works in compute image, not sure about neon_local. + pgdata: &self.pgdata_dir, + }) + .await + .context("initdb") + } + + async fn start( + &mut self, + initdb_user: &str, + port: u16, + nproc: usize, + memory_mb: usize, + ) -> Result<&tokio::process::Child, anyhow::Error> { + self.prepare(initdb_user).await?; + + // Somewhat arbitrarily, use 10 % of memory for shared buffer cache, 70% for + // maintenance_work_mem (i.e. for sorting during index creation), and leave the rest + // available for misc other stuff that PostgreSQL uses memory for. 
+ let shared_buffers_mb = ((memory_mb as f32) * 0.10) as usize; + let maintenance_work_mem_mb = ((memory_mb as f32) * 0.70) as usize; + + // + // Launch postgres process + // + let mut proc = tokio::process::Command::new(&self.pgbin) + .arg("-D") + .arg(&self.pgdata_dir) + .args(["-p", &format!("{port}")]) + .args(["-c", "wal_level=minimal"]) + .args(["-c", &format!("shared_buffers={shared_buffers_mb}MB")]) + .args(["-c", "shared_buffers=10GB"]) + .args(["-c", "max_wal_senders=0"]) + .args(["-c", "fsync=off"]) + .args(["-c", "full_page_writes=off"]) + .args(["-c", "synchronous_commit=off"]) + .args([ + "-c", + &format!("maintenance_work_mem={maintenance_work_mem_mb}MB"), + ]) + .args(["-c", &format!("max_parallel_maintenance_workers={nproc}")]) + .args(["-c", &format!("max_parallel_workers={nproc}")]) + .args(["-c", &format!("max_parallel_workers_per_gather={nproc}")]) + .args(["-c", &format!("max_worker_processes={nproc}")]) + .args(["-c", "effective_io_concurrency=100"]) + .env_clear() + .env("LD_LIBRARY_PATH", &self.pg_lib_dir) + .env( + "ASAN_OPTIONS", + std::env::var("ASAN_OPTIONS").unwrap_or_default(), + ) + .env( + "UBSAN_OPTIONS", + std::env::var("UBSAN_OPTIONS").unwrap_or_default(), + ) + .stdout(std::process::Stdio::piped()) + .stderr(std::process::Stdio::piped()) + .spawn() + .context("spawn postgres")?; + + info!("spawned postgres, waiting for it to become ready"); + tokio::spawn( + child_stdio_to_log::relay_process_output(proc.stdout.take(), proc.stderr.take()) + .instrument(info_span!("postgres")), + ); + + self.postgres_proc = Some(proc); + Ok(self.postgres_proc.as_ref().unwrap()) + } + + async fn shutdown(&mut self) -> Result<(), anyhow::Error> { + let proc: &mut tokio::process::Child = self.postgres_proc.as_mut().unwrap(); + info!("shutdown postgres"); + nix::sys::signal::kill( + Pid::from_raw(i32::try_from(proc.id().unwrap()).expect("convert child pid to i32")), + nix::sys::signal::SIGTERM, ) - .env( - "UBSAN_OPTIONS", - std::env::var("UBSAN_OPTIONS").unwrap_or_default(), - ) - .stdout(std::process::Stdio::piped()) - .stderr(std::process::Stdio::piped()) - .spawn() - .context("spawn postgres")?; - - info!("spawned postgres, waiting for it to become ready"); - tokio::spawn( - child_stdio_to_log::relay_process_output( - postgres_proc.stdout.take(), - postgres_proc.stderr.take(), - ) - .instrument(info_span!("postgres")), - ); + .context("signal postgres to shut down")?; + proc.wait() + .await + .context("wait for postgres to shut down") + .map(|_| ()) + } +} +async fn wait_until_ready(connstring: String, create_dbname: String) { // Create neondb database in the running postgres - let restore_pg_connstring = - format!("host=localhost port={pg_port} user={superuser} dbname=postgres"); - let start_time = std::time::Instant::now(); loop { @@ -289,7 +277,12 @@ pub(crate) async fn main() -> anyhow::Result<()> { std::process::exit(1); } - match tokio_postgres::connect(&restore_pg_connstring, tokio_postgres::NoTls).await { + match tokio_postgres::connect( + &connstring.replace("dbname=neondb", "dbname=postgres"), + tokio_postgres::NoTls, + ) + .await + { Ok((client, connection)) => { // Spawn the connection handling task to maintain the connection tokio::spawn(async move { @@ -298,9 +291,12 @@ pub(crate) async fn main() -> anyhow::Result<()> { } }); - match client.simple_query("CREATE DATABASE neondb;").await { + match client + .simple_query(format!("CREATE DATABASE {create_dbname};").as_str()) + .await + { Ok(_) => { - info!("created neondb database"); + info!("created {} 
database", create_dbname); break; } Err(e) => { @@ -324,10 +320,16 @@ pub(crate) async fn main() -> anyhow::Result<()> { } } } +} - let restore_pg_connstring = restore_pg_connstring.replace("dbname=postgres", "dbname=neondb"); - - let dumpdir = working_directory.join("dumpdir"); +async fn run_dump_restore( + workdir: Utf8PathBuf, + pg_bin_dir: Utf8PathBuf, + pg_lib_dir: Utf8PathBuf, + source_connstring: String, + destination_connstring: String, +) -> Result<(), anyhow::Error> { + let dumpdir = workdir.join("dumpdir"); let common_args = [ // schema mapping (prob suffices to specify them on one side) @@ -356,7 +358,7 @@ pub(crate) async fn main() -> anyhow::Result<()> { .arg("--no-sync") // POSITIONAL args // source db (db name included in connection string) - .arg(&source_connection_string) + .arg(&source_connstring) // how we run it .env_clear() .env("LD_LIBRARY_PATH", &pg_lib_dir) @@ -376,19 +378,18 @@ pub(crate) async fn main() -> anyhow::Result<()> { let st = pg_dump.wait().await.context("wait for pg_dump")?; info!(status=?st, "pg_dump exited"); if !st.success() { - warn!(status=%st, "pg_dump failed, restore will likely fail as well"); + error!(status=%st, "pg_dump failed, restore will likely fail as well"); + bail!("pg_dump failed"); } } - // TODO: do it in a streaming way, plenty of internal research done on this already + // TODO: maybe do it in a streaming way, plenty of internal research done on this already // TODO: do the unlogged table trick - - info!("restore from working directory into vanilla postgres"); { let mut pg_restore = tokio::process::Command::new(pg_bin_dir.join("pg_restore")) .args(&common_args) .arg("-d") - .arg(&restore_pg_connstring) + .arg(&destination_connstring) // POSITIONAL args .arg(&dumpdir) // how we run it @@ -411,33 +412,82 @@ pub(crate) async fn main() -> anyhow::Result<()> { let st = pg_restore.wait().await.context("wait for pg_restore")?; info!(status=?st, "pg_restore exited"); if !st.success() { - warn!(status=%st, "pg_restore failed, restore will likely fail as well"); + error!(status=%st, "pg_restore failed, restore will likely fail as well"); + bail!("pg_restore failed"); } } + Ok(()) +} + +#[allow(clippy::too_many_arguments)] +async fn cmd_pgdata( + kms_client: Option, + maybe_s3_prefix: Option, + maybe_spec: Option, + source_connection_string: Option, + interactive: bool, + pg_port: u16, + workdir: Utf8PathBuf, + pg_bin_dir: Utf8PathBuf, + pg_lib_dir: Utf8PathBuf, + num_cpus: Option, + memory_mb: Option, +) -> Result<(), anyhow::Error> { + if maybe_spec.is_none() && source_connection_string.is_none() { + bail!("spec must be provided for pgdata command"); + } + if maybe_spec.is_some() && source_connection_string.is_some() { + bail!("only one of spec or source_connection_string can be provided"); + } + + let source_connection_string = if let Some(spec) = maybe_spec { + match spec.encryption_secret { + EncryptionSecret::KMS { key_id } => { + decode_connstring( + kms_client.as_ref().unwrap(), + &key_id, + spec.source_connstring_ciphertext_base64, + ) + .await? 
+ } + } + } else { + source_connection_string.unwrap() + }; + + let superuser = "cloud_admin"; + let destination_connstring = format!( + "host=localhost port={} user={} dbname=neondb", + pg_port, superuser + ); + + let pgdata_dir = workdir.join("pgdata"); + let mut proc = PostgresProcess::new(pgdata_dir.clone(), pg_bin_dir.clone(), pg_lib_dir.clone()); + let nproc = num_cpus.unwrap_or_else(num_cpus::get); + let memory_mb = memory_mb.unwrap_or(256); + proc.start(superuser, pg_port, nproc, memory_mb).await?; + wait_until_ready(destination_connstring.clone(), "neondb".to_string()).await; + + run_dump_restore( + workdir.clone(), + pg_bin_dir, + pg_lib_dir, + source_connection_string, + destination_connstring, + ) + .await?; + // If interactive mode, wait for Ctrl+C - if args.interactive { + if interactive { info!("Running in interactive mode. Press Ctrl+C to shut down."); tokio::signal::ctrl_c().await.context("wait for ctrl-c")?; } - info!("shutdown postgres"); - { - nix::sys::signal::kill( - Pid::from_raw( - i32::try_from(postgres_proc.id().unwrap()).expect("convert child pid to i32"), - ), - nix::sys::signal::SIGTERM, - ) - .context("signal postgres to shut down")?; - postgres_proc - .wait() - .await - .context("wait for postgres to shut down")?; - } + proc.shutdown().await?; // Only sync if s3_prefix was specified - if let Some(s3_prefix) = args.s3_prefix { + if let Some(s3_prefix) = maybe_s3_prefix { info!("upload pgdata"); aws_s3_sync::sync(Utf8Path::new(&pgdata_dir), &s3_prefix.append("/pgdata/")) .await @@ -445,7 +495,7 @@ pub(crate) async fn main() -> anyhow::Result<()> { info!("write status"); { - let status_dir = working_directory.join("status"); + let status_dir = workdir.join("status"); std::fs::create_dir(&status_dir).context("create status directory")?; let status_file = status_dir.join("pgdata"); std::fs::write(&status_file, serde_json::json!({"done": true}).to_string()) @@ -458,3 +508,153 @@ pub(crate) async fn main() -> anyhow::Result<()> { Ok(()) } + +async fn cmd_dumprestore( + kms_client: Option, + maybe_spec: Option, + source_connection_string: Option, + destination_connection_string: Option, + workdir: Utf8PathBuf, + pg_bin_dir: Utf8PathBuf, + pg_lib_dir: Utf8PathBuf, +) -> Result<(), anyhow::Error> { + let (source_connstring, destination_connstring) = if let Some(spec) = maybe_spec { + match spec.encryption_secret { + EncryptionSecret::KMS { key_id } => { + let source = decode_connstring( + kms_client.as_ref().unwrap(), + &key_id, + spec.source_connstring_ciphertext_base64, + ) + .await?; + + let dest = if let Some(dest_ciphertext) = + spec.destination_connstring_ciphertext_base64 + { + decode_connstring(kms_client.as_ref().unwrap(), &key_id, dest_ciphertext) + .await? 
+ } else { + bail!("destination connection string must be provided in spec for dump_restore command"); + }; + + (source, dest) + } + } + } else { + ( + source_connection_string.unwrap(), + if let Some(val) = destination_connection_string { + val + } else { + bail!("destination connection string must be provided for dump_restore command"); + }, + ) + }; + + run_dump_restore( + workdir, + pg_bin_dir, + pg_lib_dir, + source_connstring, + destination_connstring, + ) + .await +} + +#[tokio::main] +pub(crate) async fn main() -> anyhow::Result<()> { + utils::logging::init( + utils::logging::LogFormat::Json, + utils::logging::TracingErrorLayerEnablement::EnableWithRustLogFilter, + utils::logging::Output::Stdout, + )?; + + info!("starting"); + + let args = Args::parse(); + + // Initialize AWS clients only if s3_prefix is specified + let (aws_config, kms_client) = if args.s3_prefix.is_some() { + let config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await; + let kms = aws_sdk_kms::Client::new(&config); + (Some(config), Some(kms)) + } else { + (None, None) + }; + + let spec: Option = if let Some(s3_prefix) = &args.s3_prefix { + let spec_key = s3_prefix.append("/spec.json"); + let s3_client = aws_sdk_s3::Client::new(aws_config.as_ref().unwrap()); + let object = s3_client + .get_object() + .bucket(&spec_key.bucket) + .key(spec_key.key) + .send() + .await + .context("get spec from s3")? + .body + .collect() + .await + .context("download spec body")?; + serde_json::from_slice(&object.into_bytes()).context("parse spec as json")? + } else { + None + }; + + match tokio::fs::create_dir(&args.working_directory).await { + Ok(()) => {} + Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => { + if !is_directory_empty(&args.working_directory) + .await + .context("check if working directory is empty")? 
+ { + bail!("working directory is not empty"); + } else { + // ok + } + } + Err(e) => return Err(anyhow::Error::new(e).context("create working directory")), + } + + match args.command { + Command::Pgdata { + source_connection_string, + interactive, + pg_port, + num_cpus, + memory_mb, + } => { + cmd_pgdata( + kms_client, + args.s3_prefix, + spec, + source_connection_string, + interactive, + pg_port, + args.working_directory, + args.pg_bin_dir, + args.pg_lib_dir, + num_cpus, + memory_mb, + ) + .await?; + } + Command::DumpRestore { + source_connection_string, + destination_connection_string, + } => { + cmd_dumprestore( + kms_client, + spec, + source_connection_string, + destination_connection_string, + args.working_directory, + args.pg_bin_dir, + args.pg_lib_dir, + ) + .await?; + } + } + + Ok(()) +} diff --git a/poetry.lock b/poetry.lock index fd200159b9..e2c71ca012 100644 --- a/poetry.lock +++ b/poetry.lock @@ -412,6 +412,7 @@ files = [ [package.dependencies] botocore-stubs = "*" +mypy-boto3-kms = {version = ">=1.26.0,<1.27.0", optional = true, markers = "extra == \"kms\""} mypy-boto3-s3 = {version = ">=1.26.0,<1.27.0", optional = true, markers = "extra == \"s3\""} types-s3transfer = "*" typing-extensions = ">=4.1.0" @@ -2022,6 +2023,18 @@ install-types = ["pip"] mypyc = ["setuptools (>=50)"] reports = ["lxml"] +[[package]] +name = "mypy-boto3-kms" +version = "1.26.147" +description = "Type annotations for boto3.KMS 1.26.147 service generated with mypy-boto3-builder 7.14.5" +optional = false +python-versions = ">=3.7" +groups = ["main"] +files = [ + {file = "mypy-boto3-kms-1.26.147.tar.gz", hash = "sha256:816a4d1bb0585e1b9620a3f96c1d69a06f53b7b5621858579dd77c60dbb5fa5c"}, + {file = "mypy_boto3_kms-1.26.147-py3-none-any.whl", hash = "sha256:493f0db674a25c88769f5cb8ab8ac00d3dda5dfc903d5cda34c990ee64689f79"}, +] + [[package]] name = "mypy-boto3-s3" version = "1.26.0.post1" @@ -3807,4 +3820,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.1" python-versions = "^3.11" -content-hash = "4dc3165fe22c0e0f7a030ea0f8a680ae2ff74561d8658c393abbe9112caaf5d7" +content-hash = "03697c0a4d438ef088b0d397b8f0570aa3998ccf833fe612400824792498878b" diff --git a/pyproject.toml b/pyproject.toml index e299c421e9..51cd68e002 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,7 @@ Jinja2 = "^3.1.5" types-requests = "^2.31.0.0" types-psycopg2 = "^2.9.21.20241019" boto3 = "^1.34.11" -boto3-stubs = {extras = ["s3"], version = "^1.26.16"} +boto3-stubs = {extras = ["s3", "kms"], version = "^1.26.16"} moto = {extras = ["server"], version = "^5.0.6"} backoff = "^2.2.1" pytest-lazy-fixture = "^0.6.3" diff --git a/test_runner/fixtures/fast_import.py b/test_runner/fixtures/fast_import.py index 33248132ab..d674be99de 100644 --- a/test_runner/fixtures/fast_import.py +++ b/test_runner/fixtures/fast_import.py @@ -4,8 +4,10 @@ import subprocess import tempfile from collections.abc import Iterator from pathlib import Path +from typing import cast import pytest +from _pytest.config import Config from fixtures.log_helper import log from fixtures.neon_cli import AbstractNeonCli @@ -23,6 +25,7 @@ class FastImport(AbstractNeonCli): pg_distrib_dir: Path, pg_version: PgVersion, workdir: Path, + cleanup: bool = True, ): if extra_env is None: env_vars = {} @@ -47,12 +50,43 @@ class FastImport(AbstractNeonCli): if not workdir.exists(): raise Exception(f"Working directory '{workdir}' does not exist") self.workdir = workdir + self.cleanup = cleanup + + def run_pgdata( + self, + s3prefix: str | None = None, + pg_port: int | 
None = None, + source_connection_string: str | None = None, + interactive: bool = False, + ): + return self.run( + "pgdata", + s3prefix=s3prefix, + pg_port=pg_port, + source_connection_string=source_connection_string, + interactive=interactive, + ) + + def run_dump_restore( + self, + s3prefix: str | None = None, + source_connection_string: str | None = None, + destination_connection_string: str | None = None, + ): + return self.run( + "dump-restore", + s3prefix=s3prefix, + source_connection_string=source_connection_string, + destination_connection_string=destination_connection_string, + ) def run( self, - pg_port: int, - source_connection_string: str | None = None, + command: str, s3prefix: str | None = None, + pg_port: int | None = None, + source_connection_string: str | None = None, + destination_connection_string: str | None = None, interactive: bool = False, ) -> subprocess.CompletedProcess[str]: if self.cmd is not None: @@ -60,13 +94,17 @@ class FastImport(AbstractNeonCli): args = [ f"--pg-bin-dir={self.pg_bin}", f"--pg-lib-dir={self.pg_lib}", - f"--pg-port={pg_port}", f"--working-directory={self.workdir}", ] - if source_connection_string is not None: - args.append(f"--source-connection-string={source_connection_string}") if s3prefix is not None: args.append(f"--s3-prefix={s3prefix}") + args.append(command) + if pg_port is not None: + args.append(f"--pg-port={pg_port}") + if source_connection_string is not None: + args.append(f"--source-connection-string={source_connection_string}") + if destination_connection_string is not None: + args.append(f"--destination-connection-string={destination_connection_string}") if interactive: args.append("--interactive") @@ -77,7 +115,7 @@ class FastImport(AbstractNeonCli): return self def __exit__(self, *args): - if self.workdir.exists(): + if self.workdir.exists() and self.cleanup: shutil.rmtree(self.workdir) @@ -87,9 +125,17 @@ def fast_import( test_output_dir: Path, neon_binpath: Path, pg_distrib_dir: Path, + pytestconfig: Config, ) -> Iterator[FastImport]: - workdir = Path(tempfile.mkdtemp()) - with FastImport(None, neon_binpath, pg_distrib_dir, pg_version, workdir) as fi: + workdir = Path(tempfile.mkdtemp(dir=test_output_dir, prefix="fast_import_")) + with FastImport( + None, + neon_binpath, + pg_distrib_dir, + pg_version, + workdir, + cleanup=not cast(bool, pytestconfig.getoption("--preserve-database-files")), + ) as fi: yield fi if fi.cmd is None: diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 469bc8a1e5..73607db7d8 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -27,6 +27,7 @@ from urllib.parse import quote, urlparse import asyncpg import backoff +import boto3 import httpx import psycopg2 import psycopg2.sql @@ -37,6 +38,8 @@ from _pytest.config import Config from _pytest.config.argparsing import Parser from _pytest.fixtures import FixtureRequest from jwcrypto import jwk +from mypy_boto3_kms import KMSClient +from mypy_boto3_s3 import S3Client # Type-related stuff from psycopg2.extensions import connection as PgConnection @@ -199,6 +202,30 @@ def mock_s3_server(port_distributor: PortDistributor) -> Iterator[MockS3Server]: mock_s3_server.kill() +@pytest.fixture(scope="session") +def mock_kms(mock_s3_server: MockS3Server) -> Iterator[KMSClient]: + yield boto3.client( + "kms", + endpoint_url=mock_s3_server.endpoint(), + region_name=mock_s3_server.region(), + aws_access_key_id=mock_s3_server.access_key(), + 
aws_secret_access_key=mock_s3_server.secret_key(), + aws_session_token=mock_s3_server.session_token(), + ) + + +@pytest.fixture(scope="session") +def mock_s3_client(mock_s3_server: MockS3Server) -> Iterator[S3Client]: + yield boto3.client( + "s3", + endpoint_url=mock_s3_server.endpoint(), + region_name=mock_s3_server.region(), + aws_access_key_id=mock_s3_server.access_key(), + aws_secret_access_key=mock_s3_server.secret_key(), + aws_session_token=mock_s3_server.session_token(), + ) + + class PgProtocol: """Reusable connection logic""" diff --git a/test_runner/regress/test_import_pgdata.py b/test_runner/regress/test_import_pgdata.py index ea86eb62eb..71e0d16edd 100644 --- a/test_runner/regress/test_import_pgdata.py +++ b/test_runner/regress/test_import_pgdata.py @@ -1,7 +1,9 @@ +import base64 import json import re import time from enum import Enum +from pathlib import Path import psycopg2 import psycopg2.errors @@ -14,8 +16,12 @@ from fixtures.pageserver.http import ( ImportPgdataIdemptencyKey, PageserverApiException, ) +from fixtures.pg_version import PgVersion from fixtures.port_distributor import PortDistributor -from fixtures.remote_storage import RemoteStorageKind +from fixtures.remote_storage import MockS3Server, RemoteStorageKind +from mypy_boto3_kms import KMSClient +from mypy_boto3_kms.type_defs import EncryptResponseTypeDef +from mypy_boto3_s3 import S3Client from pytest_httpserver import HTTPServer from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response @@ -103,13 +109,15 @@ def test_pgdata_import_smoke( while True: relblock_size = vanilla_pg.safe_psql_scalar("select pg_relation_size('t')") log.info( - f"relblock size: {relblock_size/8192} pages (target: {target_relblock_size//8192}) pages" + f"relblock size: {relblock_size / 8192} pages (target: {target_relblock_size // 8192}) pages" ) if relblock_size >= target_relblock_size: break addrows = int((target_relblock_size - relblock_size) // 8192) assert addrows >= 1, "forward progress" - vanilla_pg.safe_psql(f"insert into t select generate_series({nrows+1}, {nrows + addrows})") + vanilla_pg.safe_psql( + f"insert into t select generate_series({nrows + 1}, {nrows + addrows})" + ) nrows += addrows expect_nrows = nrows expect_sum = ( @@ -332,6 +340,224 @@ def test_pgdata_import_smoke( br_initdb_endpoint.safe_psql("select * from othertable") +def test_fast_import_with_pageserver_ingest( + test_output_dir, + vanilla_pg: VanillaPostgres, + port_distributor: PortDistributor, + fast_import: FastImport, + pg_distrib_dir: Path, + pg_version: PgVersion, + mock_s3_server: MockS3Server, + mock_kms: KMSClient, + mock_s3_client: S3Client, + neon_env_builder: NeonEnvBuilder, + make_httpserver: HTTPServer, +): + # Prepare KMS and S3 + key_response = mock_kms.create_key( + Description="Test key", + KeyUsage="ENCRYPT_DECRYPT", + Origin="AWS_KMS", + ) + key_id = key_response["KeyMetadata"]["KeyId"] + + def encrypt(x: str) -> EncryptResponseTypeDef: + return mock_kms.encrypt(KeyId=key_id, Plaintext=x) + + # Start source postgres and ingest data + vanilla_pg.start() + vanilla_pg.safe_psql("CREATE TABLE foo (a int); INSERT INTO foo SELECT generate_series(1, 10);") + + # Setup pageserver and fake cplane for import progress + def handler(request: Request) -> Response: + log.info(f"control plane request: {request.json}") + return Response(json.dumps({}), status=200) + + cplane_mgmt_api_server = make_httpserver + cplane_mgmt_api_server.expect_request(re.compile(".*")).respond_with_handler(handler) + + 
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3) + env = neon_env_builder.init_start() + + env.pageserver.patch_config_toml_nonrecursive( + { + "import_pgdata_upcall_api": f"http://{cplane_mgmt_api_server.host}:{cplane_mgmt_api_server.port}/path/to/mgmt/api", + # because import_pgdata code uses this endpoint, not the one in common remote storage config + # TODO: maybe use common remote_storage config in pageserver? + "import_pgdata_aws_endpoint_url": env.s3_mock_server.endpoint(), + } + ) + env.pageserver.stop() + env.pageserver.start() + + # Encrypt connstrings and put spec into S3 + source_connstring_encrypted = encrypt(vanilla_pg.connstr()) + spec = { + "encryption_secret": {"KMS": {"key_id": key_id}}, + "source_connstring_ciphertext_base64": base64.b64encode( + source_connstring_encrypted["CiphertextBlob"] + ).decode("utf-8"), + "project_id": "someproject", + "branch_id": "somebranch", + } + + bucket = "test-bucket" + key_prefix = "test-prefix" + mock_s3_client.create_bucket(Bucket=bucket) + mock_s3_client.put_object(Bucket=bucket, Key=f"{key_prefix}/spec.json", Body=json.dumps(spec)) + + # Create timeline with import_pgdata + tenant_id = TenantId.generate() + env.storage_controller.tenant_create(tenant_id) + + timeline_id = TimelineId.generate() + log.info("starting import") + start = time.monotonic() + + idempotency = ImportPgdataIdemptencyKey.random() + log.info(f"idempotency key {idempotency}") + # TODO: teach neon_local CLI about the idempotency & 429 error so we can run inside the loop + # and check for 429 + + import_branch_name = "imported" + env.storage_controller.timeline_create( + tenant_id, + { + "new_timeline_id": str(timeline_id), + "import_pgdata": { + "idempotency_key": str(idempotency), + "location": { + "AwsS3": { + "region": env.s3_mock_server.region(), + "bucket": bucket, + "key": key_prefix, + } + }, + }, + }, + ) + env.neon_cli.mappings_map_branch(import_branch_name, tenant_id, timeline_id) + + # Run fast_import + if fast_import.extra_env is None: + fast_import.extra_env = {} + fast_import.extra_env["AWS_ACCESS_KEY_ID"] = mock_s3_server.access_key() + fast_import.extra_env["AWS_SECRET_ACCESS_KEY"] = mock_s3_server.secret_key() + fast_import.extra_env["AWS_SESSION_TOKEN"] = mock_s3_server.session_token() + fast_import.extra_env["AWS_REGION"] = mock_s3_server.region() + fast_import.extra_env["AWS_ENDPOINT_URL"] = mock_s3_server.endpoint() + fast_import.extra_env["RUST_LOG"] = "aws_config=debug,aws_sdk_kms=debug" + pg_port = port_distributor.get_port() + fast_import.run_pgdata(pg_port=pg_port, s3prefix=f"s3://{bucket}/{key_prefix}") + vanilla_pg.stop() + + def validate_vanilla_equivalence(ep): + res = ep.safe_psql("SELECT count(*), sum(a) FROM foo;", dbname="neondb") + assert res[0] == (10, 55), f"got result: {res}" + + # Sanity check that data in pgdata is expected: + pgbin = PgBin(test_output_dir, fast_import.pg_distrib_dir, fast_import.pg_version) + with VanillaPostgres( + fast_import.workdir / "pgdata", pgbin, pg_port, False + ) as new_pgdata_vanilla_pg: + new_pgdata_vanilla_pg.start() + + # database name and user are hardcoded in fast_import binary, and they are different from normal vanilla postgres + conn = PgProtocol(dsn=f"postgresql://cloud_admin@localhost:{pg_port}/neondb") + validate_vanilla_equivalence(conn) + + # Poll pageserver statuses in s3 + while True: + locations = env.storage_controller.locate(tenant_id) + active_count = 0 + for location in locations: + shard_id = TenantShardId.parse(location["shard_id"]) + ps = 
env.get_pageserver(location["node_id"]) + try: + detail = ps.http_client().timeline_detail(shard_id, timeline_id) + log.info(f"timeline {tenant_id}/{timeline_id} detail: {detail}") + state = detail["state"] + log.info(f"shard {shard_id} state: {state}") + if state == "Active": + active_count += 1 + except PageserverApiException as e: + if e.status_code == 404: + log.info("not found, import is in progress") + continue + elif e.status_code == 429: + log.info("import is in progress") + continue + else: + raise + + if state == "Active": + key = f"{key_prefix}/status/shard-{shard_id.shard_index}" + shard_status_file_contents = ( + mock_s3_client.get_object(Bucket=bucket, Key=key)["Body"].read().decode("utf-8") + ) + shard_status = json.loads(shard_status_file_contents) + assert shard_status["done"] is True + + if active_count == len(locations): + log.info("all shards are active") + break + time.sleep(0.5) + + import_duration = time.monotonic() - start + log.info(f"import complete; duration={import_duration:.2f}s") + + ep = env.endpoints.create_start(branch_name=import_branch_name, tenant_id=tenant_id) + + # check that data is there + validate_vanilla_equivalence(ep) + + # check that we can do basic ops + + ep.safe_psql("create table othertable(values text)", dbname="neondb") + rw_lsn = Lsn(ep.safe_psql_scalar("select pg_current_wal_flush_lsn()")) + ep.stop() + + # ... at the tip + _ = env.create_branch( + new_branch_name="br-tip", + ancestor_branch_name=import_branch_name, + tenant_id=tenant_id, + ancestor_start_lsn=rw_lsn, + ) + br_tip_endpoint = env.endpoints.create_start( + branch_name="br-tip", endpoint_id="br-tip-ro", tenant_id=tenant_id + ) + validate_vanilla_equivalence(br_tip_endpoint) + br_tip_endpoint.safe_psql("select * from othertable", dbname="neondb") + br_tip_endpoint.stop() + + # ... 
at the initdb lsn + locations = env.storage_controller.locate(tenant_id) + [shard_zero] = [ + loc for loc in locations if TenantShardId.parse(loc["shard_id"]).shard_number == 0 + ] + shard_zero_ps = env.get_pageserver(shard_zero["node_id"]) + shard_zero_timeline_info = shard_zero_ps.http_client().timeline_detail( + shard_zero["shard_id"], timeline_id + ) + initdb_lsn = Lsn(shard_zero_timeline_info["initdb_lsn"]) + _ = env.create_branch( + new_branch_name="br-initdb", + ancestor_branch_name=import_branch_name, + tenant_id=tenant_id, + ancestor_start_lsn=initdb_lsn, + ) + br_initdb_endpoint = env.endpoints.create_start( + branch_name="br-initdb", endpoint_id="br-initdb-ro", tenant_id=tenant_id + ) + validate_vanilla_equivalence(br_initdb_endpoint) + with pytest.raises(psycopg2.errors.UndefinedTable): + br_initdb_endpoint.safe_psql("select * from othertable", dbname="neondb") + br_initdb_endpoint.stop() + + env.pageserver.stop(immediate=True) + + def test_fast_import_binary( test_output_dir, vanilla_pg: VanillaPostgres, @@ -342,7 +568,7 @@ def test_fast_import_binary( vanilla_pg.safe_psql("CREATE TABLE foo (a int); INSERT INTO foo SELECT generate_series(1, 10);") pg_port = port_distributor.get_port() - fast_import.run(pg_port, vanilla_pg.connstr()) + fast_import.run_pgdata(pg_port=pg_port, source_connection_string=vanilla_pg.connstr()) vanilla_pg.stop() pgbin = PgBin(test_output_dir, fast_import.pg_distrib_dir, fast_import.pg_version) @@ -358,6 +584,118 @@ def test_fast_import_binary( assert res[0][0] == 10 +def test_fast_import_restore_to_connstring( + test_output_dir, + vanilla_pg: VanillaPostgres, + port_distributor: PortDistributor, + fast_import: FastImport, + pg_distrib_dir: Path, + pg_version: PgVersion, +): + vanilla_pg.start() + vanilla_pg.safe_psql("CREATE TABLE foo (a int); INSERT INTO foo SELECT generate_series(1, 10);") + + pgdatadir = test_output_dir / "destination-pgdata" + pg_bin = PgBin(test_output_dir, pg_distrib_dir, pg_version) + port = port_distributor.get_port() + with VanillaPostgres(pgdatadir, pg_bin, port) as destination_vanilla_pg: + destination_vanilla_pg.configure(["shared_preload_libraries='neon_rmgr'"]) + destination_vanilla_pg.start() + + # create another database & role and try to restore there + destination_vanilla_pg.safe_psql(""" + CREATE ROLE testrole WITH + LOGIN + PASSWORD 'testpassword' + NOSUPERUSER + NOCREATEDB + NOCREATEROLE; + """) + destination_vanilla_pg.safe_psql("CREATE DATABASE testdb OWNER testrole;") + + destination_connstring = destination_vanilla_pg.connstr( + dbname="testdb", user="testrole", password="testpassword" + ) + fast_import.run_dump_restore( + source_connection_string=vanilla_pg.connstr(), + destination_connection_string=destination_connstring, + ) + vanilla_pg.stop() + conn = PgProtocol(dsn=destination_connstring) + res = conn.safe_psql("SELECT count(*) FROM foo;") + log.info(f"Result: {res}") + assert res[0][0] == 10 + + +def test_fast_import_restore_to_connstring_from_s3_spec( + test_output_dir, + vanilla_pg: VanillaPostgres, + port_distributor: PortDistributor, + fast_import: FastImport, + pg_distrib_dir: Path, + pg_version: PgVersion, + mock_s3_server: MockS3Server, + mock_kms: KMSClient, + mock_s3_client: S3Client, +): + # Prepare KMS and S3 + key_response = mock_kms.create_key( + Description="Test key", + KeyUsage="ENCRYPT_DECRYPT", + Origin="AWS_KMS", + ) + key_id = key_response["KeyMetadata"]["KeyId"] + + def encrypt(x: str) -> EncryptResponseTypeDef: + return mock_kms.encrypt(KeyId=key_id, Plaintext=x) + + # Start source 
postgres and ingest data + vanilla_pg.start() + vanilla_pg.safe_psql("CREATE TABLE foo (a int); INSERT INTO foo SELECT generate_series(1, 10);") + + # Start target postgres + pgdatadir = test_output_dir / "destination-pgdata" + pg_bin = PgBin(test_output_dir, pg_distrib_dir, pg_version) + port = port_distributor.get_port() + with VanillaPostgres(pgdatadir, pg_bin, port) as destination_vanilla_pg: + destination_vanilla_pg.configure(["shared_preload_libraries='neon_rmgr'"]) + destination_vanilla_pg.start() + + # Encrypt connstrings and put spec into S3 + source_connstring_encrypted = encrypt(vanilla_pg.connstr()) + destination_connstring_encrypted = encrypt(destination_vanilla_pg.connstr()) + spec = { + "encryption_secret": {"KMS": {"key_id": key_id}}, + "source_connstring_ciphertext_base64": base64.b64encode( + source_connstring_encrypted["CiphertextBlob"] + ).decode("utf-8"), + "destination_connstring_ciphertext_base64": base64.b64encode( + destination_connstring_encrypted["CiphertextBlob"] + ).decode("utf-8"), + } + + mock_s3_client.create_bucket(Bucket="test-bucket") + mock_s3_client.put_object( + Bucket="test-bucket", Key="test-prefix/spec.json", Body=json.dumps(spec) + ) + + # Run fast_import + if fast_import.extra_env is None: + fast_import.extra_env = {} + fast_import.extra_env["AWS_ACCESS_KEY_ID"] = mock_s3_server.access_key() + fast_import.extra_env["AWS_SECRET_ACCESS_KEY"] = mock_s3_server.secret_key() + fast_import.extra_env["AWS_SESSION_TOKEN"] = mock_s3_server.session_token() + fast_import.extra_env["AWS_REGION"] = mock_s3_server.region() + fast_import.extra_env["AWS_ENDPOINT_URL"] = mock_s3_server.endpoint() + fast_import.extra_env["RUST_LOG"] = "aws_config=debug,aws_sdk_kms=debug" + fast_import.run_dump_restore(s3prefix="s3://test-bucket/test-prefix") + vanilla_pg.stop() + + res = destination_vanilla_pg.safe_psql("SELECT count(*) FROM foo;") + log.info(f"Result: {res}") + assert res[0][0] == 10 + + # TODO: Maybe test with pageserver? # 1. run whole neon env # 2. create timeline with some s3 path??? From b992a1a62a2d4029de0a8b0cd343b1909d8bb311 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 14 Feb 2025 17:20:07 +0100 Subject: [PATCH 480/583] page_service: include socket send & recv queue length in slow flush log mesage (#10823) # Summary In - https://github.com/neondatabase/neon/pull/10813 we added slow flush logging but it didn't log the TCP send & recv queue length. This PR adds that data to the log message. I believe the implementation to be safe & correct right now, but it's brittle and thus this PR should be reverted or improved upon once the investigation is over. Refs: - stacked atop https://github.com/neondatabase/neon/pull/10813 - context: https://neondb.slack.com/archives/C08DE6Q9C3B/p1739464533762049?thread_ts=1739462628.361019&cid=C08DE6Q9C3B - improves https://github.com/neondatabase/neon/issues/10668 - part of https://github.com/neondatabase/cloud/issues/23515 # How It Works The trouble is two-fold: 1. getting to the raw socket file descriptor through the many Rust types that wrap it and 2. integrating with the `measure()` function Rust wraps it in types to model file descriptor lifetimes and ownership, and usually one can get access using `as_raw_fd()`. However, we `split()` the stream and the resulting [`tokio::io::WriteHalf`](https://docs.rs/tokio/latest/tokio/io/struct.WriteHalf.html) . Check the PR commit history for my attempts to do it. 
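To make the constraint concrete, here is a minimal standalone sketch — plain tokio, a hypothetical `handle_conn`, none of the actual `PostgresBackend` plumbing — of the pattern this reduces to: capture the raw fd while the concrete `TcpStream` is still in hand, because the `WriteHalf` returned by `split()` does not expose it.

```
use std::os::fd::{AsRawFd, RawFd};

use tokio::io::AsyncWriteExt;
use tokio::net::TcpStream;

async fn handle_conn(socket: TcpStream) -> std::io::Result<()> {
    // Capture the fd while we still hold the concrete TcpStream; after
    // tokio::io::split() only ReadHalf/WriteHalf remain, and neither
    // implements AsRawFd.
    let socket_fd: RawFd = socket.as_raw_fd();

    let (_read_half, mut write_half) = tokio::io::split(socket);

    // The fd stays valid as long as the halves are alive: shutdown() does
    // not close() it, only dropping the stream does.
    write_half.flush().await?;

    // socket_fd can now be passed alongside the flush future, e.g. into a
    // measure(started_at, flush_fut, socket_fd)-style helper.
    let _ = socket_fd;
    Ok(())
}
```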
My solution is to get the socket fd before we wrap it in our protocol types, and to store that fd in the new `PostgresBackend::socket_fd` field. I believe it's safe because the lifetime of `PostgresBackend::socket_fd` value == the lifetime of the `TcpStream` that wrap and store in `PostgresBackend::framed`. Specifically, the only place that close()s the socket is the `impl Drop for TcpStream`. I think the protocol stack calls `TcpStream::shutdown()`, but, that doesn't `close()` the file descriptor underneath. Regarding integration with the `measure()` function, the trouble is that `flush_fut` is currently a generic `Future` type. So, we just pass in the `socket_fd` as a separate argument. A clean implementation would convert the `pgb_writer.flush()` to a named future that provides an accessor for the socket fd while not being polled. I tried (see PR history), but failed to break through the `WriteHalf`. # Testing Tested locally by running ``` ./target/debug/pagebench get-page-latest-lsn --num-clients=1000 --queue-depth=1000 ``` in one terminal, waiting a bit, then ``` pkill -STOP pagebench ``` then wait for slow logs to show up in `pageserver.log`. Pick one of the slow log message's port pairs, e.g., `127.0.0.1:39500`, and then checking sockstat output ``` ss -ntp | grep '127.0.0.1:39500' ``` to ensure that send & recv queue size match those in the log message. --- libs/postgres_backend/src/lib.rs | 7 ++++++ libs/utils/Cargo.toml | 2 +- libs/utils/src/lib.rs | 3 +++ libs/utils/src/linux_socket_ioctl.rs | 35 ++++++++++++++++++++++++++++ pageserver/src/metrics.rs | 27 +++++++++++++++++++-- pageserver/src/page_service.rs | 14 +++++++---- safekeeper/src/wal_service.rs | 5 +++- 7 files changed, 85 insertions(+), 8 deletions(-) create mode 100644 libs/utils/src/linux_socket_ioctl.rs diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index 8c024375c1..f74b229ac4 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -9,6 +9,8 @@ use bytes::Bytes; use serde::{Deserialize, Serialize}; use std::io::ErrorKind; use std::net::SocketAddr; +use std::os::fd::AsRawFd; +use std::os::fd::RawFd; use std::pin::Pin; use std::sync::Arc; use std::task::{ready, Poll}; @@ -268,6 +270,7 @@ impl MaybeWriteOnly { } pub struct PostgresBackend { + pub socket_fd: RawFd, framed: MaybeWriteOnly, pub state: ProtoState, @@ -293,9 +296,11 @@ impl PostgresBackend { tls_config: Option>, ) -> io::Result { let peer_addr = socket.peer_addr()?; + let socket_fd = socket.as_raw_fd(); let stream = MaybeTlsStream::Unencrypted(socket); Ok(Self { + socket_fd, framed: MaybeWriteOnly::Full(Framed::new(stream)), state: ProtoState::Initialization, auth_type, @@ -307,6 +312,7 @@ impl PostgresBackend { impl PostgresBackend { pub fn new_from_io( + socket_fd: RawFd, socket: IO, peer_addr: SocketAddr, auth_type: AuthType, @@ -315,6 +321,7 @@ impl PostgresBackend { let stream = MaybeTlsStream::Unencrypted(socket); Ok(Self { + socket_fd, framed: MaybeWriteOnly::Full(Framed::new(stream)), state: ProtoState::Initialization, auth_type, diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 0f10300959..e9611a0f12 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -28,7 +28,7 @@ inferno.workspace = true fail.workspace = true futures = { workspace = true } jsonwebtoken.workspace = true -nix.workspace = true +nix = {workspace = true, features = [ "ioctl" ] } once_cell.workspace = true pin-project-lite.workspace = true regex.workspace = true diff --git 
a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 820ff2d5ea..9389a27bf3 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -93,6 +93,9 @@ pub mod try_rcu; pub mod guard_arc_swap; +#[cfg(target_os = "linux")] +pub mod linux_socket_ioctl; + // Re-export used in macro. Avoids adding git-version as dep in target crates. #[doc(hidden)] pub use git_version; diff --git a/libs/utils/src/linux_socket_ioctl.rs b/libs/utils/src/linux_socket_ioctl.rs new file mode 100644 index 0000000000..5ae0e86af8 --- /dev/null +++ b/libs/utils/src/linux_socket_ioctl.rs @@ -0,0 +1,35 @@ +//! Linux-specific socket ioctls. +//! +//! + +use std::{ + io, + mem::MaybeUninit, + os::{fd::RawFd, raw::c_int}, +}; + +use nix::libc::{FIONREAD, TIOCOUTQ}; + +unsafe fn do_ioctl(socket_fd: RawFd, cmd: nix::libc::Ioctl) -> io::Result { + let mut inq: MaybeUninit = MaybeUninit::uninit(); + let err = nix::libc::ioctl(socket_fd, cmd, inq.as_mut_ptr()); + if err == 0 { + Ok(inq.assume_init()) + } else { + Err(io::Error::last_os_error()) + } +} + +/// # Safety +/// +/// Caller must ensure that `socket_fd` is a valid TCP socket file descriptor. +pub unsafe fn inq(socket_fd: RawFd) -> io::Result { + do_ioctl(socket_fd, FIONREAD) +} + +/// # Safety +/// +/// Caller must ensure that `socket_fd` is a valid TCP socket file descriptor. +pub unsafe fn outq(socket_fd: RawFd) -> io::Result { + do_ioctl(socket_fd, TIOCOUTQ) +} diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 6a5dc3e749..0ffd4e851a 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1,5 +1,6 @@ use std::collections::HashMap; use std::num::NonZeroUsize; +use std::os::fd::RawFd; use std::pin::Pin; use std::sync::atomic::AtomicU64; use std::sync::{Arc, Mutex}; @@ -1439,7 +1440,13 @@ impl Drop for SmgrOpTimer { } impl SmgrOpFlushInProgress { - pub(crate) async fn measure(self, started_at: Instant, mut fut: Fut) -> O + /// The caller must guarantee that `socket_fd`` outlives this function. + pub(crate) async fn measure( + self, + started_at: Instant, + mut fut: Fut, + socket_fd: RawFd, + ) -> O where Fut: std::future::Future, { @@ -1470,8 +1477,24 @@ impl SmgrOpFlushInProgress { } else { "slow flush completed or cancelled" }; + + let (inq, outq) = { + // SAFETY: caller guarantees that `socket_fd` outlives this function. + #[cfg(target_os = "linux")] + unsafe { + ( + utils::linux_socket_ioctl::inq(socket_fd).unwrap_or(-2), + utils::linux_socket_ioctl::outq(socket_fd).unwrap_or(-2), + ) + } + #[cfg(not(target_os = "linux"))] + { + (-1, -1) + } + }; + let elapsed_total_secs = format!("{:.6}", elapsed_total.as_secs_f64()); - tracing::info!(elapsed_total_secs, msg); + tracing::info!(elapsed_total_secs, inq, outq, msg); } }, |mut observe| { diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index bc0ed4198b..e9d87dec71 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -73,6 +73,7 @@ use pageserver_api::models::PageTraceEvent; use pageserver_api::reltag::SlruKind; use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID; use postgres_ffi::BLCKSZ; +use std::os::fd::AsRawFd; /// How long we may wait for a [`crate::tenant::mgr::TenantSlot::InProgress`]` and/or a [`crate::tenant::Tenant`] which /// is not yet in state [`TenantState::Active`]. 
@@ -257,6 +258,8 @@ async fn page_service_conn_main( .set_nodelay(true) .context("could not set TCP_NODELAY")?; + let socket_fd = socket.as_raw_fd(); + let peer_addr = socket.peer_addr().context("get peer address")?; tracing::Span::current().record("peer_addr", field::display(peer_addr)); @@ -305,7 +308,7 @@ async fn page_service_conn_main( cancel.clone(), gate_guard, ); - let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?; + let pgbackend = PostgresBackend::new_from_io(socket_fd, socket, peer_addr, auth_type, None)?; match pgbackend.run(&mut conn_handler, &cancel).await { Ok(()) => { @@ -1286,12 +1289,15 @@ impl PageServerHandler { ))?; // what we want to do + let socket_fd = pgb_writer.socket_fd; let flush_fut = pgb_writer.flush(); // metric for how long flushing takes let flush_fut = match flushing_timer { - Some(flushing_timer) => { - futures::future::Either::Left(flushing_timer.measure(Instant::now(), flush_fut)) - } + Some(flushing_timer) => futures::future::Either::Left(flushing_timer.measure( + Instant::now(), + flush_fut, + socket_fd, + )), None => futures::future::Either::Right(flush_fut), }; // do it while respecting cancellation diff --git a/safekeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs index 1ebcb060e7..e5ccbb3230 100644 --- a/safekeeper/src/wal_service.rs +++ b/safekeeper/src/wal_service.rs @@ -13,6 +13,8 @@ use tokio_util::sync::CancellationToken; use tracing::*; use utils::{auth::Scope, measured_stream::MeasuredStream}; +use std::os::fd::AsRawFd; + use crate::metrics::TrafficMetrics; use crate::SafeKeeperConf; use crate::{handler::SafekeeperPostgresHandler, GlobalTimelines}; @@ -62,6 +64,7 @@ async fn handle_socket( global_timelines: Arc, ) -> Result<(), QueryError> { socket.set_nodelay(true)?; + let socket_fd = socket.as_raw_fd(); let peer_addr = socket.peer_addr()?; // Set timeout on reading from the socket. It prevents hanged up connection @@ -107,7 +110,7 @@ async fn handle_socket( auth_pair, global_timelines, ); - let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?; + let pgbackend = PostgresBackend::new_from_io(socket_fd, socket, peer_addr, auth_type, None)?; // libpq protocol between safekeeper and walproposer / pageserver // We don't use shutdown. pgbackend From 9177312ba6bd1b8ba85e77d4490517a7f4c01ec5 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 14 Feb 2025 18:57:18 +0100 Subject: [PATCH 481/583] basebackup: use `Timeline::get` for `get_rel` instead of `get_rel_page_at_lsn` (#10476) I noticed the opportunity to simplify here while working on https://github.com/neondatabase/neon/pull/9353 . The only difference is the zero-fill behavior: if one reads past rel size, `get_rel_page_at_lsn` returns a zeroed page whereas `Timeline::get` returns an error. However, the `endblk` is at most rel size large, because `nblocks` is eq `get_rel_size`, see a few lines above this change. We're using the same LSN (`self.lsn`) for everything, so there is no chance of non-determinism. 
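To make that argument concrete, a self-contained toy model — not pageserver code; `get_zero_filling` and `get_strict` are made-up stand-ins for the zero-filling and erroring read paths — showing that the two can only diverge on a block number at or past the relation size, which the `startblk..endblk` loop never produces:

```
fn get_zero_filling(pages: &[Vec<u8>], blkno: usize) -> Vec<u8> {
    // Reads past the end yield a zeroed page.
    pages.get(blkno).cloned().unwrap_or_else(|| vec![0u8; 8192])
}

fn get_strict(pages: &[Vec<u8>], blkno: usize) -> Result<Vec<u8>, String> {
    // Reads past the end are an error.
    pages
        .get(blkno)
        .cloned()
        .ok_or_else(|| format!("missing key for block {blkno}"))
}

fn main() {
    let pages = vec![vec![1u8; 8192]; 4]; // nblocks = 4
    let (startblk, endblk) = (0usize, pages.len()); // endblk is bounded by nblocks
    for blkno in startblk..endblk {
        // Within 0..nblocks the zero-fill path is never taken, so both agree.
        assert_eq!(
            get_zero_filling(&pages, blkno),
            get_strict(&pages, blkno).unwrap()
        );
    }
}
```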
Refs: - Slack discussion debating correctness: https://neondb.slack.com/archives/C033RQ5SPDH/p1737457010607119 --- pageserver/src/basebackup.rs | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 25078b57c8..e03b1bbe96 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -13,7 +13,7 @@ use anyhow::{anyhow, Context}; use bytes::{BufMut, Bytes, BytesMut}; use fail::fail_point; -use pageserver_api::key::Key; +use pageserver_api::key::{rel_block_to_key, Key}; use postgres_ffi::pg_constants; use std::fmt::Write as FmtWrite; use std::time::{Instant, SystemTime}; @@ -501,13 +501,9 @@ where for blknum in startblk..endblk { let img = self .timeline - .get_rel_page_at_lsn( - src, - blknum, - Version::Lsn(self.lsn), - self.ctx, - self.io_concurrency.clone(), - ) + // TODO: investigate using get_vectored for the entire startblk..endblk range. + // But this code path is not on the critical path for most basebackups (?). + .get(rel_block_to_key(src, blknum), self.lsn, self.ctx) .await .map_err(|e| BasebackupError::Server(e.into()))?; segment_data.extend_from_slice(&img[..]); From a32e8871acc1922f8bfd8057c08a97e504b1dacc Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 14 Feb 2025 21:11:42 +0100 Subject: [PATCH 482/583] compute/pageserver: correlation of logs through backend PID (via `application_name`) (#10810) This PR makes compute set the `application_name` field to the PG backend process PID which is also included in each compute log line. This allows correlation of Pageserver connection logs with compute logs in a way that was guesswork before this PR. In future, we can switch for a more unique identifier for a page_service session. Refs - discussion in https://neondb.slack.com/archives/C08DE6Q9C3B/p1739465208296169?thread_ts=1739462628.361019&cid=C08DE6Q9C3B - fixes https://github.com/neondatabase/neon/issues/10808 --- pageserver/src/page_service.rs | 11 +++++++++-- pgxn/neon/libpagestore.c | 31 ++++++++++++++++++++++++------- 2 files changed, 33 insertions(+), 9 deletions(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index e9d87dec71..53a6a7124d 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -237,7 +237,7 @@ pub async fn libpq_listener_main( type ConnectionHandlerResult = anyhow::Result<()>; -#[instrument(skip_all, fields(peer_addr))] +#[instrument(skip_all, fields(peer_addr, application_name))] #[allow(clippy::too_many_arguments)] async fn page_service_conn_main( conf: &'static PageServerConf, @@ -2463,9 +2463,16 @@ where fn startup( &mut self, _pgb: &mut PostgresBackend, - _sm: &FeStartupPacket, + sm: &FeStartupPacket, ) -> Result<(), QueryError> { fail::fail_point!("ps::connection-start::startup-packet"); + + if let FeStartupPacket::StartupMessage { params, .. 
} = sm { + if let Some(app_name) = params.get("application_name") { + Span::current().record("application_name", field::display(app_name)); + } + }; + Ok(()) } diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 22aeb2e2d6..fc1aecd340 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -378,8 +378,9 @@ pageserver_connect(shardno_t shard_no, int elevel) { case PS_Disconnected: { - const char *keywords[3]; - const char *values[3]; + const char *keywords[4]; + const char *values[4]; + char pid_str[16]; int n_pgsql_params; TimestampTz now; int64 us_since_last_attempt; @@ -424,14 +425,30 @@ pageserver_connect(shardno_t shard_no, int elevel) * can override the password from the env variable. Seems useful, although * we don't currently use that capability anywhere. */ - keywords[0] = "dbname"; - values[0] = connstr; - n_pgsql_params = 1; + n_pgsql_params = 0; + + /* + * Pageserver logs include this in the connection's tracing span. + * This allows for reasier log correlation between compute and pageserver. + */ + keywords[n_pgsql_params] = "application_name"; + { + int ret = snprintf(pid_str, sizeof(pid_str), "%d", MyProcPid); + if (ret < 0 || ret >= (int)(sizeof(pid_str))) + elog(FATAL, "stack-allocated buffer too small to hold pid"); + } + /* lifetime: PQconnectStartParams strdups internally */ + values[n_pgsql_params] = (const char*) pid_str; + n_pgsql_params++; + + keywords[n_pgsql_params] = "dbname"; + values[n_pgsql_params] = connstr; + n_pgsql_params++; if (neon_auth_token) { - keywords[1] = "password"; - values[1] = neon_auth_token; + keywords[n_pgsql_params] = "password"; + values[n_pgsql_params] = neon_auth_token; n_pgsql_params++; } From ae091c6913066ad6f5ad9ef5a3115fe2ff7d7597 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Fri, 14 Feb 2025 15:31:54 -0500 Subject: [PATCH 483/583] feat(pageserver): store reldir in sparse keyspace (#10593) ## Problem Part of https://github.com/neondatabase/neon/issues/9516 ## Summary of changes This patch adds the support for storing reldir in the sparse keyspace. All logic are guarded with the `rel_size_v2_enabled` flag, so if it's set to false, the code path is exactly the same as what's currently in prod. Note that we did not persist the `rel_size_v2_enabled` flag and the logic around it will be implemented in the next patch. (i.e., what if we enabled it, restart the pageserver, and then it gets set to false? we should still read from v2 using the rel_size_v2_migration_status in the index_part). The persistence logic I'll implement in the next patch will disallow switching from v2->v1 via config item. I also refactored the metrics so that it can work with the new reldir store. However, this metric is not correctly computed for reldirs (see the comments) before. With the refactor, the value will be computed only when we have an initial value for the reldir size. The refactor keeps the incorrectness of the computation when there are more than 1 database. For the tests, we currently run all the tests with v2, and I'll set it to false and add some v2-specific tests before merging, probably also v1->v2 migration tests. 
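For orientation, a minimal self-contained sketch of the flag-guarded lookup this introduces — a hypothetical `Dirs` type, with plain tuples standing in for the real `RelTag` and key encoding; see the relation-exists read path in `pgdatadir_mapping.rs` in the diff below for the actual implementation. With the flag on, the sparse v2 marker is checked first and the old per-database `RelDirectory` listing only as a fallback, so relations created before the switch remain visible.

```
use std::collections::HashSet;

/// (spcnode, dbnode, relnode, forknum) — stand-in for the real key types.
type Rel = (u32, u32, u32, u8);

struct Dirs {
    rel_size_v2_enabled: bool,
    v2_markers: HashSet<Rel>, // sparse per-relation "exists" markers
    v1_rels: HashSet<Rel>,    // contents of the old RelDirectory blobs
}

impl Dirs {
    fn rel_exists(&self, rel: Rel) -> bool {
        // Fast path: the relation was recorded in the new sparse keyspace.
        if self.rel_size_v2_enabled && self.v2_markers.contains(&rel) {
            return true;
        }
        // Fallback: the old directory listing, so pre-switch relations still resolve.
        self.v1_rels.contains(&rel)
    }
}
```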
--------- Signed-off-by: Alex Chi Z --- libs/pageserver_api/src/config.rs | 4 +- libs/pageserver_api/src/key.rs | 112 ++++++++- pageserver/src/pgdatadir_mapping.rs | 322 +++++++++++++++++++++----- pageserver/src/tenant.rs | 9 +- pageserver/src/tenant/config.rs | 4 +- pageserver/src/tenant/timeline.rs | 54 ++++- test_runner/regress/test_relations.py | 68 ++++++ test_runner/regress/test_tenants.py | 3 +- 8 files changed, 507 insertions(+), 69 deletions(-) create mode 100644 test_runner/regress/test_relations.py diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 79f068a47b..e64052c73d 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -351,7 +351,7 @@ pub struct TenantConfigToml { /// Enable rel_size_v2 for this tenant. Once enabled, the tenant will persist this information into /// `index_part.json`, and it cannot be reversed. - pub rel_size_v2_enabled: Option, + pub rel_size_v2_enabled: bool, // gc-compaction related configs /// Enable automatic gc-compaction trigger on this tenant. @@ -633,7 +633,7 @@ impl Default for TenantConfigToml { lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS, timeline_offloading: true, wal_receiver_protocol_override: None, - rel_size_v2_enabled: None, + rel_size_v2_enabled: false, gc_compaction_enabled: DEFAULT_GC_COMPACTION_ENABLED, gc_compaction_initial_threshold_kb: DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB, gc_compaction_ratio_percent: DEFAULT_GC_COMPACTION_RATIO_PERCENT, diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index dbd45da314..b88a2e46a1 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -1,10 +1,12 @@ use anyhow::{bail, Result}; use byteorder::{ByteOrder, BE}; +use bytes::Bytes; use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::Oid; use postgres_ffi::RepOriginId; use serde::{Deserialize, Serialize}; use std::{fmt, ops::Range}; +use utils::const_assert; use crate::reltag::{BlockNumber, RelTag, SlruKind}; @@ -49,6 +51,64 @@ pub const AUX_KEY_PREFIX: u8 = 0x62; /// The key prefix of ReplOrigin keys. pub const REPL_ORIGIN_KEY_PREFIX: u8 = 0x63; +/// The key prefix of db directory keys. +pub const DB_DIR_KEY_PREFIX: u8 = 0x64; + +/// The key prefix of rel directory keys. +pub const REL_DIR_KEY_PREFIX: u8 = 0x65; + +#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)] +pub enum RelDirExists { + Exists, + Removed, +} + +#[derive(Debug)] +pub struct DecodeError; + +impl fmt::Display for DecodeError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "invalid marker") + } +} + +impl std::error::Error for DecodeError {} + +impl RelDirExists { + /// The value of the rel directory keys that indicates the existence of a relation. 
+ const REL_EXISTS_MARKER: Bytes = Bytes::from_static(b"r"); + + pub fn encode(&self) -> Bytes { + match self { + Self::Exists => Self::REL_EXISTS_MARKER.clone(), + Self::Removed => SPARSE_TOMBSTONE_MARKER.clone(), + } + } + + pub fn decode_option(data: Option>) -> Result { + match data { + Some(marker) if marker.as_ref() == Self::REL_EXISTS_MARKER => Ok(Self::Exists), + // Any other marker is invalid + Some(_) => Err(DecodeError), + None => Ok(Self::Removed), + } + } + + pub fn decode(data: impl AsRef<[u8]>) -> Result { + let data = data.as_ref(); + if data == Self::REL_EXISTS_MARKER { + Ok(Self::Exists) + } else if data == SPARSE_TOMBSTONE_MARKER { + Ok(Self::Removed) + } else { + Err(DecodeError) + } + } +} + +/// A tombstone in the sparse keyspace, which is an empty buffer. +pub const SPARSE_TOMBSTONE_MARKER: Bytes = Bytes::from_static(b""); + /// Check if the key falls in the range of metadata keys. pub const fn is_metadata_key_slice(key: &[u8]) -> bool { key[0] >= METADATA_KEY_BEGIN_PREFIX && key[0] < METADATA_KEY_END_PREFIX @@ -110,6 +170,24 @@ impl Key { } } + pub fn rel_dir_sparse_key_range() -> Range { + Key { + field1: REL_DIR_KEY_PREFIX, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, + }..Key { + field1: REL_DIR_KEY_PREFIX + 1, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, + } + } + /// This function checks more extensively what keys we can take on the write path. /// If a key beginning with 00 does not have a global/default tablespace OID, it /// will be rejected on the write path. @@ -440,6 +518,36 @@ pub fn rel_dir_to_key(spcnode: Oid, dbnode: Oid) -> Key { } } +#[inline(always)] +pub fn rel_tag_sparse_key(spcnode: Oid, dbnode: Oid, relnode: Oid, forknum: u8) -> Key { + Key { + field1: REL_DIR_KEY_PREFIX, + field2: spcnode, + field3: dbnode, + field4: relnode, + field5: forknum, + field6: 1, + } +} + +pub fn rel_tag_sparse_key_range(spcnode: Oid, dbnode: Oid) -> Range { + Key { + field1: REL_DIR_KEY_PREFIX, + field2: spcnode, + field3: dbnode, + field4: 0, + field5: 0, + field6: 0, + }..Key { + field1: REL_DIR_KEY_PREFIX, + field2: spcnode, + field3: dbnode, + field4: u32::MAX, + field5: u8::MAX, + field6: u32::MAX, + } // it's fine to exclude the last key b/c we only use field6 == 1 +} + #[inline(always)] pub fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key { Key { @@ -734,9 +842,9 @@ impl Key { self.field1 == RELATION_SIZE_PREFIX } - pub fn sparse_non_inherited_keyspace() -> Range { + pub const fn sparse_non_inherited_keyspace() -> Range { // The two keys are adjacent; if we will have non-adjancent keys in the future, we should return a keyspace - debug_assert_eq!(AUX_KEY_PREFIX + 1, REPL_ORIGIN_KEY_PREFIX); + const_assert!(AUX_KEY_PREFIX + 1 == REPL_ORIGIN_KEY_PREFIX); Key { field1: AUX_KEY_PREFIX, field2: 0, diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index f2dca8befa..ae2762bd1e 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -23,13 +23,14 @@ use anyhow::{ensure, Context}; use bytes::{Buf, Bytes, BytesMut}; use enum_map::Enum; use itertools::Itertools; -use pageserver_api::key::Key; use pageserver_api::key::{ dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key, - relmap_file_key, repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key, - slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range, - CompactKey, AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, 
DBDIR_KEY, TWOPHASEDIR_KEY, + rel_tag_sparse_key_range, relmap_file_key, repl_origin_key, repl_origin_key_range, + slru_block_to_key, slru_dir_to_key, slru_segment_key_range, slru_segment_size_to_key, + twophase_file_key, twophase_key_range, CompactKey, RelDirExists, AUX_FILES_KEY, CHECKPOINT_KEY, + CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY, }; +use pageserver_api::key::{rel_tag_sparse_key, Key}; use pageserver_api::keyspace::SparseKeySpace; use pageserver_api::record::NeonWalRecord; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; @@ -490,12 +491,33 @@ impl Timeline { if !dbdirs.contains_key(&(tag.spcnode, tag.dbnode)) { return Ok(false); } - // fetch directory listing + + // Read path: first read the new reldir keyspace. Early return if the relation exists. + // Otherwise, read the old reldir keyspace. + // TODO: if IndexPart::rel_size_migration is `Migrated`, we only need to read from v2. + + if self.get_rel_size_v2_enabled() { + // fetch directory listing (new) + let key = rel_tag_sparse_key(tag.spcnode, tag.dbnode, tag.relnode, tag.forknum); + let buf = RelDirExists::decode_option(version.sparse_get(self, key, ctx).await?) + .map_err(|_| PageReconstructError::Other(anyhow::anyhow!("invalid reldir key")))?; + let exists_v2 = buf == RelDirExists::Exists; + // Fast path: if the relation exists in the new format, return true. + // TODO: we should have a verification mode that checks both keyspaces + // to ensure the relation only exists in one of them. + if exists_v2 { + return Ok(true); + } + } + + // fetch directory listing (old) + let key = rel_dir_to_key(tag.spcnode, tag.dbnode); let buf = version.get(self, key, ctx).await?; let dir = RelDirectory::des(&buf)?; - Ok(dir.rels.contains(&(tag.relnode, tag.forknum))) + let exists_v1 = dir.rels.contains(&(tag.relnode, tag.forknum)); + Ok(exists_v1) } /// Get a list of all existing relations in given tablespace and database. @@ -513,12 +535,12 @@ impl Timeline { version: Version<'_>, ctx: &RequestContext, ) -> Result, PageReconstructError> { - // fetch directory listing + // fetch directory listing (old) let key = rel_dir_to_key(spcnode, dbnode); let buf = version.get(self, key, ctx).await?; let dir = RelDirectory::des(&buf)?; - let rels: HashSet = + let rels_v1: HashSet = HashSet::from_iter(dir.rels.iter().map(|(relnode, forknum)| RelTag { spcnode, dbnode, @@ -526,6 +548,46 @@ impl Timeline { forknum: *forknum, })); + if !self.get_rel_size_v2_enabled() { + return Ok(rels_v1); + } + + // scan directory listing (new), merge with the old results + let key_range = rel_tag_sparse_key_range(spcnode, dbnode); + let io_concurrency = IoConcurrency::spawn_from_conf( + self.conf, + self.gate + .enter() + .map_err(|_| PageReconstructError::Cancelled)?, + ); + let results = self + .scan( + KeySpace::single(key_range), + version.get_lsn(), + ctx, + io_concurrency, + ) + .await?; + let mut rels = rels_v1; + for (key, val) in results { + let val = RelDirExists::decode(&val?) 
+ .map_err(|_| PageReconstructError::Other(anyhow::anyhow!("invalid reldir key")))?; + assert_eq!(key.field6, 1); + assert_eq!(key.field2, spcnode); + assert_eq!(key.field3, dbnode); + let tag = RelTag { + spcnode, + dbnode, + relnode: key.field4, + forknum: key.field5, + }; + if val == RelDirExists::Removed { + debug_assert!(!rels.contains(&tag), "removed reltag in v2"); + continue; + } + let did_not_contain = rels.insert(tag); + debug_assert!(did_not_contain, "duplicate reltag in v2"); + } Ok(rels) } @@ -1144,7 +1206,11 @@ impl Timeline { let dense_keyspace = result.to_keyspace(); let sparse_keyspace = SparseKeySpace(KeySpace { - ranges: vec![Key::metadata_aux_key_range(), repl_origin_key_range()], + ranges: vec![ + Key::metadata_aux_key_range(), + repl_origin_key_range(), + Key::rel_dir_sparse_key_range(), + ], }); if cfg!(debug_assertions) { @@ -1274,12 +1340,22 @@ pub struct DatadirModification<'a> { /// For special "directory" keys that store key-value maps, track the size of the map /// if it was updated in this modification. - pending_directory_entries: Vec<(DirectoryKind, usize)>, + pending_directory_entries: Vec<(DirectoryKind, MetricsUpdate)>, /// An **approximation** of how many metadata bytes will be written to the EphemeralFile. pending_metadata_bytes: usize, } +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum MetricsUpdate { + /// Set the metrics to this value + Set(u64), + /// Increment the metrics by this value + Add(u64), + /// Decrement the metrics by this value + Sub(u64), +} + impl DatadirModification<'_> { // When a DatadirModification is committed, we do a monolithic serialization of all its contents. WAL records can // contain multiple pages, so the pageserver's record-based batch size isn't sufficient to bound this allocation: we @@ -1359,7 +1435,8 @@ impl DatadirModification<'_> { let buf = DbDirectory::ser(&DbDirectory { dbdirs: HashMap::new(), })?; - self.pending_directory_entries.push((DirectoryKind::Db, 0)); + self.pending_directory_entries + .push((DirectoryKind::Db, MetricsUpdate::Set(0))); self.put(DBDIR_KEY, Value::Image(buf.into())); let buf = if self.tline.pg_version >= 17 { @@ -1372,7 +1449,7 @@ impl DatadirModification<'_> { }) }?; self.pending_directory_entries - .push((DirectoryKind::TwoPhase, 0)); + .push((DirectoryKind::TwoPhase, MetricsUpdate::Set(0))); self.put(TWOPHASEDIR_KEY, Value::Image(buf.into())); let buf: Bytes = SlruSegmentDirectory::ser(&SlruSegmentDirectory::default())?.into(); @@ -1382,17 +1459,23 @@ impl DatadirModification<'_> { // harmless but they'd just be dropped on later compaction. 
if self.tline.tenant_shard_id.is_shard_zero() { self.put(slru_dir_to_key(SlruKind::Clog), empty_dir.clone()); - self.pending_directory_entries - .push((DirectoryKind::SlruSegment(SlruKind::Clog), 0)); + self.pending_directory_entries.push(( + DirectoryKind::SlruSegment(SlruKind::Clog), + MetricsUpdate::Set(0), + )); self.put( slru_dir_to_key(SlruKind::MultiXactMembers), empty_dir.clone(), ); - self.pending_directory_entries - .push((DirectoryKind::SlruSegment(SlruKind::Clog), 0)); + self.pending_directory_entries.push(( + DirectoryKind::SlruSegment(SlruKind::Clog), + MetricsUpdate::Set(0), + )); self.put(slru_dir_to_key(SlruKind::MultiXactOffsets), empty_dir); - self.pending_directory_entries - .push((DirectoryKind::SlruSegment(SlruKind::MultiXactOffsets), 0)); + self.pending_directory_entries.push(( + DirectoryKind::SlruSegment(SlruKind::MultiXactOffsets), + MetricsUpdate::Set(0), + )); } Ok(()) @@ -1658,10 +1741,16 @@ impl DatadirModification<'_> { } if r.is_none() { // Create RelDirectory + // TODO: if we have fully migrated to v2, no need to create this directory let buf = RelDirectory::ser(&RelDirectory { rels: HashSet::new(), })?; - self.pending_directory_entries.push((DirectoryKind::Rel, 0)); + self.pending_directory_entries + .push((DirectoryKind::Rel, MetricsUpdate::Set(0))); + if self.tline.get_rel_size_v2_enabled() { + self.pending_directory_entries + .push((DirectoryKind::RelV2, MetricsUpdate::Set(0))); + } self.put( rel_dir_to_key(spcnode, dbnode), Value::Image(Bytes::from(buf)), @@ -1685,8 +1774,10 @@ impl DatadirModification<'_> { if !dir.xids.insert(xid) { anyhow::bail!("twophase file for xid {} already exists", xid); } - self.pending_directory_entries - .push((DirectoryKind::TwoPhase, dir.xids.len())); + self.pending_directory_entries.push(( + DirectoryKind::TwoPhase, + MetricsUpdate::Set(dir.xids.len() as u64), + )); Bytes::from(TwoPhaseDirectoryV17::ser(&dir)?) } else { let xid = xid as u32; @@ -1694,8 +1785,10 @@ impl DatadirModification<'_> { if !dir.xids.insert(xid) { anyhow::bail!("twophase file for xid {} already exists", xid); } - self.pending_directory_entries - .push((DirectoryKind::TwoPhase, dir.xids.len())); + self.pending_directory_entries.push(( + DirectoryKind::TwoPhase, + MetricsUpdate::Set(dir.xids.len() as u64), + )); Bytes::from(TwoPhaseDirectory::ser(&dir)?) }; self.put(TWOPHASEDIR_KEY, Value::Image(newdirbuf)); @@ -1744,8 +1837,10 @@ impl DatadirModification<'_> { let mut dir = DbDirectory::des(&buf)?; if dir.dbdirs.remove(&(spcnode, dbnode)).is_some() { let buf = DbDirectory::ser(&dir)?; - self.pending_directory_entries - .push((DirectoryKind::Db, dir.dbdirs.len())); + self.pending_directory_entries.push(( + DirectoryKind::Db, + MetricsUpdate::Set(dir.dbdirs.len() as u64), + )); self.put(DBDIR_KEY, Value::Image(buf.into())); } else { warn!( @@ -1778,39 +1873,85 @@ impl DatadirModification<'_> { // tablespace. Create the reldir entry for it if so. let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await.context("read db")?) .context("deserialize db")?; - let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode); - let mut rel_dir = + + let dbdir_exists = if let hash_map::Entry::Vacant(e) = dbdir.dbdirs.entry((rel.spcnode, rel.dbnode)) { // Didn't exist. 
Update dbdir e.insert(false); let buf = DbDirectory::ser(&dbdir).context("serialize db")?; - self.pending_directory_entries - .push((DirectoryKind::Db, dbdir.dbdirs.len())); + self.pending_directory_entries.push(( + DirectoryKind::Db, + MetricsUpdate::Set(dbdir.dbdirs.len() as u64), + )); self.put(DBDIR_KEY, Value::Image(buf.into())); - - // and create the RelDirectory - RelDirectory::default() + false } else { - // reldir already exists, fetch it - RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?) - .context("deserialize db")? + true }; + let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode); + let mut rel_dir = if !dbdir_exists { + // Create the RelDirectory + RelDirectory::default() + } else { + // reldir already exists, fetch it + RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?) + .context("deserialize db")? + }; + // Add the new relation to the rel directory entry, and write it back if !rel_dir.rels.insert((rel.relnode, rel.forknum)) { return Err(RelationError::AlreadyExists); } - self.pending_directory_entries - .push((DirectoryKind::Rel, rel_dir.rels.len())); - - self.put( - rel_dir_key, - Value::Image(Bytes::from( - RelDirectory::ser(&rel_dir).context("serialize")?, - )), - ); - + if self.tline.get_rel_size_v2_enabled() { + let sparse_rel_dir_key = + rel_tag_sparse_key(rel.spcnode, rel.dbnode, rel.relnode, rel.forknum); + // check if the rel_dir_key exists in v2 + let val = self + .sparse_get(sparse_rel_dir_key, ctx) + .await + .map_err(|e| RelationError::Other(e.into()))?; + let val = RelDirExists::decode_option(val) + .map_err(|_| RelationError::Other(anyhow::anyhow!("invalid reldir key")))?; + if val == RelDirExists::Exists { + return Err(RelationError::AlreadyExists); + } + self.put( + sparse_rel_dir_key, + Value::Image(RelDirExists::Exists.encode()), + ); + if !dbdir_exists { + self.pending_directory_entries + .push((DirectoryKind::Rel, MetricsUpdate::Set(0))); + self.pending_directory_entries + .push((DirectoryKind::RelV2, MetricsUpdate::Set(0))); + // We don't write `rel_dir_key -> rel_dir.rels` back to the storage in the v2 path unless it's the initial creation. + // TODO: if we have fully migrated to v2, no need to create this directory. Otherwise, there + // will be key not found errors if we don't create an empty one for rel_size_v2. + self.put( + rel_dir_key, + Value::Image(Bytes::from( + RelDirectory::ser(&RelDirectory::default()).context("serialize")?, + )), + ); + } + self.pending_directory_entries + .push((DirectoryKind::RelV2, MetricsUpdate::Add(1))); + } else { + if !dbdir_exists { + self.pending_directory_entries + .push((DirectoryKind::Rel, MetricsUpdate::Set(0))) + } + self.pending_directory_entries + .push((DirectoryKind::Rel, MetricsUpdate::Add(1))); + self.put( + rel_dir_key, + Value::Image(Bytes::from( + RelDirectory::ser(&rel_dir).context("serialize")?, + )), + ); + } // Put size let size_key = rel_size_to_key(rel); let buf = nblocks.to_le_bytes(); @@ -1896,9 +2037,34 @@ impl DatadirModification<'_> { let mut dirty = false; for rel_tag in rel_tags { - if dir.rels.remove(&(rel_tag.relnode, rel_tag.forknum)) { + let found = if dir.rels.remove(&(rel_tag.relnode, rel_tag.forknum)) { + self.pending_directory_entries + .push((DirectoryKind::Rel, MetricsUpdate::Sub(1))); dirty = true; + true + } else if self.tline.get_rel_size_v2_enabled() { + // The rel is not found in the old reldir key, so we need to check the new sparse keyspace. 
+ // Note that a relation can only exist in one of the two keyspaces (guaranteed by the ingestion + // logic). + let key = + rel_tag_sparse_key(spc_node, db_node, rel_tag.relnode, rel_tag.forknum); + let val = RelDirExists::decode_option(self.sparse_get(key, ctx).await?) + .map_err(|_| RelationError::Other(anyhow::anyhow!("invalid reldir key")))?; + if val == RelDirExists::Exists { + self.pending_directory_entries + .push((DirectoryKind::RelV2, MetricsUpdate::Sub(1))); + // put tombstone + self.put(key, Value::Image(RelDirExists::Removed.encode())); + // no need to set dirty to true + true + } else { + false + } + } else { + false + }; + if found { // update logical size let size_key = rel_size_to_key(rel_tag); let old_size = self.get(size_key, ctx).await?.get_u32_le(); @@ -1914,8 +2080,6 @@ impl DatadirModification<'_> { if dirty { self.put(dir_key, Value::Image(Bytes::from(RelDirectory::ser(&dir)?))); - self.pending_directory_entries - .push((DirectoryKind::Rel, dir.rels.len())); } } @@ -1939,8 +2103,10 @@ impl DatadirModification<'_> { if !dir.segments.insert(segno) { anyhow::bail!("slru segment {kind:?}/{segno} already exists"); } - self.pending_directory_entries - .push((DirectoryKind::SlruSegment(kind), dir.segments.len())); + self.pending_directory_entries.push(( + DirectoryKind::SlruSegment(kind), + MetricsUpdate::Set(dir.segments.len() as u64), + )); self.put( dir_key, Value::Image(Bytes::from(SlruSegmentDirectory::ser(&dir)?)), @@ -1987,8 +2153,10 @@ impl DatadirModification<'_> { if !dir.segments.remove(&segno) { warn!("slru segment {:?}/{} does not exist", kind, segno); } - self.pending_directory_entries - .push((DirectoryKind::SlruSegment(kind), dir.segments.len())); + self.pending_directory_entries.push(( + DirectoryKind::SlruSegment(kind), + MetricsUpdate::Set(dir.segments.len() as u64), + )); self.put( dir_key, Value::Image(Bytes::from(SlruSegmentDirectory::ser(&dir)?)), @@ -2020,8 +2188,10 @@ impl DatadirModification<'_> { if !dir.xids.remove(&xid) { warn!("twophase file for xid {} does not exist", xid); } - self.pending_directory_entries - .push((DirectoryKind::TwoPhase, dir.xids.len())); + self.pending_directory_entries.push(( + DirectoryKind::TwoPhase, + MetricsUpdate::Set(dir.xids.len() as u64), + )); Bytes::from(TwoPhaseDirectoryV17::ser(&dir)?) } else { let xid: u32 = u32::try_from(xid)?; @@ -2030,8 +2200,10 @@ impl DatadirModification<'_> { if !dir.xids.remove(&xid) { warn!("twophase file for xid {} does not exist", xid); } - self.pending_directory_entries - .push((DirectoryKind::TwoPhase, dir.xids.len())); + self.pending_directory_entries.push(( + DirectoryKind::TwoPhase, + MetricsUpdate::Set(dir.xids.len() as u64), + )); Bytes::from(TwoPhaseDirectory::ser(&dir)?) }; self.put(TWOPHASEDIR_KEY, Value::Image(newdirbuf)); @@ -2147,7 +2319,7 @@ impl DatadirModification<'_> { } for (kind, count) in std::mem::take(&mut self.pending_directory_entries) { - writer.update_directory_entries_count(kind, count as u64); + writer.update_directory_entries_count(kind, count); } Ok(()) @@ -2233,7 +2405,7 @@ impl DatadirModification<'_> { } for (kind, count) in std::mem::take(&mut self.pending_directory_entries) { - writer.update_directory_entries_count(kind, count as u64); + writer.update_directory_entries_count(kind, count); } self.pending_metadata_bytes = 0; @@ -2297,6 +2469,22 @@ impl DatadirModification<'_> { self.tline.get(key, lsn, ctx).await } + /// Get a key from the sparse keyspace. Automatically converts the missing key error + /// and the empty value into None. 
+ async fn sparse_get( + &self, + key: Key, + ctx: &RequestContext, + ) -> Result, PageReconstructError> { + let val = self.get(key, ctx).await; + match val { + Ok(val) if val.is_empty() => Ok(None), + Ok(val) => Ok(Some(val)), + Err(PageReconstructError::MissingKey(_)) => Ok(None), + Err(e) => Err(e), + } + } + fn put(&mut self, key: Key, val: Value) { if Self::is_data_key(&key) { self.put_data(key.to_compact(), val) @@ -2379,6 +2567,23 @@ impl Version<'_> { } } + /// Get a key from the sparse keyspace. Automatically converts the missing key error + /// and the empty value into None. + async fn sparse_get( + &self, + timeline: &Timeline, + key: Key, + ctx: &RequestContext, + ) -> Result, PageReconstructError> { + let val = self.get(timeline, key, ctx).await; + match val { + Ok(val) if val.is_empty() => Ok(None), + Ok(val) => Ok(Some(val)), + Err(PageReconstructError::MissingKey(_)) => Ok(None), + Err(e) => Err(e), + } + } + fn get_lsn(&self) -> Lsn { match self { Version::Lsn(lsn) => *lsn, @@ -2438,6 +2643,7 @@ pub(crate) enum DirectoryKind { Rel, AuxFiles, SlruSegment(SlruKind), + RelV2, } impl DirectoryKind { diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index dec585ff65..5a2c5c0c46 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3924,6 +3924,13 @@ impl Tenant { .unwrap_or(self.conf.default_tenant_conf.compaction_threshold) } + pub fn get_rel_size_v2_enabled(&self) -> bool { + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); + tenant_conf + .rel_size_v2_enabled + .unwrap_or(self.conf.default_tenant_conf.rel_size_v2_enabled) + } + pub fn get_compaction_upper_limit(&self) -> usize { let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf @@ -5640,7 +5647,7 @@ pub(crate) mod harness { lsn_lease_length_for_ts: Some(tenant_conf.lsn_lease_length_for_ts), timeline_offloading: Some(tenant_conf.timeline_offloading), wal_receiver_protocol_override: tenant_conf.wal_receiver_protocol_override, - rel_size_v2_enabled: tenant_conf.rel_size_v2_enabled, + rel_size_v2_enabled: Some(tenant_conf.rel_size_v2_enabled), gc_compaction_enabled: Some(tenant_conf.gc_compaction_enabled), gc_compaction_initial_threshold_kb: Some( tenant_conf.gc_compaction_initial_threshold_kb, diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 7fdfd736ad..c6bcfdf2fb 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -485,7 +485,9 @@ impl TenantConfOpt { wal_receiver_protocol_override: self .wal_receiver_protocol_override .or(global_conf.wal_receiver_protocol_override), - rel_size_v2_enabled: self.rel_size_v2_enabled.or(global_conf.rel_size_v2_enabled), + rel_size_v2_enabled: self + .rel_size_v2_enabled + .unwrap_or(global_conf.rel_size_v2_enabled), gc_compaction_enabled: self .gc_compaction_enabled .unwrap_or(global_conf.gc_compaction_enabled), diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 782b7d88b0..277dce7761 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -117,7 +117,7 @@ use pageserver_api::config::tenant_conf_defaults::DEFAULT_PITR_INTERVAL; use crate::config::PageServerConf; use crate::keyspace::{KeyPartitioning, KeySpace}; use crate::metrics::{TimelineMetrics, DELTAS_PER_READ_GLOBAL, LAYERS_PER_READ_GLOBAL}; -use crate::pgdatadir_mapping::CalculateLogicalSizeError; +use crate::pgdatadir_mapping::{CalculateLogicalSizeError, MetricsUpdate}; use crate::tenant::config::TenantConfOpt; use 
pageserver_api::reltag::RelTag; use pageserver_api::shard::ShardIndex; @@ -327,6 +327,7 @@ pub struct Timeline { // in `crate::page_service` writes these metrics. pub(crate) query_metrics: crate::metrics::SmgrQueryTimePerTimeline, + directory_metrics_inited: [AtomicBool; DirectoryKind::KINDS_NUM], directory_metrics: [AtomicU64; DirectoryKind::KINDS_NUM], /// Ensures layers aren't frozen by checkpointer between @@ -2355,6 +2356,14 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.compaction_threshold) } + pub(crate) fn get_rel_size_v2_enabled(&self) -> bool { + let tenant_conf = self.tenant_conf.load(); + tenant_conf + .tenant_conf + .rel_size_v2_enabled + .unwrap_or(self.conf.default_tenant_conf.rel_size_v2_enabled) + } + fn get_compaction_upper_limit(&self) -> usize { let tenant_conf = self.tenant_conf.load(); tenant_conf @@ -2664,6 +2673,7 @@ impl Timeline { ), directory_metrics: array::from_fn(|_| AtomicU64::new(0)), + directory_metrics_inited: array::from_fn(|_| AtomicBool::new(false)), flush_loop_state: Mutex::new(FlushLoopState::NotStarted), @@ -3430,8 +3440,42 @@ impl Timeline { } } - pub(crate) fn update_directory_entries_count(&self, kind: DirectoryKind, count: u64) { - self.directory_metrics[kind.offset()].store(count, AtomicOrdering::Relaxed); + pub(crate) fn update_directory_entries_count(&self, kind: DirectoryKind, count: MetricsUpdate) { + // TODO: this directory metrics is not correct -- we could have multiple reldirs in the system + // for each of the database, but we only store one value, and therefore each pgdirmodification + // would overwrite the previous value if they modify different databases. + + match count { + MetricsUpdate::Set(count) => { + self.directory_metrics[kind.offset()].store(count, AtomicOrdering::Relaxed); + self.directory_metrics_inited[kind.offset()].store(true, AtomicOrdering::Relaxed); + } + MetricsUpdate::Add(count) => { + // TODO: these operations are not atomic; but we only have one writer to the metrics, so + // it's fine. + if self.directory_metrics_inited[kind.offset()].load(AtomicOrdering::Relaxed) { + // The metrics has been initialized with `MetricsUpdate::Set` before, so we can add/sub + // the value reliably. + self.directory_metrics[kind.offset()].fetch_add(count, AtomicOrdering::Relaxed); + } + // Otherwise, ignore this update + } + MetricsUpdate::Sub(count) => { + // TODO: these operations are not atomic; but we only have one writer to the metrics, so + // it's fine. + if self.directory_metrics_inited[kind.offset()].load(AtomicOrdering::Relaxed) { + // The metrics has been initialized with `MetricsUpdate::Set` before. + // The operation could overflow so we need to normalize the value. + let prev_val = + self.directory_metrics[kind.offset()].load(AtomicOrdering::Relaxed); + let res = prev_val.saturating_sub(count); + self.directory_metrics[kind.offset()].store(res, AtomicOrdering::Relaxed); + } + // Otherwise, ignore this update + } + }; + + // TODO: remove this, there's no place in the code that updates this aux metrics. let aux_metric = self.directory_metrics[DirectoryKind::AuxFiles.offset()].load(AtomicOrdering::Relaxed); @@ -3649,7 +3693,9 @@ impl Timeline { // space. If that's not the case, we had at least one key encounter a gap in the image layer // and stop the search as a result of that. let mut removed = keyspace.remove_overlapping_with(&image_covered_keyspace); - // Do not fire missing key error for sparse keys. + // Do not fire missing key error and end early for sparse keys. 
Note that we hava already removed + // non-inherited keyspaces before, so we can safely do a full `SPARSE_RANGE` remove instead of + // figuring out what is the inherited key range and do a fine-grained pruning. removed.remove_overlapping_with(&KeySpace { ranges: vec![SPARSE_RANGE], }); diff --git a/test_runner/regress/test_relations.py b/test_runner/regress/test_relations.py new file mode 100644 index 0000000000..3e29c92a96 --- /dev/null +++ b/test_runner/regress/test_relations.py @@ -0,0 +1,68 @@ +from __future__ import annotations + +from fixtures.neon_fixtures import ( + NeonEnvBuilder, +) + + +def test_pageserver_reldir_v2( + neon_env_builder: NeonEnvBuilder, +): + env = neon_env_builder.init_start( + initial_tenant_conf={ + "rel_size_v2_enabled": "false", + } + ) + + endpoint = env.endpoints.create_start("main") + # Create a relation in v1 + endpoint.safe_psql("CREATE TABLE foo1 (id INTEGER PRIMARY KEY, val text)") + endpoint.safe_psql("CREATE TABLE foo2 (id INTEGER PRIMARY KEY, val text)") + + # Switch to v2 + env.pageserver.http_client().update_tenant_config( + env.initial_tenant, + { + "rel_size_v2_enabled": True, + }, + ) + + # Check if both relations are still accessible + endpoint.safe_psql("SELECT * FROM foo1") + endpoint.safe_psql("SELECT * FROM foo2") + + # Restart the endpoint + endpoint.stop() + endpoint.start() + + # Check if both relations are still accessible again after restart + endpoint.safe_psql("SELECT * FROM foo1") + endpoint.safe_psql("SELECT * FROM foo2") + + # Create a relation in v2 + endpoint.safe_psql("CREATE TABLE foo3 (id INTEGER PRIMARY KEY, val text)") + # Delete a relation in v1 + endpoint.safe_psql("DROP TABLE foo1") + + # Check if both relations are still accessible + endpoint.safe_psql("SELECT * FROM foo2") + endpoint.safe_psql("SELECT * FROM foo3") + + # Restart the endpoint + endpoint.stop() + # This will acquire a basebackup, which lists all relations. + endpoint.start() + + # Check if both relations are still accessible + endpoint.safe_psql("DROP TABLE IF EXISTS foo1") + endpoint.safe_psql("SELECT * FROM foo2") + endpoint.safe_psql("SELECT * FROM foo3") + + endpoint.safe_psql("DROP TABLE foo3") + endpoint.stop() + endpoint.start() + + # Check if relations are still accessible + endpoint.safe_psql("DROP TABLE IF EXISTS foo1") + endpoint.safe_psql("SELECT * FROM foo2") + endpoint.safe_psql("DROP TABLE IF EXISTS foo3") diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index b4c968b217..afe444f227 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -481,7 +481,8 @@ def test_pageserver_metrics_many_relations(neon_env_builder: NeonEnvBuilder): counts = timeline_detail["directory_entries_counts"] assert counts log.info(f"directory counts: {counts}") - assert counts[2] > COUNT_AT_LEAST_EXPECTED + # We need to add up reldir v1 + v2 counts + assert counts[2] + counts[7] > COUNT_AT_LEAST_EXPECTED def test_timelines_parallel_endpoints(neon_simple_env: NeonEnv): From 2ec8dff6f77c605c34b6a6ed9b6e4e1b56229f26 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Sat, 15 Feb 2025 10:34:11 +0000 Subject: [PATCH 484/583] CI(build-and-test-locally): set `session-timeout` for pytest (#10831) ## Problem Sometimes, a regression test run gets stuck (taking more than 60 minutes) and is killed by GitHub's `timeout-minutes` without leaving any traces in the test results database. I find no correlation between this and either the build type, the architecture, or the Postgres version. 
See: https://neonprod.grafana.net/goto/nM7ih7cHR?orgId=1 ## Summary of changes - Bump `pytest-timeout` to the version that supports `--session-timeout` - Set `--session-timeout` to (timeout-minutes - 10 minutes) * 60 seconds in Attempt to stop tests gracefully to generate test reports until they are forcibly stopped by the stricter `timeout-minutes` limit. --- .github/workflows/_build-and-test-locally.yml | 4 ++++ poetry.lock | 12 ++++++------ pyproject.toml | 2 +- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 86a791497c..3740e6dc9c 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -348,6 +348,10 @@ jobs: rerun_failed: true pg_version: ${{ matrix.pg_version }} aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + # `--session-timeout` is equal to (timeout-minutes - 10 minutes) * 60 seconds. + # Attempt to stop tests gracefully to generate test reports + # until they are forcibly stopped by the stricter `timeout-minutes` limit. + extra_params: --session-timeout=${{ inputs.sanitizers != 'enabled' && 3000 || 10200 }} env: TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} CHECK_ONDISK_DATA_COMPATIBILITY: nonempty diff --git a/poetry.lock b/poetry.lock index e2c71ca012..d66c3aae7a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2771,18 +2771,18 @@ pytest = ">=5,<8" [[package]] name = "pytest-timeout" -version = "2.1.0" +version = "2.3.1" description = "pytest plugin to abort hanging tests" optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" groups = ["main"] files = [ - {file = "pytest-timeout-2.1.0.tar.gz", hash = "sha256:c07ca07404c612f8abbe22294b23c368e2e5104b521c1790195561f37e1ac3d9"}, - {file = "pytest_timeout-2.1.0-py3-none-any.whl", hash = "sha256:f6f50101443ce70ad325ceb4473c4255e9d74e3c7cd0ef827309dfa4c0d975c6"}, + {file = "pytest-timeout-2.3.1.tar.gz", hash = "sha256:12397729125c6ecbdaca01035b9e5239d4db97352320af155b3f5de1ba5165d9"}, + {file = "pytest_timeout-2.3.1-py3-none-any.whl", hash = "sha256:68188cb703edfc6a18fad98dc25a3c61e9f24d644b0b70f33af545219fc7813e"}, ] [package.dependencies] -pytest = ">=5.0.0" +pytest = ">=7.0.0" [[package]] name = "pytest-xdist" @@ -3820,4 +3820,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.1" python-versions = "^3.11" -content-hash = "03697c0a4d438ef088b0d397b8f0570aa3998ccf833fe612400824792498878b" +content-hash = "00ddc42c32e235b6171845fc066dcab078282ed832cd464d5e8a0afa959dd04a" diff --git a/pyproject.toml b/pyproject.toml index 51cd68e002..92a660c233 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,7 @@ moto = {extras = ["server"], version = "^5.0.6"} backoff = "^2.2.1" pytest-lazy-fixture = "^0.6.3" prometheus-client = "^0.14.1" -pytest-timeout = "^2.1.0" +pytest-timeout = "^2.3.1" Werkzeug = "^3.0.6" pytest-order = "^1.1.0" allure-pytest = "^2.13.2" From 2dae0612dd429ea293fa273350c725e702528f6d Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sun, 16 Feb 2025 02:01:19 +0200 Subject: [PATCH 485/583] fast_import: Fix shared_buffers setting (#10837) In commit 9537829ccd I made shared_buffers be derived from the system's available RAM. However, I failed to remove the old hard-coded shared_buffers=10GB settings, shared_buffers was set twice. Oopsie. 
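For context, the intent is that a single `shared_buffers` value is computed from the memory given to the import job (`memory_mb` in `fast_import`) and passed once via `-c`. Below is a minimal illustrative sketch of that idea; the 25% fraction and 128 MB floor are assumptions for illustration only, not the actual formula from commit 9537829ccd.

```rust
// Illustrative only: the real derivation was added in commit 9537829ccd and is
// not part of this patch; the fraction and floor below are assumptions.
fn derive_shared_buffers_mb(memory_mb: usize) -> usize {
    // Use a quarter of the available RAM, but never less than 128 MB.
    std::cmp::max(memory_mb / 4, 128)
}

fn main() {
    let memory_mb = 16 * 1024; // e.g. a VM with 16 GiB of RAM
    println!("-c shared_buffers={}MB", derive_shared_buffers_mb(memory_mb));
}
```

With the duplicate hard-coded flag removed, only this derived value reaches the server, since a later `-c shared_buffers=10GB` on the command line would otherwise take effect instead.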
--- compute_tools/src/bin/fast_import.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/compute_tools/src/bin/fast_import.rs b/compute_tools/src/bin/fast_import.rs index dad15d67b7..4c8d031532 100644 --- a/compute_tools/src/bin/fast_import.rs +++ b/compute_tools/src/bin/fast_import.rs @@ -211,7 +211,6 @@ impl PostgresProcess { .args(["-p", &format!("{port}")]) .args(["-c", "wal_level=minimal"]) .args(["-c", &format!("shared_buffers={shared_buffers_mb}MB")]) - .args(["-c", "shared_buffers=10GB"]) .args(["-c", "max_wal_senders=0"]) .args(["-c", "fsync=off"]) .args(["-c", "full_page_writes=off"]) From f739773eddc2bb94f7eca7b10046e77115c7d3f9 Mon Sep 17 00:00:00 2001 From: Alexander Lakhin Date: Sun, 16 Feb 2025 06:59:52 +0200 Subject: [PATCH 486/583] Fix format of milliseconds in pytest output (#10836) ## Problem The timestamp prefix of pytest log lines contains milliseconds without leading zeros, so values of milliseconds less than 100 printed incorrectly. For example: ``` 2025-02-15 12:02:51.997 INFO [_internal.py:97] 127.0.0.1 - - ... 2025-02-15 12:02:52.4 INFO [_internal.py:97] 127.0.0.1 - - ... 2025-02-15 12:02:52.9 INFO [_internal.py:97] 127.0.0.1 - - ... 2025-02-15 12:02:52.23 INFO [_internal.py:97] 127.0.0.1 - - ... ``` ## Summary of changes Fix log_format for pytest so that milliseconds are printed with leading zeros. --- pytest.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytest.ini b/pytest.ini index 7197b078c6..237066b1f6 100644 --- a/pytest.ini +++ b/pytest.ini @@ -11,7 +11,7 @@ markers = testpaths = test_runner minversion = 6.0 -log_format = %(asctime)s.%(msecs)-3d %(levelname)s [%(filename)s:%(lineno)d] %(message)s +log_format = %(asctime)s.%(msecs)03d %(levelname)s [%(filename)s:%(lineno)d] %(message)s log_date_format = %Y-%m-%d %H:%M:%S log_cli = true timeout = 300 From d566d604cfc7e598741a2342330013c43ad3cbb6 Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Mon, 17 Feb 2025 11:43:16 +0100 Subject: [PATCH 487/583] feat(compute) add pg_duckdb extension v0.3.1 (#10829) We want to host pg_duckdb (starting with v0.3.1) on Neon. This PR replaces https://github.com/neondatabase/neon/pull/10350 which was for older pg_duckdb v0.2.0 Use cases - faster OLAP queries - access to datelake files (e.g. 
parquet) on S3 buckets from Neon PostgreSQL Because neon does not provide superuser role to neon customers we need to grant some additional permissions to neon_superuser: Note: some grants that we require are already granted to `PUBLIC` in new release of pg_duckdb [here](https://github.com/duckdb/pg_duckdb/blob/3789e4c50961c03c92b7b16776804252974f8c62/sql/pg_duckdb--0.2.0--0.3.0.sql#L1054) ```sql GRANT ALL ON FUNCTION duckdb.install_extension(TEXT) TO neon_superuser; GRANT ALL ON TABLE duckdb.extensions TO neon_superuser; GRANT ALL ON SEQUENCE duckdb.extensions_table_seq TO neon_superuser; ``` --- compute/compute-node.Dockerfile | 28 +++++++++++++++++++++++++++- compute/patches/pg_duckdb_v031.patch | 11 +++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) create mode 100644 compute/patches/pg_duckdb_v031.patch diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 30348c2b90..1236372d27 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -148,7 +148,7 @@ RUN case $DEBIAN_VERSION in \ apt install --no-install-recommends --no-install-suggests -y \ ninja-build git autoconf automake libtool build-essential bison flex libreadline-dev \ zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget ca-certificates pkg-config libssl-dev \ - libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd curl unzip \ + libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd curl unzip g++ \ $VERSION_INSTALLS \ && apt clean && rm -rf /var/lib/apt/lists/* @@ -1464,6 +1464,31 @@ RUN make release -j $(getconf _NPROCESSORS_ONLN) && \ make install -j $(getconf _NPROCESSORS_ONLN) && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_mooncake.control +######################################################################################### +# +# Layer "pg-duckdb-pg-build" +# compile pg_duckdb extension +# +######################################################################################### +FROM build-deps AS pg_duckdb-src +WORKDIR /ext-src +COPY compute/patches/pg_duckdb_v031.patch . 
+# pg_duckdb build requires source dir to be a git repo to get submodules +# allow neon_superuser to execute some functions that in pg_duckdb are available to superuser only: +# - extension management function duckdb.install_extension() +# - access to duckdb.extensions table and its sequence +RUN git clone --depth 1 --branch v0.3.1 https://github.com/duckdb/pg_duckdb.git pg_duckdb-src && \ + cd pg_duckdb-src && \ + git submodule update --init --recursive && \ + patch -p1 < /ext-src/pg_duckdb_v031.patch + +FROM pg-build AS pg_duckdb-build +ARG PG_VERSION +COPY --from=pg_duckdb-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_duckdb-src +RUN make install -j $(getconf _NPROCESSORS_ONLN) && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_duckdb.control + ######################################################################################### # # Layer "pg_repack" @@ -1577,6 +1602,7 @@ COPY --from=pg_anon-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_ivm-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_partman-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_mooncake-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_duckdb-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_repack-build /usr/local/pgsql/ /usr/local/pgsql/ ######################################################################################### diff --git a/compute/patches/pg_duckdb_v031.patch b/compute/patches/pg_duckdb_v031.patch new file mode 100644 index 0000000000..a7e188d69e --- /dev/null +++ b/compute/patches/pg_duckdb_v031.patch @@ -0,0 +1,11 @@ +diff --git a/sql/pg_duckdb--0.2.0--0.3.0.sql b/sql/pg_duckdb--0.2.0--0.3.0.sql +index d777d76..af60106 100644 +--- a/sql/pg_duckdb--0.2.0--0.3.0.sql ++++ b/sql/pg_duckdb--0.2.0--0.3.0.sql +@@ -1056,3 +1056,6 @@ GRANT ALL ON FUNCTION duckdb.cache(TEXT, TEXT) TO PUBLIC; + GRANT ALL ON FUNCTION duckdb.cache_info() TO PUBLIC; + GRANT ALL ON FUNCTION duckdb.cache_delete(TEXT) TO PUBLIC; + GRANT ALL ON PROCEDURE duckdb.recycle_ddb() TO PUBLIC; ++GRANT ALL ON FUNCTION duckdb.install_extension(TEXT) TO neon_superuser; ++GRANT ALL ON TABLE duckdb.extensions TO neon_superuser; ++GRANT ALL ON SEQUENCE duckdb.extensions_table_seq TO neon_superuser; From 81f08d304afab319556c969712531e4af813132e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 17 Feb 2025 11:44:44 +0100 Subject: [PATCH 488/583] Rebase Azure SDK and apply newest patch (#10825) The [upstream PR](https://github.com/Azure/azure-sdk-for-rust/pull/1997) has been merged with some changes to use threads with async, so apply them to the neon specific fork to be nice to the executor (before, we had the state as of filing of that PR). Also, rebase onto the latest version of upstream's `legacy` branch. 
current SDK commits: [link](https://github.com/neondatabase/azure-sdk-for-rust/commits/neon-2025-02-14) now: [link](https://github.com/neondatabase/azure-sdk-for-rust/commits/arpad/neon-refresh) Prior update was in #10790 --- Cargo.lock | 10 +++++----- Cargo.toml | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 287201b4e0..64eb53ff00 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -786,7 +786,7 @@ dependencies = [ [[package]] name = "azure_core" version = "0.21.0" -source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#c36ed4c039bb3d59b5a1705f2cc337636c73b541" +source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=arpad%2Fneon-refresh#f64bd57262ced51afce5d8909c06dcb11a6dd85a" dependencies = [ "async-trait", "base64 0.22.1", @@ -815,7 +815,7 @@ dependencies = [ [[package]] name = "azure_identity" version = "0.21.0" -source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#c36ed4c039bb3d59b5a1705f2cc337636c73b541" +source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=arpad%2Fneon-refresh#f64bd57262ced51afce5d8909c06dcb11a6dd85a" dependencies = [ "async-lock", "async-trait", @@ -834,7 +834,7 @@ dependencies = [ [[package]] name = "azure_storage" version = "0.21.0" -source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#c36ed4c039bb3d59b5a1705f2cc337636c73b541" +source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=arpad%2Fneon-refresh#f64bd57262ced51afce5d8909c06dcb11a6dd85a" dependencies = [ "RustyXML", "async-lock", @@ -852,7 +852,7 @@ dependencies = [ [[package]] name = "azure_storage_blobs" version = "0.21.0" -source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#c36ed4c039bb3d59b5a1705f2cc337636c73b541" +source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=arpad%2Fneon-refresh#f64bd57262ced51afce5d8909c06dcb11a6dd85a" dependencies = [ "RustyXML", "azure_core", @@ -872,7 +872,7 @@ dependencies = [ [[package]] name = "azure_svc_blobstorage" version = "0.21.0" -source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#c36ed4c039bb3d59b5a1705f2cc337636c73b541" +source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=arpad%2Fneon-refresh#f64bd57262ced51afce5d8909c06dcb11a6dd85a" dependencies = [ "azure_core", "bytes", diff --git a/Cargo.toml b/Cargo.toml index 7228623c6b..0ca5ae4f5a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -222,10 +222,10 @@ postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", br tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" } ## Azure SDK crates -azure_core = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "neon", default-features = false, features = ["enable_reqwest_rustls", "hmac_rust"] } -azure_identity = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "neon", default-features = false, features = ["enable_reqwest_rustls"] } -azure_storage = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "neon", default-features = false, features = ["enable_reqwest_rustls"] } -azure_storage_blobs = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "neon", default-features = false, features = ["enable_reqwest_rustls"] } +azure_core = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "arpad/neon-refresh", 
default-features = false, features = ["enable_reqwest_rustls", "hmac_rust"] } +azure_identity = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "arpad/neon-refresh", default-features = false, features = ["enable_reqwest_rustls"] } +azure_storage = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "arpad/neon-refresh", default-features = false, features = ["enable_reqwest_rustls"] } +azure_storage_blobs = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "arpad/neon-refresh", default-features = false, features = ["enable_reqwest_rustls"] } ## Local libraries compute_api = { version = "0.1", path = "./libs/compute_api/" } From 8c6d133d31ced1dc9bba9fc79a9ca2d50c636b66 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 17 Feb 2025 14:54:17 +0200 Subject: [PATCH 489/583] Fix out-of-boundaries access in addSHLL function (#10840) ## Problem See https://github.com/neondatabase/neon/issues/10839 rho(x,b) functions returns values in range [1,b+1] and addSHLL tries to store it in array of size b+1. ## Summary of changes Subtract 1 fro value returned by rho --------- Co-authored-by: Konstantin Knizhnik --- pgxn/neon/hll.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pgxn/neon/hll.c b/pgxn/neon/hll.c index 1f53c8fd36..bbaad09f5f 100644 --- a/pgxn/neon/hll.c +++ b/pgxn/neon/hll.c @@ -122,8 +122,8 @@ addSHLL(HyperLogLogState *cState, uint32 hash) index = hash >> HLL_C_BITS; /* Compute the rank of the remaining 32 - "k" (registerWidth) bits */ - count = rho(hash << HLL_BIT_WIDTH, HLL_C_BITS); - + count = rho(hash << HLL_BIT_WIDTH, HLL_C_BITS) - 1; + Assert(count <= HLL_C_BITS); cState->regs[index][count] = now; } @@ -136,7 +136,7 @@ getMaximum(const TimestampTz* reg, TimestampTz since) { if (reg[i] >= since) { - max = i; + max = i + 1; } } From 8a2d95b4b5d513996fda52b5029fedd0d0ebd47d Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 17 Feb 2025 15:41:22 +0100 Subject: [PATCH 490/583] pageserver: appease unused lint on macOS (#10846) ## Problem `SmgrOpFlushInProgress::measure()` takes a `socket_fd` argument which is only used on Linux. This causes linter warnings on macOS. Touches #10823. ## Summary of changes Add a noop use of `socket_fd` on non-Linux branch. --- pageserver/src/metrics.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 0ffd4e851a..16ca4683ad 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1489,6 +1489,7 @@ impl SmgrOpFlushInProgress { } #[cfg(not(target_os = "linux"))] { + _ = socket_fd; // appease unused lint on macOS (-1, -1) } }; From 0330b617291f6ad6459a406a5b1a6217fcc587ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 17 Feb 2025 15:59:01 +0100 Subject: [PATCH 491/583] Azure SDK: use neon branch again (#10844) Originally I wanted to switch back to the `neon` branch before merging #10825, but I forgot to do it. Do it in a separate PR now. No actual change of the source code, only changes the branch name (so that maybe in a few weeks we can delete the temporary branch `arpad/neon-rebase`). 
--- Cargo.lock | 10 +++++----- Cargo.toml | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 64eb53ff00..4f75fa5733 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -786,7 +786,7 @@ dependencies = [ [[package]] name = "azure_core" version = "0.21.0" -source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=arpad%2Fneon-refresh#f64bd57262ced51afce5d8909c06dcb11a6dd85a" +source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#f64bd57262ced51afce5d8909c06dcb11a6dd85a" dependencies = [ "async-trait", "base64 0.22.1", @@ -815,7 +815,7 @@ dependencies = [ [[package]] name = "azure_identity" version = "0.21.0" -source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=arpad%2Fneon-refresh#f64bd57262ced51afce5d8909c06dcb11a6dd85a" +source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#f64bd57262ced51afce5d8909c06dcb11a6dd85a" dependencies = [ "async-lock", "async-trait", @@ -834,7 +834,7 @@ dependencies = [ [[package]] name = "azure_storage" version = "0.21.0" -source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=arpad%2Fneon-refresh#f64bd57262ced51afce5d8909c06dcb11a6dd85a" +source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#f64bd57262ced51afce5d8909c06dcb11a6dd85a" dependencies = [ "RustyXML", "async-lock", @@ -852,7 +852,7 @@ dependencies = [ [[package]] name = "azure_storage_blobs" version = "0.21.0" -source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=arpad%2Fneon-refresh#f64bd57262ced51afce5d8909c06dcb11a6dd85a" +source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#f64bd57262ced51afce5d8909c06dcb11a6dd85a" dependencies = [ "RustyXML", "azure_core", @@ -872,7 +872,7 @@ dependencies = [ [[package]] name = "azure_svc_blobstorage" version = "0.21.0" -source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=arpad%2Fneon-refresh#f64bd57262ced51afce5d8909c06dcb11a6dd85a" +source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#f64bd57262ced51afce5d8909c06dcb11a6dd85a" dependencies = [ "azure_core", "bytes", diff --git a/Cargo.toml b/Cargo.toml index 0ca5ae4f5a..7228623c6b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -222,10 +222,10 @@ postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", br tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" } ## Azure SDK crates -azure_core = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "arpad/neon-refresh", default-features = false, features = ["enable_reqwest_rustls", "hmac_rust"] } -azure_identity = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "arpad/neon-refresh", default-features = false, features = ["enable_reqwest_rustls"] } -azure_storage = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "arpad/neon-refresh", default-features = false, features = ["enable_reqwest_rustls"] } -azure_storage_blobs = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "arpad/neon-refresh", default-features = false, features = ["enable_reqwest_rustls"] } +azure_core = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "neon", default-features = false, features = ["enable_reqwest_rustls", "hmac_rust"] } +azure_identity = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "neon", 
default-features = false, features = ["enable_reqwest_rustls"] } +azure_storage = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "neon", default-features = false, features = ["enable_reqwest_rustls"] } +azure_storage_blobs = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "neon", default-features = false, features = ["enable_reqwest_rustls"] } ## Local libraries compute_api = { version = "0.1", path = "./libs/compute_api/" } From 39d42d846ae387c1ba8f5ab2432b48bd412360b6 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 17 Feb 2025 15:04:47 +0000 Subject: [PATCH 492/583] pageserver_api: fix decoding old-version TimelineInfo (#10845) ## Problem In #10707 some new fields were introduced in TimelineInfo. I forgot that we do not only use TimelineInfo for encoding, but also decoding when the storage controller calls into a pageserver, so this broke some calls from controller to pageserver while in a mixed-version state. ## Summary of changes - Make new fields have default behavior so that they are optional --- libs/pageserver_api/src/models.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 426222a531..3d40cfe121 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -1144,6 +1144,7 @@ pub struct TimelineInfo { /// The LSN up to which GC has advanced: older data may still exist but it is not available for clients. /// This LSN is not suitable for deciding where to create branches etc: use [`TimelineInfo::min_readable_lsn`] instead, /// as it is easier to reason about. + #[serde(default)] pub applied_gc_cutoff_lsn: Lsn, /// The upper bound of data which is either already GC'ed, or elegible to be GC'ed at any time based on PITR interval. @@ -1152,6 +1153,7 @@ pub struct TimelineInfo { /// /// Note that holders of valid LSN leases may be able to create branches and read pages earlier /// than this LSN, but new leases may not be taken out earlier than this LSN. + #[serde(default)] pub min_readable_lsn: Lsn, pub disk_consistent_lsn: Lsn, From da79cc5eeee225986f1a12cb1a9dbeb6315d88ad Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Mon, 17 Feb 2025 09:40:43 -0600 Subject: [PATCH 493/583] Add neon.extension_server_{connect,request}_timeout (#10801) Instead of hardcoding the request timeout, let's make it configurable as a PGC_SUSET GUC. Additionally, add a connect timeout GUC. Although the extension server runs on the compute, it is always best to keep operations from hanging. Better to present a timeout error to the user than a stuck backend. 
Signed-off-by: Tristan Partin --- pgxn/neon/extension_server.c | 39 +++++++++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 9 deletions(-) diff --git a/pgxn/neon/extension_server.c b/pgxn/neon/extension_server.c index 6e558c433a..0331f961b4 100644 --- a/pgxn/neon/extension_server.c +++ b/pgxn/neon/extension_server.c @@ -18,6 +18,8 @@ #include "neon_utils.h" static int extension_server_port = 0; +static int extension_server_request_timeout = 60; +static int extension_server_connect_timeout = 60; static download_extension_file_hook_type prev_download_extension_file_hook = NULL; @@ -34,19 +36,18 @@ static download_extension_file_hook_type prev_download_extension_file_hook = NUL static bool neon_download_extension_file_http(const char *filename, bool is_library) { - static CURL *handle = NULL; - CURLcode res; - char *compute_ctl_url; bool ret = false; + CURL *handle = NULL; + char *compute_ctl_url; - if (handle == NULL) - { - handle = alloc_curl_handle(); + handle = alloc_curl_handle(); - curl_easy_setopt(handle, CURLOPT_CUSTOMREQUEST, "POST"); - curl_easy_setopt(handle, CURLOPT_TIMEOUT, 60L /* seconds */ ); - } + curl_easy_setopt(handle, CURLOPT_CUSTOMREQUEST, "POST"); + if (extension_server_request_timeout > 0) + curl_easy_setopt(handle, CURLOPT_TIMEOUT, (long)extension_server_request_timeout /* seconds */ ); + if (extension_server_connect_timeout > 0) + curl_easy_setopt(handle, CURLOPT_CONNECTTIMEOUT, (long)extension_server_connect_timeout /* seconds */ ); compute_ctl_url = psprintf("http://localhost:%d/extension_server/%s%s", extension_server_port, filename, is_library ? "?is_library=true" : ""); @@ -57,6 +58,8 @@ neon_download_extension_file_http(const char *filename, bool is_library) /* Perform the request, res will get the return code */ res = curl_easy_perform(handle); + curl_easy_cleanup(handle); + /* Check for errors */ if (res == CURLE_OK) { @@ -88,6 +91,24 @@ pg_init_extension_server() 0, /* no flags required */ NULL, NULL, NULL); + DefineCustomIntVariable("neon.extension_server_request_timeout", + "timeout for fetching extensions in seconds", + NULL, + &extension_server_request_timeout, + 60, 0, INT_MAX, + PGC_SUSET, + GUC_UNIT_S, + NULL, NULL, NULL); + + DefineCustomIntVariable("neon.extension_server_connect_timeout", + "timeout for connecting to the extension server in seconds", + NULL, + &extension_server_connect_timeout, + 60, 0, INT_MAX, + PGC_SUSET, + GUC_UNIT_S, + NULL, NULL, NULL); + /* set download_extension_file_hook */ prev_download_extension_file_hook = download_extension_file_hook; download_extension_file_hook = neon_download_extension_file_http; From 3204efc860bcd6e849733cc7759b6742e6df8d8e Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 17 Feb 2025 16:19:57 +0000 Subject: [PATCH 494/583] chore(proxy): use specially named prepared statements for type-checking (#10843) I was looking into https://github.com/neondatabase/serverless/issues/144, I recall previous cases where proxy would trigger these prepared statements which would conflict with other statements prepared by our client downstream. 
Because of that, and also to aid in debugging, I've made sure all prepared statements that proxy needs to make have specific names that likely won't conflict and makes it clear in a error log if it's our statements that are causing issues --- libs/proxy/tokio-postgres2/src/client.rs | 98 +++---------------- .../tokio-postgres2/src/generic_client.rs | 9 +- libs/proxy/tokio-postgres2/src/lib.rs | 2 - libs/proxy/tokio-postgres2/src/prepare.rs | 48 ++------- libs/proxy/tokio-postgres2/src/query.rs | 43 -------- libs/proxy/tokio-postgres2/src/statement.rs | 10 +- .../proxy/tokio-postgres2/src/to_statement.rs | 57 ----------- proxy/src/serverless/backend.rs | 2 +- proxy/src/serverless/local_conn_pool.rs | 11 +-- 9 files changed, 36 insertions(+), 244 deletions(-) delete mode 100644 libs/proxy/tokio-postgres2/src/to_statement.rs diff --git a/libs/proxy/tokio-postgres2/src/client.rs b/libs/proxy/tokio-postgres2/src/client.rs index 9bbbd4c260..46151ab924 100644 --- a/libs/proxy/tokio-postgres2/src/client.rs +++ b/libs/proxy/tokio-postgres2/src/client.rs @@ -10,8 +10,8 @@ use crate::simple_query::SimpleQueryStream; use crate::types::{Oid, ToSql, Type}; use crate::{ - prepare, query, simple_query, slice_iter, CancelToken, Error, ReadyForQueryStatus, Row, - SimpleQueryMessage, Statement, ToStatement, Transaction, TransactionBuilder, + query, simple_query, slice_iter, CancelToken, Error, ReadyForQueryStatus, Row, + SimpleQueryMessage, Statement, Transaction, TransactionBuilder, }; use bytes::BytesMut; use fallible_iterator::FallibleIterator; @@ -54,18 +54,18 @@ impl Responses { } /// A cache of type info and prepared statements for fetching type info -/// (corresponding to the queries in the [prepare] module). +/// (corresponding to the queries in the [crate::prepare] module). #[derive(Default)] struct CachedTypeInfo { /// A statement for basic information for a type from its - /// OID. Corresponds to [TYPEINFO_QUERY](prepare::TYPEINFO_QUERY) (or its + /// OID. Corresponds to [TYPEINFO_QUERY](crate::prepare::TYPEINFO_QUERY) (or its /// fallback). typeinfo: Option, /// A statement for getting information for a composite type from its OID. - /// Corresponds to [TYPEINFO_QUERY](prepare::TYPEINFO_COMPOSITE_QUERY). + /// Corresponds to [TYPEINFO_QUERY](crate::prepare::TYPEINFO_COMPOSITE_QUERY). typeinfo_composite: Option, /// A statement for getting information for a composite type from its OID. - /// Corresponds to [TYPEINFO_QUERY](prepare::TYPEINFO_COMPOSITE_QUERY) (or + /// Corresponds to [TYPEINFO_QUERY](crate::prepare::TYPEINFO_COMPOSITE_QUERY) (or /// its fallback). typeinfo_enum: Option, @@ -190,26 +190,6 @@ impl Client { &self.inner } - /// Creates a new prepared statement. - /// - /// Prepared statements can be executed repeatedly, and may contain query parameters (indicated by `$1`, `$2`, etc), - /// which are set when executed. Prepared statements can only be used with the connection that created them. - pub async fn prepare(&self, query: &str) -> Result { - self.prepare_typed(query, &[]).await - } - - /// Like `prepare`, but allows the types of query parameters to be explicitly specified. - /// - /// The list of types may be smaller than the number of parameters - the types of the remaining parameters will be - /// inferred. For example, `client.prepare_typed(query, &[])` is equivalent to `client.prepare(query)`. 
- pub async fn prepare_typed( - &self, - query: &str, - parameter_types: &[Type], - ) -> Result { - prepare::prepare(&self.inner, query, parameter_types).await - } - /// Executes a statement, returning a vector of the resulting rows. /// /// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list @@ -222,14 +202,11 @@ impl Client { /// # Panics /// /// Panics if the number of parameters provided does not match the number expected. - pub async fn query( + pub async fn query( &self, - statement: &T, + statement: Statement, params: &[&(dyn ToSql + Sync)], - ) -> Result, Error> - where - T: ?Sized + ToStatement, - { + ) -> Result, Error> { self.query_raw(statement, slice_iter(params)) .await? .try_collect() @@ -250,13 +227,15 @@ impl Client { /// Panics if the number of parameters provided does not match the number expected. /// /// [`query`]: #method.query - pub async fn query_raw<'a, T, I>(&self, statement: &T, params: I) -> Result + pub async fn query_raw<'a, I>( + &self, + statement: Statement, + params: I, + ) -> Result where - T: ?Sized + ToStatement, I: IntoIterator, I::IntoIter: ExactSizeIterator, { - let statement = statement.__convert().into_statement(self).await?; query::query(&self.inner, statement, params).await } @@ -271,55 +250,6 @@ impl Client { query::query_txt(&self.inner, statement, params).await } - /// Executes a statement, returning the number of rows modified. - /// - /// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list - /// provided, 1-indexed. - /// - /// The `statement` argument can either be a `Statement`, or a raw query string. If the same statement will be - /// repeatedly executed (perhaps with different query parameters), consider preparing the statement up front - /// with the `prepare` method. - /// - /// If the statement does not modify any rows (e.g. `SELECT`), 0 is returned. - /// - /// # Panics - /// - /// Panics if the number of parameters provided does not match the number expected. - pub async fn execute( - &self, - statement: &T, - params: &[&(dyn ToSql + Sync)], - ) -> Result - where - T: ?Sized + ToStatement, - { - self.execute_raw(statement, slice_iter(params)).await - } - - /// The maximally flexible version of [`execute`]. - /// - /// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list - /// provided, 1-indexed. - /// - /// The `statement` argument can either be a `Statement`, or a raw query string. If the same statement will be - /// repeatedly executed (perhaps with different query parameters), consider preparing the statement up front - /// with the `prepare` method. - /// - /// # Panics - /// - /// Panics if the number of parameters provided does not match the number expected. - /// - /// [`execute`]: #method.execute - pub async fn execute_raw<'a, T, I>(&self, statement: &T, params: I) -> Result - where - T: ?Sized + ToStatement, - I: IntoIterator, - I::IntoIter: ExactSizeIterator, - { - let statement = statement.__convert().into_statement(self).await?; - query::execute(self.inner(), statement, params).await - } - /// Executes a sequence of SQL statements using the simple query protocol, returning the resulting rows. /// /// Statements should be separated by semicolons. 
If an error occurs, execution of the sequence will stop at that diff --git a/libs/proxy/tokio-postgres2/src/generic_client.rs b/libs/proxy/tokio-postgres2/src/generic_client.rs index 768213f8ed..042b5a675e 100644 --- a/libs/proxy/tokio-postgres2/src/generic_client.rs +++ b/libs/proxy/tokio-postgres2/src/generic_client.rs @@ -1,7 +1,8 @@ +#![allow(async_fn_in_trait)] + use crate::query::RowStream; use crate::types::Type; use crate::{Client, Error, Transaction}; -use async_trait::async_trait; use postgres_protocol2::Oid; mod private { @@ -11,7 +12,6 @@ mod private { /// A trait allowing abstraction over connections and transactions. /// /// This trait is "sealed", and cannot be implemented outside of this crate. -#[async_trait] pub trait GenericClient: private::Sealed { /// Like `Client::query_raw_txt`. async fn query_raw_txt(&self, statement: &str, params: I) -> Result @@ -26,7 +26,6 @@ pub trait GenericClient: private::Sealed { impl private::Sealed for Client {} -#[async_trait] impl GenericClient for Client { async fn query_raw_txt(&self, statement: &str, params: I) -> Result where @@ -39,14 +38,12 @@ impl GenericClient for Client { /// Query for type information async fn get_type(&self, oid: Oid) -> Result { - self.get_type(oid).await + crate::prepare::get_type(self.inner(), oid).await } } impl private::Sealed for Transaction<'_> {} -#[async_trait] -#[allow(clippy::needless_lifetimes)] impl GenericClient for Transaction<'_> { async fn query_raw_txt(&self, statement: &str, params: I) -> Result where diff --git a/libs/proxy/tokio-postgres2/src/lib.rs b/libs/proxy/tokio-postgres2/src/lib.rs index 9155dd8279..7426279167 100644 --- a/libs/proxy/tokio-postgres2/src/lib.rs +++ b/libs/proxy/tokio-postgres2/src/lib.rs @@ -14,7 +14,6 @@ pub use crate::row::{Row, SimpleQueryRow}; pub use crate::simple_query::SimpleQueryStream; pub use crate::statement::{Column, Statement}; pub use crate::tls::NoTls; -pub use crate::to_statement::ToStatement; pub use crate::transaction::Transaction; pub use crate::transaction_builder::{IsolationLevel, TransactionBuilder}; use crate::types::ToSql; @@ -65,7 +64,6 @@ pub mod row; mod simple_query; mod statement; pub mod tls; -mod to_statement; mod transaction; mod transaction_builder; pub mod types; diff --git a/libs/proxy/tokio-postgres2/src/prepare.rs b/libs/proxy/tokio-postgres2/src/prepare.rs index da0c755c5b..58bbb26cbc 100644 --- a/libs/proxy/tokio-postgres2/src/prepare.rs +++ b/libs/proxy/tokio-postgres2/src/prepare.rs @@ -1,7 +1,6 @@ use crate::client::InnerClient; use crate::codec::FrontendMessage; use crate::connection::RequestMessages; -use crate::error::SqlState; use crate::types::{Field, Kind, Oid, Type}; use crate::{query, slice_iter}; use crate::{Column, Error, Statement}; @@ -13,7 +12,6 @@ use postgres_protocol2::message::backend::Message; use postgres_protocol2::message::frontend; use std::future::Future; use std::pin::Pin; -use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; pub(crate) const TYPEINFO_QUERY: &str = "\ @@ -24,14 +22,6 @@ INNER JOIN pg_catalog.pg_namespace n ON t.typnamespace = n.oid WHERE t.oid = $1 "; -// Range types weren't added until Postgres 9.2, so pg_range may not exist -const TYPEINFO_FALLBACK_QUERY: &str = "\ -SELECT t.typname, t.typtype, t.typelem, NULL::OID, t.typbasetype, n.nspname, t.typrelid -FROM pg_catalog.pg_type t -INNER JOIN pg_catalog.pg_namespace n ON t.typnamespace = n.oid -WHERE t.oid = $1 -"; - const TYPEINFO_ENUM_QUERY: &str = "\ SELECT enumlabel FROM pg_catalog.pg_enum @@ -39,14 +29,6 @@ WHERE 
enumtypid = $1 ORDER BY enumsortorder "; -// Postgres 9.0 didn't have enumsortorder -const TYPEINFO_ENUM_FALLBACK_QUERY: &str = "\ -SELECT enumlabel -FROM pg_catalog.pg_enum -WHERE enumtypid = $1 -ORDER BY oid -"; - pub(crate) const TYPEINFO_COMPOSITE_QUERY: &str = "\ SELECT attname, atttypid FROM pg_catalog.pg_attribute @@ -56,15 +38,13 @@ AND attnum > 0 ORDER BY attnum "; -static NEXT_ID: AtomicUsize = AtomicUsize::new(0); - pub async fn prepare( client: &Arc, + name: &'static str, query: &str, types: &[Type], ) -> Result { - let name = format!("s{}", NEXT_ID.fetch_add(1, Ordering::SeqCst)); - let buf = encode(client, &name, query, types)?; + let buf = encode(client, name, query, types)?; let mut responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?; match responses.next().await? { @@ -105,10 +85,11 @@ pub async fn prepare( fn prepare_rec<'a>( client: &'a Arc, + name: &'static str, query: &'a str, types: &'a [Type], ) -> Pin> + 'a + Send>> { - Box::pin(prepare(client, query, types)) + Box::pin(prepare(client, name, query, types)) } fn encode(client: &InnerClient, name: &str, query: &str, types: &[Type]) -> Result { @@ -192,13 +173,8 @@ async fn typeinfo_statement(client: &Arc) -> Result stmt, - Err(ref e) if e.code() == Some(&SqlState::UNDEFINED_TABLE) => { - prepare_rec(client, TYPEINFO_FALLBACK_QUERY, &[]).await? - } - Err(e) => return Err(e), - }; + let typeinfo = "neon_proxy_typeinfo"; + let stmt = prepare_rec(client, typeinfo, TYPEINFO_QUERY, &[]).await?; client.set_typeinfo(&stmt); Ok(stmt) @@ -219,13 +195,8 @@ async fn typeinfo_enum_statement(client: &Arc) -> Result stmt, - Err(ref e) if e.code() == Some(&SqlState::UNDEFINED_COLUMN) => { - prepare_rec(client, TYPEINFO_ENUM_FALLBACK_QUERY, &[]).await? - } - Err(e) => return Err(e), - }; + let typeinfo = "neon_proxy_typeinfo_enum"; + let stmt = prepare_rec(client, typeinfo, TYPEINFO_ENUM_QUERY, &[]).await?; client.set_typeinfo_enum(&stmt); Ok(stmt) @@ -255,7 +226,8 @@ async fn typeinfo_composite_statement(client: &Arc) -> Result( - client: &InnerClient, - statement: Statement, - params: I, -) -> Result -where - I: IntoIterator, - I::IntoIter: ExactSizeIterator, -{ - let buf = if log_enabled!(Level::Debug) { - let params = params.into_iter().collect::>(); - debug!( - "executing statement {} with parameters: {:?}", - statement.name(), - BorrowToSqlParamsDebug(params.as_slice()), - ); - encode(client, &statement, params)? - } else { - encode(client, &statement, params)? - }; - let mut responses = start(client, buf).await?; - - let mut rows = 0; - loop { - match responses.next().await? { - Message::DataRow(_) => {} - Message::CommandComplete(body) => { - rows = body - .tag() - .map_err(Error::parse)? 
- .rsplit(' ') - .next() - .unwrap() - .parse() - .unwrap_or(0); - } - Message::EmptyQueryResponse => rows = 0, - Message::ReadyForQuery(_) => return Ok(rows), - _ => return Err(Error::unexpected_message()), - } - } -} - async fn start(client: &InnerClient, buf: Bytes) -> Result { let mut responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?; diff --git a/libs/proxy/tokio-postgres2/src/statement.rs b/libs/proxy/tokio-postgres2/src/statement.rs index 22e160fc05..591872fbc5 100644 --- a/libs/proxy/tokio-postgres2/src/statement.rs +++ b/libs/proxy/tokio-postgres2/src/statement.rs @@ -13,7 +13,7 @@ use std::{ struct StatementInner { client: Weak, - name: String, + name: &'static str, params: Vec, columns: Vec, } @@ -22,7 +22,7 @@ impl Drop for StatementInner { fn drop(&mut self) { if let Some(client) = self.client.upgrade() { let buf = client.with_buf(|buf| { - frontend::close(b'S', &self.name, buf).unwrap(); + frontend::close(b'S', self.name, buf).unwrap(); frontend::sync(buf); buf.split().freeze() }); @@ -40,7 +40,7 @@ pub struct Statement(Arc); impl Statement { pub(crate) fn new( inner: &Arc, - name: String, + name: &'static str, params: Vec, columns: Vec, ) -> Statement { @@ -55,14 +55,14 @@ impl Statement { pub(crate) fn new_anonymous(params: Vec, columns: Vec) -> Statement { Statement(Arc::new(StatementInner { client: Weak::new(), - name: String::new(), + name: "", params, columns, })) } pub(crate) fn name(&self) -> &str { - &self.0.name + self.0.name } /// Returns the expected types of the statement's parameters. diff --git a/libs/proxy/tokio-postgres2/src/to_statement.rs b/libs/proxy/tokio-postgres2/src/to_statement.rs deleted file mode 100644 index 7e12992728..0000000000 --- a/libs/proxy/tokio-postgres2/src/to_statement.rs +++ /dev/null @@ -1,57 +0,0 @@ -use crate::to_statement::private::{Sealed, ToStatementType}; -use crate::Statement; - -mod private { - use crate::{Client, Error, Statement}; - - pub trait Sealed {} - - pub enum ToStatementType<'a> { - Statement(&'a Statement), - Query(&'a str), - } - - impl ToStatementType<'_> { - pub async fn into_statement(self, client: &Client) -> Result { - match self { - ToStatementType::Statement(s) => Ok(s.clone()), - ToStatementType::Query(s) => client.prepare(s).await, - } - } - } -} - -/// A trait abstracting over prepared and unprepared statements. -/// -/// Many methods are generic over this bound, so that they support both a raw query string as well as a statement which -/// was prepared previously. -/// -/// This trait is "sealed" and cannot be implemented by anything outside this crate. 
-pub trait ToStatement: Sealed { - #[doc(hidden)] - fn __convert(&self) -> ToStatementType<'_>; -} - -impl ToStatement for Statement { - fn __convert(&self) -> ToStatementType<'_> { - ToStatementType::Statement(self) - } -} - -impl Sealed for Statement {} - -impl ToStatement for str { - fn __convert(&self) -> ToStatementType<'_> { - ToStatementType::Query(self) - } -} - -impl Sealed for str {} - -impl ToStatement for String { - fn __convert(&self) -> ToStatementType<'_> { - ToStatementType::Query(self) - } -} - -impl Sealed for String {} diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 6a59d413c4..f35c375ba2 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -372,7 +372,7 @@ impl PoolingBackend { debug!("setting up backend session state"); // initiates the auth session - if let Err(e) = client.execute("select auth.init()", &[]).await { + if let Err(e) = client.batch_execute("select auth.init();").await { discard.discard(); return Err(e.into()); } diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs index fe33f0ff65..7ed514ff65 100644 --- a/proxy/src/serverless/local_conn_pool.rs +++ b/proxy/src/serverless/local_conn_pool.rs @@ -23,7 +23,6 @@ use indexmap::IndexMap; use jose_jwk::jose_b64::base64ct::{Base64UrlUnpadded, Encoding}; use parking_lot::RwLock; use postgres_client::tls::NoTlsStream; -use postgres_client::types::ToSql; use postgres_client::AsyncMessage; use serde_json::value::RawValue; use tokio::net::TcpStream; @@ -281,13 +280,9 @@ impl ClientInnerCommon { let token = resign_jwt(&local_data.key, payload, local_data.jti)?; // initiates the auth session - self.inner.batch_execute("discard all").await?; - self.inner - .execute( - "select auth.jwt_session_init($1)", - &[&&*token as &(dyn ToSql + Sync)], - ) - .await?; + // this is safe from query injections as the jwt format free of any escape characters. + let query = format!("discard all; select auth.jwt_session_init('{token}')"); + self.inner.batch_execute(&query).await?; let pid = self.inner.get_process_id(); info!(pid, jti = local_data.jti, "user session state init"); From b10890b81c5121735480f17dee244917bb575096 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Mon, 17 Feb 2025 16:32:24 +0000 Subject: [PATCH 495/583] tests: compare digests in test_peer_recovery (#10853) ## Problem Test fails when comparing the first WAL segment because the system id in the segment header is different. The system id is not consistently set correctly since segments are usually inited on the safekeeper sync step with sysid 0. ## Summary of Chnages Compare timeline digests instead. This skips the header. 
Closes https://github.com/neondatabase/neon/issues/10596 --- test_runner/regress/test_wal_acceptor.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 21b2ad479c..c5045fe4a4 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -1445,6 +1445,7 @@ def test_peer_recovery(neon_env_builder: NeonEnvBuilder): # roughly fills one segment endpoint.safe_psql("insert into t select generate_series(1,250000), 'payload'") + lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) endpoint.stop() # stop compute @@ -1473,7 +1474,15 @@ def test_peer_recovery(neon_env_builder: NeonEnvBuilder): "flush_lsn to get aligned", ) - cmp_sk_wal([sk1, sk2], tenant_id, timeline_id) + sk1_digest = sk1.http_client().timeline_digest( + tenant_id, timeline_id, sk1.get_timeline_start_lsn(tenant_id, timeline_id), lsn + ) + + sk2_digest = sk1.http_client().timeline_digest( + tenant_id, timeline_id, sk2.get_timeline_start_lsn(tenant_id, timeline_id), lsn + ) + + assert sk1_digest == sk2_digest # stop one of safekeepers which weren't recovering and insert a bit more to check we can commit env.safekeepers[2].stop() From 84bbe87d605fdd9daf0b2aff1fac7da40b43f725 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 17 Feb 2025 18:24:17 +0100 Subject: [PATCH 496/583] pageserver: tweak `pageserver_layers_per_read` histogram resolution (#10847) ## Problem The current `pageserver_layers_per_read` histogram buckets don't represent the current reality very well. For the percentiles we care about (e.g. p50 and p99), we often see fairly high read amp, especially during ingestion, and anything below 4 can be considered very good. ## Summary of changes Change the per-timeline read amp histogram buckets to `[4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0]`. --- pageserver/src/metrics.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 16ca4683ad..e1c26b0684 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -130,7 +130,7 @@ pub(crate) static LAYERS_PER_READ: Lazy = Lazy::new(|| { "Layers visited to serve a single read (read amplification). In a batch, all visited layers count towards every read.", &["tenant_id", "shard_id", "timeline_id"], // Low resolution to reduce cardinality. - vec![1.0, 5.0, 10.0, 25.0, 50.0, 100.0], + vec![4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0], ) .expect("failed to define a metric") }); From b34598516f25857969679c10ec6ebdbe0e523d55 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Mon, 17 Feb 2025 13:02:16 -0600 Subject: [PATCH 497/583] Warn when PR may require regenerating cloud PG settings (#10229) These generated Postgres settings JSON files can get out of sync causing the control plane to reject updated to an endpoint or project's Postgres settings. 
Signed-off-by: Tristan Partin --- .github/workflows/regenerate-pg-setting.yml | 41 +++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 .github/workflows/regenerate-pg-setting.yml diff --git a/.github/workflows/regenerate-pg-setting.yml b/.github/workflows/regenerate-pg-setting.yml new file mode 100644 index 0000000000..1e9d2ec5e2 --- /dev/null +++ b/.github/workflows/regenerate-pg-setting.yml @@ -0,0 +1,41 @@ +name: Regenerate Postgres Settings + +on: + pull_request: + types: + - opened + - synchronize + - reopened + paths: + - pgxn/neon/**.c + - vendor/postgres-v* + - vendor/revisions.json + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref }} + cancel-in-progress: true + +permissions: + pull-requests: write + +jobs: + regenerate-pg-settings: + runs-on: ubuntu-22.04 + + steps: + - name: Add comment + uses: thollander/actions-comment-pull-request@v3 + with: + comment-tag: ${{ github.job }} + pr-number: ${{ github.event.number }} + message: | + If this PR added a GUC in the Postgres fork or `neon` extension, + please regenerate the Postgres settings in the `cloud` repo: + + ``` + make NEON_WORKDIR=path/to/neon/checkout \ + -C goapp/internal/shareddomain/postgres generate + ``` + + If you're an external contributor, a Neon employee will assist in + making sure this step is done. From 2884917bd429a1b01e1d1f1a99cffd046a789578 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 17 Feb 2025 21:42:57 +0200 Subject: [PATCH 498/583] compute: Allow postgres user to power off the VM also on <= v16 (#10860) I did this for debian bookworm variant in PR #10710, but forgot to update the "bullseye" dockerfile that is used to build older PostgreSQL versions. --- compute/vm-image-spec-bullseye.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/compute/vm-image-spec-bullseye.yaml b/compute/vm-image-spec-bullseye.yaml index 124c40cf5d..6617c98599 100644 --- a/compute/vm-image-spec-bullseye.yaml +++ b/compute/vm-image-spec-bullseye.yaml @@ -47,7 +47,9 @@ files: # Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap # and /neonvm/bin/set-disk-quota as root without requiring entering a password (NOPASSWD), # regardless of hostname (ALL) - postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota + # + # Also allow it to shut down the VM. The fast_import job does that when it's finished. + postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff - filename: cgconfig.conf content: | # Configuration for cgroups in VM compute nodes From 811506aaa2b4f35de3415b6ba98c90200a0b1741 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 17 Feb 2025 22:07:31 +0200 Subject: [PATCH 499/583] fast_import: Use rust s3 client for uploading (#10777) This replaces the use of the awscli utility. awscli binary is massive, it added about 200 MB to the docker image size, while the s3 client was already a dependency so using that is essentially free, as far as binary size is concerned. I implemented a simple upload function that tries to keep 10 uploads going in parallel. I believe that's the default behavior of the "aws s3 sync" command too. 
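To illustrate the bounded-concurrency pattern described above, here is a minimal, self-contained sketch that keeps at most 10 uploads in flight using a `tokio::task::JoinSet`. The `upload_one` helper is a hypothetical stand-in for the real AWS SDK `PutObject` call; the actual `upload_dir_recursive` in this patch walks the directory tree with `walkdir` and may differ in details.

```rust
use tokio::task::JoinSet;

const MAX_IN_FLIGHT: usize = 10;

// Hypothetical stand-in for an `s3_client.put_object().send().await` call.
async fn upload_one(path: String) -> anyhow::Result<()> {
    println!("uploading {path}");
    Ok(())
}

async fn upload_all(paths: Vec<String>) -> anyhow::Result<()> {
    let mut tasks = JoinSet::new();
    for path in paths {
        // Once the limit is reached, wait for one upload to finish
        // before spawning the next, keeping at most 10 in flight.
        while tasks.len() >= MAX_IN_FLIGHT {
            tasks.join_next().await.expect("set is non-empty")??;
        }
        tasks.spawn(upload_one(path));
    }
    // Drain the remaining uploads and surface any error.
    while let Some(res) = tasks.join_next().await {
        res??;
    }
    Ok(())
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let paths = (0..25).map(|i| format!("pgdata/file_{i}")).collect();
    upload_all(paths).await
}
```

The design choice here is the same one the message attributes to `aws s3 sync`: a fixed window of concurrent requests keeps the network busy without spawning one task per file.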
--- Cargo.lock | 2 + compute/compute-node.Dockerfile | 26 ---- compute_tools/Cargo.toml | 2 + compute_tools/src/bin/fast_import.rs | 30 +++-- .../src/bin/fast_import/aws_s3_sync.rs | 116 +++++++++++++++--- 5 files changed, 122 insertions(+), 54 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4f75fa5733..12c12bc771 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1303,6 +1303,7 @@ dependencies = [ "aws-config", "aws-sdk-kms", "aws-sdk-s3", + "aws-smithy-types", "axum", "base64 0.13.1", "bytes", @@ -1351,6 +1352,7 @@ dependencies = [ "utils", "uuid", "vm_monitor", + "walkdir", "workspace_hack", "zstd", ] diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 1236372d27..082dea6f1b 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1695,29 +1695,6 @@ RUN if [ "$TARGETARCH" = "amd64" ]; then\ && echo "${pgbouncer_exporter_sha256} pgbouncer_exporter" | sha256sum -c -\ && echo "${sql_exporter_sha256} sql_exporter" | sha256sum -c - -######################################################################################### -# -# Layer "awscli" -# -######################################################################################### -FROM build-deps AS awscli -ARG TARGETARCH -RUN set -ex; \ - if [ "${TARGETARCH}" = "amd64" ]; then \ - TARGETARCH_ALT="x86_64"; \ - CHECKSUM="c9a9df3770a3ff9259cb469b6179e02829687a464e0824d5c32d378820b53a00"; \ - elif [ "${TARGETARCH}" = "arm64" ]; then \ - TARGETARCH_ALT="aarch64"; \ - CHECKSUM="8181730be7891582b38b028112e81b4899ca817e8c616aad807c9e9d1289223a"; \ - else \ - echo "Unsupported architecture: ${TARGETARCH}"; exit 1; \ - fi; \ - curl --retry 5 -L "https://awscli.amazonaws.com/awscli-exe-linux-${TARGETARCH_ALT}-2.17.5.zip" -o /tmp/awscliv2.zip; \ - echo "${CHECKSUM} /tmp/awscliv2.zip" | sha256sum -c -; \ - unzip /tmp/awscliv2.zip -d /tmp/awscliv2; \ - /tmp/awscliv2/aws/install; \ - rm -rf /tmp/awscliv2.zip /tmp/awscliv2 - ######################################################################################### # # Clean up postgres folder before inclusion @@ -1887,9 +1864,6 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ mkdir /usr/local/download_extensions && \ chown -R postgres:postgres /usr/local/download_extensions -# aws cli is used by fast_import -COPY --from=awscli /usr/local/aws-cli /usr/local/aws-cli - # pgbouncer and its config COPY --from=pgbouncer /usr/local/pgbouncer/bin/pgbouncer /usr/local/bin/pgbouncer COPY --chmod=0666 --chown=postgres compute/etc/pgbouncer.ini /etc/pgbouncer.ini diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index b8828fa49f..81dcf99560 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -14,6 +14,7 @@ base64.workspace = true aws-config.workspace = true aws-sdk-s3.workspace = true aws-sdk-kms.workspace = true +aws-smithy-types.workspace = true anyhow.workspace = true axum = { workspace = true, features = [] } camino.workspace = true @@ -54,6 +55,7 @@ thiserror.workspace = true url.workspace = true uuid.workspace = true prometheus.workspace = true +walkdir.workspace = true postgres_initdb.workspace = true compute_api.workspace = true diff --git a/compute_tools/src/bin/fast_import.rs b/compute_tools/src/bin/fast_import.rs index 4c8d031532..614a93f48b 100644 --- a/compute_tools/src/bin/fast_import.rs +++ b/compute_tools/src/bin/fast_import.rs @@ -421,6 +421,7 @@ async fn run_dump_restore( #[allow(clippy::too_many_arguments)] async fn cmd_pgdata( + s3_client: Option, kms_client: Option, 
maybe_s3_prefix: Option, maybe_spec: Option, @@ -488,9 +489,13 @@ async fn cmd_pgdata( // Only sync if s3_prefix was specified if let Some(s3_prefix) = maybe_s3_prefix { info!("upload pgdata"); - aws_s3_sync::sync(Utf8Path::new(&pgdata_dir), &s3_prefix.append("/pgdata/")) - .await - .context("sync dump directory to destination")?; + aws_s3_sync::upload_dir_recursive( + s3_client.as_ref().unwrap(), + Utf8Path::new(&pgdata_dir), + &s3_prefix.append("/pgdata/"), + ) + .await + .context("sync dump directory to destination")?; info!("write status"); { @@ -499,9 +504,13 @@ async fn cmd_pgdata( let status_file = status_dir.join("pgdata"); std::fs::write(&status_file, serde_json::json!({"done": true}).to_string()) .context("write status file")?; - aws_s3_sync::sync(&status_dir, &s3_prefix.append("/status/")) - .await - .context("sync status directory to destination")?; + aws_s3_sync::upload_dir_recursive( + s3_client.as_ref().unwrap(), + &status_dir, + &s3_prefix.append("/status/"), + ) + .await + .context("sync status directory to destination")?; } } @@ -573,18 +582,20 @@ pub(crate) async fn main() -> anyhow::Result<()> { let args = Args::parse(); // Initialize AWS clients only if s3_prefix is specified - let (aws_config, kms_client) = if args.s3_prefix.is_some() { + let (s3_client, kms_client) = if args.s3_prefix.is_some() { let config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await; + let s3_client = aws_sdk_s3::Client::new(&config); let kms = aws_sdk_kms::Client::new(&config); - (Some(config), Some(kms)) + (Some(s3_client), Some(kms)) } else { (None, None) }; let spec: Option = if let Some(s3_prefix) = &args.s3_prefix { let spec_key = s3_prefix.append("/spec.json"); - let s3_client = aws_sdk_s3::Client::new(aws_config.as_ref().unwrap()); let object = s3_client + .as_ref() + .unwrap() .get_object() .bucket(&spec_key.bucket) .key(spec_key.key) @@ -624,6 +635,7 @@ pub(crate) async fn main() -> anyhow::Result<()> { memory_mb, } => { cmd_pgdata( + s3_client, kms_client, args.s3_prefix, spec, diff --git a/compute_tools/src/bin/fast_import/aws_s3_sync.rs b/compute_tools/src/bin/fast_import/aws_s3_sync.rs index 5fa58c8f87..1be10b36d6 100644 --- a/compute_tools/src/bin/fast_import/aws_s3_sync.rs +++ b/compute_tools/src/bin/fast_import/aws_s3_sync.rs @@ -1,24 +1,102 @@ -use anyhow::Context; -use camino::Utf8Path; +use camino::{Utf8Path, Utf8PathBuf}; +use tokio::task::JoinSet; +use walkdir::WalkDir; use super::s3_uri::S3Uri; -pub(crate) async fn sync(local: &Utf8Path, remote: &S3Uri) -> anyhow::Result<()> { - let mut builder = tokio::process::Command::new("aws"); - builder - .arg("s3") - .arg("sync") - .arg(local.as_str()) - .arg(remote.to_string()); - let st = builder - .spawn() - .context("spawn aws s3 sync")? 
- .wait() - .await - .context("wait for aws s3 sync")?; - if st.success() { - Ok(()) - } else { - Err(anyhow::anyhow!("aws s3 sync failed")) +use tracing::{info, warn}; + +const MAX_PARALLEL_UPLOADS: usize = 10; + +/// Upload all files from 'local' to 'remote' +pub(crate) async fn upload_dir_recursive( + s3_client: &aws_sdk_s3::Client, + local: &Utf8Path, + remote: &S3Uri, +) -> anyhow::Result<()> { + // Recursively scan directory + let mut dirwalker = WalkDir::new(local) + .into_iter() + .map(|entry| { + let entry = entry?; + let file_type = entry.file_type(); + let path = <&Utf8Path>::try_from(entry.path())?.to_path_buf(); + Ok((file_type, path)) + }) + .filter_map(|e: anyhow::Result<(std::fs::FileType, Utf8PathBuf)>| { + match e { + Ok((file_type, path)) if file_type.is_file() => Some(Ok(path)), + Ok((file_type, _path)) if file_type.is_dir() => { + // The WalkDir iterator will recurse into directories, but we don't want + // to do anything with directories as such. There's no concept of uploading + // an empty directory to S3. + None + } + Ok((file_type, path)) if file_type.is_symlink() => { + // huh, didn't expect a symlink. Can't upload that to S3. Warn and skip. + warn!("cannot upload symlink ({})", path); + None + } + Ok((_file_type, path)) => { + // should not happen + warn!("directory entry has unexpected type ({})", path); + None + } + Err(e) => Some(Err(e)), + } + }); + + // Spawn upload tasks for each file, keeping MAX_PARALLEL_UPLOADS active in + // parallel. + let mut joinset = JoinSet::new(); + loop { + // Could we upload more? + while joinset.len() < MAX_PARALLEL_UPLOADS { + if let Some(full_local_path) = dirwalker.next() { + let full_local_path = full_local_path?; + let relative_local_path = full_local_path + .strip_prefix(local) + .expect("all paths start from the walkdir root"); + let remote_path = remote.append(relative_local_path.as_str()); + info!( + "starting upload of {} to {}", + &full_local_path, &remote_path + ); + let upload_task = upload_file(s3_client.clone(), full_local_path, remote_path); + joinset.spawn(upload_task); + } else { + info!("draining upload tasks"); + break; + } + } + + // Wait for an upload to complete + if let Some(res) = joinset.join_next().await { + let _ = res?; + } else { + // all done! + break; + } } + Ok(()) +} + +pub(crate) async fn upload_file( + s3_client: aws_sdk_s3::Client, + local_path: Utf8PathBuf, + remote: S3Uri, +) -> anyhow::Result<()> { + use aws_smithy_types::byte_stream::ByteStream; + let stream = ByteStream::from_path(&local_path).await?; + + let _result = s3_client + .put_object() + .bucket(remote.bucket) + .key(&remote.key) + .body(stream) + .send() + .await?; + info!("upload of {} to {} finished", &local_path, &remote.key); + + Ok(()) } From 27241f039c2411910c987466def4f72c912c982e Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 17 Feb 2025 20:29:14 +0000 Subject: [PATCH 500/583] test_runner: fix `neon_local` usage for version mismatch tests (#10859) ## Problem Tests with mixed versions of binaries always pick up new versions if services are started using `neon_local`. 
## Summary of changes - Set `neon_local_binpath` along with `neon_binpath` and `pg_distrib_dir` for tests with mixed versions --- test_runner/fixtures/neon_fixtures.py | 9 ++++++++- test_runner/fixtures/utils.py | 10 +++++----- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 73607db7d8..c4d4908568 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -491,6 +491,7 @@ class NeonEnvBuilder: self.test_may_use_compatibility_snapshot_binaries = False self.version_combination = combination self.mixdir = self.test_output_dir / "mixdir_neon" + if self.version_combination is not None: assert ( self.compatibility_neon_binpath is not None @@ -702,6 +703,11 @@ class NeonEnvBuilder: def _mix_versions(self): assert self.version_combination is not None, "version combination must be set" + + # Always use a newer version of `neon_local` + (self.mixdir / "neon_local").symlink_to(self.neon_binpath / "neon_local") + self.neon_local_binpath = self.mixdir + for component, paths in COMPONENT_BINARIES.items(): directory = ( self.neon_binpath @@ -711,9 +717,10 @@ class NeonEnvBuilder: for filename in paths: destination = self.mixdir / filename destination.symlink_to(directory / filename) + self.neon_binpath = self.mixdir + if self.version_combination["compute"] == "old": self.pg_distrib_dir = self.compatibility_pg_distrib_dir - self.neon_binpath = self.mixdir def overlay_mount(self, ident: str, srcdir: Path, dstdir: Path): """ diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index e160c617cd..71b2de4f65 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -52,11 +52,11 @@ COMPONENT_BINARIES = { # Disable auto-formatting for better readability # fmt: off VERSIONS_COMBINATIONS = ( - {"storage_controller": "new", "storage_broker": "new", "compute": "new", "safekeeper": "new", "pageserver": "new"}, - {"storage_controller": "new", "storage_broker": "new", "compute": "old", "safekeeper": "old", "pageserver": "old"}, - {"storage_controller": "new", "storage_broker": "new", "compute": "old", "safekeeper": "old", "pageserver": "new"}, - {"storage_controller": "new", "storage_broker": "new", "compute": "old", "safekeeper": "new", "pageserver": "new"}, - {"storage_controller": "old", "storage_broker": "old", "compute": "new", "safekeeper": "new", "pageserver": "new"}, + {"storage_controller": "new", "storage_broker": "new", "compute": "new", "safekeeper": "new", "pageserver": "new"}, # combination: nnnnn + {"storage_controller": "new", "storage_broker": "new", "compute": "old", "safekeeper": "old", "pageserver": "old"}, # combination: ooonn + {"storage_controller": "new", "storage_broker": "new", "compute": "old", "safekeeper": "old", "pageserver": "new"}, # combination: ononn + {"storage_controller": "new", "storage_broker": "new", "compute": "old", "safekeeper": "new", "pageserver": "new"}, # combination: onnnn + {"storage_controller": "old", "storage_broker": "old", "compute": "new", "safekeeper": "new", "pageserver": "new"}, # combination: nnnoo ) # fmt: on From 719ec378cdf3b5454ed4b991b78bc1ad4de382ba Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Tue, 18 Feb 2025 08:54:20 +0000 Subject: [PATCH 501/583] fix(local_proxy): discard all in tx (#10864) ## Problem `discard all` cannot run in a transaction (even if implicit) ## Summary of changes Split up the query into two, we don't need transaction support. 
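Background on why the combined form breaks: in the simple-query protocol a multi-statement string runs inside a single implicit transaction, and Postgres rejects `DISCARD ALL` inside a transaction block. A minimal sketch of the before/after, using `tokio_postgres` for illustration (the proxy's own client exposes an equivalent `batch_execute`; `client` and `token` are assumed inputs):

```
async fn init_auth_session(
    client: &tokio_postgres::Client,
    token: &str,
) -> Result<(), tokio_postgres::Error> {
    // Before (fails): both statements share one implicit transaction, so
    // `DISCARD ALL` errors with "cannot run inside a transaction block".
    // client
    //     .batch_execute(&format!("discard all; select auth.jwt_session_init('{token}')"))
    //     .await?;

    // After (works): two separate batches, no transaction needed.
    client.batch_execute("discard all").await?;
    client
        .batch_execute(&format!("select auth.jwt_session_init('{token}')"))
        .await?;
    Ok(())
}
```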
--- proxy/src/serverless/local_conn_pool.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs index 7ed514ff65..137a2d6377 100644 --- a/proxy/src/serverless/local_conn_pool.rs +++ b/proxy/src/serverless/local_conn_pool.rs @@ -279,9 +279,12 @@ impl ClientInnerCommon { local_data.jti += 1; let token = resign_jwt(&local_data.key, payload, local_data.jti)?; + // discard all cannot run in a transaction. must be executed alone. + self.inner.batch_execute("discard all").await?; + // initiates the auth session // this is safe from query injections as the jwt format free of any escape characters. - let query = format!("discard all; select auth.jwt_session_init('{token}')"); + let query = format!("select auth.jwt_session_init('{token}')"); self.inner.batch_execute(&query).await?; let pid = self.inner.get_process_id(); From f81259967dacf94810ad2e883285213ebca00969 Mon Sep 17 00:00:00 2001 From: Alexander Lakhin Date: Tue, 18 Feb 2025 15:23:18 +0200 Subject: [PATCH 502/583] Add test to make sure sanitizers really work when expected (#10838) --- test_runner/fixtures/utils.py | 2 ++ test_runner/regress/test_endpoint_crash.py | 19 +++++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 71b2de4f65..2a59eab710 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -64,6 +64,8 @@ VERSIONS_COMBINATIONS = ( # If it is not set or set to a value not equal to "false", LFC is enabled by default. USE_LFC = os.environ.get("USE_LFC") != "false" +WITH_SANITIZERS = os.environ.get("SANITIZERS") == "enabled" + def subprocess_capture( capture_dir: Path, diff --git a/test_runner/regress/test_endpoint_crash.py b/test_runner/regress/test_endpoint_crash.py index 0217cd0d03..03bfd1cb8d 100644 --- a/test_runner/regress/test_endpoint_crash.py +++ b/test_runner/regress/test_endpoint_crash.py @@ -2,6 +2,8 @@ from __future__ import annotations import pytest from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.pg_version import PgVersion +from fixtures.utils import WITH_SANITIZERS, run_only_on_postgres @pytest.mark.parametrize( @@ -23,3 +25,20 @@ def test_endpoint_crash(neon_env_builder: NeonEnvBuilder, sql_func: str): endpoint.safe_psql("CREATE EXTENSION neon_test_utils;") with pytest.raises(Exception, match="This probably means the server terminated abnormally"): endpoint.safe_psql(f"SELECT {sql_func}();") + + +@run_only_on_postgres([PgVersion.V17], "Currently, build vith sanitizers is possible with v17 only") +def test_sanitizers(neon_env_builder: NeonEnvBuilder): + """ + Test that undefined behavior leads to endpoint abort with sanitizers enabled + """ + env = neon_env_builder.init_start() + env.create_branch("test_ubsan") + endpoint = env.endpoints.create_start("test_ubsan") + + # Test case based on https://www.postgresql.org/message-id/17167-028026e4ca333817@postgresql.org + if not WITH_SANITIZERS: + endpoint.safe_psql("SELECT 1::int4 << 128") + else: + with pytest.raises(Exception, match="This probably means the server terminated abnormally"): + endpoint.safe_psql("SELECT 1::int4 << 128") From d36baae7582a7fcebea08c7aa4f525a819f1023c Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 18 Feb 2025 16:57:12 +0300 Subject: [PATCH 503/583] Add gc_blocking and restore latest_gc_cutoff in openapi spec (#10867) ## Problem gc_blocking is missing in the tenant info, but cplane wants to use it. 
Also, https://github.com/neondatabase/neon/pull/10707/ removed latest_gc_cutoff from the spec, renaming it to applied_gc_cutoff. Temporarily get it back until cplane migrates. ## Summary of changes Add them. ref https://neondb.slack.com/archives/C03438W3FLZ/p1739877734963979 --- libs/pageserver_api/src/models.rs | 3 +-- pageserver/src/http/openapi_spec.yml | 5 +++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 3d40cfe121..dd7bea2916 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -1080,8 +1080,7 @@ pub struct TenantInfo { /// Opaque explanation if gc is being blocked. /// - /// Only looked up for the individual tenant detail, not the listing. This is purely for - /// debugging, not included in openapi. + /// Only looked up for the individual tenant detail, not the listing. #[serde(skip_serializing_if = "Option::is_none")] pub gc_blocking: Option, } diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index b8ed7aaf26..733115539a 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -882,6 +882,8 @@ components: properties: reason: type: string + gc_blocking: + type: string TenantCreateRequest: allOf: @@ -1083,6 +1085,9 @@ components: min_readable_lsn: type: string format: hex + latest_gc_cutoff_lsn: + type: string + format: hex applied_gc_cutoff_lsn: type: string format: hex From caece02da7d50c31542379a50229b488dae4d463 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 18 Feb 2025 15:02:22 +0100 Subject: [PATCH 504/583] move pull_timeline to safekeeper_api and add SafekeeperGeneration (#10863) Preparations for a successor of #10440: * move `pull_timeline` to `safekeeper_api` and add it to `SafekeeperClient`. we want to do `pull_timeline` on any creations that we couldn't do initially. * Add a `SafekeeperGeneration` type instead of relying on a type alias. we want to maintain a safekeeper specific generation number now in the storcon database. A separate type is important to make it impossible to mix it up with the tenant's pageserver specific generation number. We absolutely want to avoid that for correctness reasons. If someone mixes up a safekeeper and pageserver id (both use the `NodeId` type), that's bad but there is no wrong generations flying around. part of #9011 --- libs/safekeeper_api/src/membership.rs | 42 +++++++++++++++++--- libs/safekeeper_api/src/models.rs | 15 +++++++ libs/utils/src/bin_ser.rs | 43 +++++++++++++++++++++ safekeeper/client/src/mgmt_api.rs | 11 +++++- safekeeper/src/control_file.rs | 4 +- safekeeper/src/http/routes.rs | 3 +- safekeeper/src/pull_timeline.rs | 32 +++++---------- safekeeper/src/safekeeper.rs | 4 +- storage_controller/src/safekeeper_client.rs | 18 ++++++++- 9 files changed, 137 insertions(+), 35 deletions(-) diff --git a/libs/safekeeper_api/src/membership.rs b/libs/safekeeper_api/src/membership.rs index a39fda526f..8b14a4f290 100644 --- a/libs/safekeeper_api/src/membership.rs +++ b/libs/safekeeper_api/src/membership.rs @@ -9,13 +9,43 @@ use anyhow::bail; use serde::{Deserialize, Serialize}; use utils::id::NodeId; -/// Number uniquely identifying safekeeper configuration. -/// Note: it is a part of sk control file. -pub type Generation = u32; /// 1 is the first valid generation, 0 is used as /// a placeholder before we fully migrate to generations. 
-pub const INVALID_GENERATION: Generation = 0; -pub const INITIAL_GENERATION: Generation = 1; +pub const INVALID_GENERATION: SafekeeperGeneration = SafekeeperGeneration::new(0); +pub const INITIAL_GENERATION: SafekeeperGeneration = SafekeeperGeneration::new(1); + +/// Number uniquely identifying safekeeper configuration. +/// Note: it is a part of sk control file. +/// +/// Like tenant generations, but for safekeepers. +#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)] +pub struct SafekeeperGeneration(u32); + +impl SafekeeperGeneration { + pub const fn new(v: u32) -> Self { + Self(v) + } + + #[track_caller] + pub fn previous(&self) -> Option { + Some(Self(self.0.checked_sub(1)?)) + } + + #[track_caller] + pub fn next(&self) -> Self { + Self(self.0 + 1) + } + + pub fn into_inner(self) -> u32 { + self.0 + } +} + +impl Display for SafekeeperGeneration { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.0) + } +} /// Membership is defined by ids so e.g. walproposer uses them to figure out /// quorums, but we also carry host and port to give wp idea where to connect. @@ -89,7 +119,7 @@ impl Display for MemberSet { #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] pub struct Configuration { /// Unique id. - pub generation: Generation, + pub generation: SafekeeperGeneration, /// Current members of the configuration. pub members: MemberSet, /// Some means it is a joint conf. diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs index 30418b0efd..41ccdaa428 100644 --- a/libs/safekeeper_api/src/models.rs +++ b/libs/safekeeper_api/src/models.rs @@ -282,3 +282,18 @@ pub struct TimelineTermBumpResponse { pub struct SafekeeperUtilization { pub timeline_count: u64, } + +/// pull_timeline request body. +#[derive(Debug, Deserialize, Serialize)] +pub struct PullTimelineRequest { + pub tenant_id: TenantId, + pub timeline_id: TimelineId, + pub http_hosts: Vec, +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct PullTimelineResponse { + // Donor safekeeper host + pub safekeeper_host: String, + // TODO: add more fields? 
+} diff --git a/libs/utils/src/bin_ser.rs b/libs/utils/src/bin_ser.rs index 42b45eeea0..4d173d0726 100644 --- a/libs/utils/src/bin_ser.rs +++ b/libs/utils/src/bin_ser.rs @@ -286,6 +286,11 @@ mod tests { const SHORT2_ENC_LE: &[u8] = &[8, 0, 0, 3, 7]; const SHORT2_ENC_LE_TRAILING: &[u8] = &[8, 0, 0, 3, 7, 0xff, 0xff, 0xff]; + #[derive(Debug, PartialEq, Eq, Serialize, Deserialize)] + struct NewTypeStruct(u32); + const NT1: NewTypeStruct = NewTypeStruct(414243); + const NT1_INNER: u32 = 414243; + #[derive(Debug, PartialEq, Eq, Serialize, Deserialize)] pub struct LongMsg { pub tag: u8, @@ -408,4 +413,42 @@ mod tests { let msg2 = LongMsg::des(&encoded).unwrap(); assert_eq!(msg, msg2); } + + #[test] + /// Ensure that newtype wrappers around u32 don't change the serialization format + fn be_nt() { + use super::BeSer; + + assert_eq!(NT1.serialized_size().unwrap(), 4); + + let msg = NT1; + + let encoded = msg.ser().unwrap(); + let expected = hex_literal::hex!("0006 5223"); + assert_eq!(encoded, expected); + + assert_eq!(encoded, NT1_INNER.ser().unwrap()); + + let msg2 = NewTypeStruct::des(&encoded).unwrap(); + assert_eq!(msg, msg2); + } + + #[test] + /// Ensure that newtype wrappers around u32 don't change the serialization format + fn le_nt() { + use super::LeSer; + + assert_eq!(NT1.serialized_size().unwrap(), 4); + + let msg = NT1; + + let encoded = msg.ser().unwrap(); + let expected = hex_literal::hex!("2352 0600"); + assert_eq!(encoded, expected); + + assert_eq!(encoded, NT1_INNER.ser().unwrap()); + + let msg2 = NewTypeStruct::des(&encoded).unwrap(); + assert_eq!(msg, msg2); + } } diff --git a/safekeeper/client/src/mgmt_api.rs b/safekeeper/client/src/mgmt_api.rs index d4f47fc96d..40e5afc4aa 100644 --- a/safekeeper/client/src/mgmt_api.rs +++ b/safekeeper/client/src/mgmt_api.rs @@ -5,7 +5,10 @@ use http_utils::error::HttpErrorBody; use reqwest::{IntoUrl, Method, StatusCode}; -use safekeeper_api::models::{SafekeeperUtilization, TimelineCreateRequest, TimelineStatus}; +use safekeeper_api::models::{ + PullTimelineRequest, PullTimelineResponse, SafekeeperUtilization, TimelineCreateRequest, + TimelineStatus, +}; use std::error::Error as _; use utils::{ id::{NodeId, TenantId, TimelineId}, @@ -88,6 +91,12 @@ impl Client { resp.json().await.map_err(Error::ReceiveBody) } + pub async fn pull_timeline(&self, req: &PullTimelineRequest) -> Result { + let uri = format!("{}/v1/pull_timeline", self.mgmt_api_endpoint); + let resp = self.post(&uri, req).await?; + resp.json().await.map_err(Error::ReceiveBody) + } + pub async fn delete_timeline( &self, tenant_id: TenantId, diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index e92ca881e1..35aebfd8ad 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -235,7 +235,7 @@ impl Storage for FileStorage { #[cfg(test)] mod test { use super::*; - use safekeeper_api::membership::{Configuration, MemberSet}; + use safekeeper_api::membership::{Configuration, MemberSet, SafekeeperGeneration}; use tokio::fs; use utils::lsn::Lsn; @@ -246,7 +246,7 @@ mod test { let tempdir = camino_tempfile::tempdir()?; let mut state = TimelinePersistentState::empty(); state.mconf = Configuration { - generation: 42, + generation: SafekeeperGeneration::new(42), members: MemberSet::empty(), new_members: None, }; diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 41e30d838a..cd2ac5f44c 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -2,6 +2,7 @@ use 
http_utils::failpoints::failpoints_handler; use hyper::{Body, Request, Response, StatusCode}; use safekeeper_api::models; use safekeeper_api::models::AcceptorStateStatus; +use safekeeper_api::models::PullTimelineRequest; use safekeeper_api::models::SafekeeperStatus; use safekeeper_api::models::TermSwitchApiEntry; use safekeeper_api::models::TimelineStatus; @@ -230,7 +231,7 @@ async fn timeline_delete_handler(mut request: Request) -> Result) -> Result, ApiError> { check_permission(&request, None)?; - let data: pull_timeline::Request = json_request(&mut request).await?; + let data: PullTimelineRequest = json_request(&mut request).await?; let conf = get_conf(&request); let global_timelines = get_global_timelines(&request); diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index f2d8e4c85f..4827b73074 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -4,10 +4,13 @@ use camino::Utf8PathBuf; use chrono::{DateTime, Utc}; use futures::{SinkExt, StreamExt, TryStreamExt}; use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI}; -use safekeeper_api::{models::TimelineStatus, Term}; +use safekeeper_api::{ + models::{PullTimelineRequest, PullTimelineResponse, TimelineStatus}, + Term, +}; use safekeeper_client::mgmt_api; use safekeeper_client::mgmt_api::Client; -use serde::{Deserialize, Serialize}; +use serde::Deserialize; use std::{ cmp::min, io::{self, ErrorKind}, @@ -33,7 +36,7 @@ use crate::{ }; use utils::{ crashsafe::fsync_async_opt, - id::{NodeId, TenantId, TenantTimelineId, TimelineId}, + id::{NodeId, TenantTimelineId}, logging::SecretString, lsn::Lsn, pausable_failpoint, @@ -378,21 +381,6 @@ impl WalResidentTimeline { } } -/// pull_timeline request body. -#[derive(Debug, Deserialize)] -pub struct Request { - pub tenant_id: TenantId, - pub timeline_id: TimelineId, - pub http_hosts: Vec, -} - -#[derive(Debug, Serialize)] -pub struct Response { - // Donor safekeeper host - pub safekeeper_host: String, - // TODO: add more fields? -} - /// Response for debug dump request. #[derive(Debug, Deserialize)] pub struct DebugDumpResponse { @@ -405,10 +393,10 @@ pub struct DebugDumpResponse { /// Find the most advanced safekeeper and pull timeline from it. 
pub async fn handle_request( - request: Request, + request: PullTimelineRequest, sk_auth_token: Option, global_timelines: Arc, -) -> Result { +) -> Result { let existing_tli = global_timelines.get(TenantTimelineId::new( request.tenant_id, request.timeline_id, @@ -460,7 +448,7 @@ async fn pull_timeline( host: String, sk_auth_token: Option, global_timelines: Arc, -) -> Result { +) -> Result { let ttid = TenantTimelineId::new(status.tenant_id, status.timeline_id); info!( "pulling timeline {} from safekeeper {}, commit_lsn={}, flush_lsn={}, term={}, epoch={}", @@ -535,7 +523,7 @@ async fn pull_timeline( .load_temp_timeline(ttid, &tli_dir_path, false) .await?; - Ok(Response { + Ok(PullTimelineResponse { safekeeper_host: host, }) } diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 45e19c31b6..f816f8459a 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -1004,7 +1004,7 @@ mod tests { use postgres_ffi::{XLogSegNo, WAL_SEGMENT_SIZE}; use safekeeper_api::{ - membership::{Configuration, MemberSet, SafekeeperId}, + membership::{Configuration, MemberSet, SafekeeperGeneration, SafekeeperId}, ServerInfo, }; @@ -1303,7 +1303,7 @@ mod tests { tenant_id, timeline_id, mconf: Configuration { - generation: 42, + generation: SafekeeperGeneration::new(42), members: MemberSet::new(vec![SafekeeperId { id: NodeId(1), host: "hehe.org".to_owned(), diff --git a/storage_controller/src/safekeeper_client.rs b/storage_controller/src/safekeeper_client.rs index bb494f20fa..f234ab3429 100644 --- a/storage_controller/src/safekeeper_client.rs +++ b/storage_controller/src/safekeeper_client.rs @@ -1,5 +1,8 @@ use crate::metrics::PageserverRequestLabelGroup; -use safekeeper_api::models::{SafekeeperUtilization, TimelineCreateRequest, TimelineStatus}; +use safekeeper_api::models::{ + PullTimelineRequest, PullTimelineResponse, SafekeeperUtilization, TimelineCreateRequest, + TimelineStatus, +}; use safekeeper_client::mgmt_api::{Client, Result}; use utils::{ id::{NodeId, TenantId, TimelineId}, @@ -94,6 +97,19 @@ impl SafekeeperClient { ) } + #[allow(dead_code)] + pub(crate) async fn pull_timeline( + &self, + req: &PullTimelineRequest, + ) -> Result { + measured_request!( + "pull_timeline", + crate::metrics::Method::Post, + &self.node_id_label, + self.inner.pull_timeline(req).await + ) + } + pub(crate) async fn get_utilization(&self) -> Result { measured_request!( "utilization", From 29e4ca351ee12c97756123420e7ce4540fbee047 Mon Sep 17 00:00:00 2001 From: Alexander Lakhin Date: Tue, 18 Feb 2025 17:41:20 +0200 Subject: [PATCH 505/583] Pass asan/ubsan options to pg_dump/pg_restore started by fast_import (#10866) --- compute_tools/src/bin/fast_import.rs | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/compute_tools/src/bin/fast_import.rs b/compute_tools/src/bin/fast_import.rs index 614a93f48b..585f3e4e1d 100644 --- a/compute_tools/src/bin/fast_import.rs +++ b/compute_tools/src/bin/fast_import.rs @@ -361,6 +361,14 @@ async fn run_dump_restore( // how we run it .env_clear() .env("LD_LIBRARY_PATH", &pg_lib_dir) + .env( + "ASAN_OPTIONS", + std::env::var("ASAN_OPTIONS").unwrap_or_default(), + ) + .env( + "UBSAN_OPTIONS", + std::env::var("UBSAN_OPTIONS").unwrap_or_default(), + ) .kill_on_drop(true) .stdout(std::process::Stdio::piped()) .stderr(std::process::Stdio::piped()) @@ -394,6 +402,14 @@ async fn run_dump_restore( // how we run it .env_clear() .env("LD_LIBRARY_PATH", &pg_lib_dir) + .env( + "ASAN_OPTIONS", + std::env::var("ASAN_OPTIONS").unwrap_or_default(), + ) + 
.env( + "UBSAN_OPTIONS", + std::env::var("UBSAN_OPTIONS").unwrap_or_default(), + ) .kill_on_drop(true) .stdout(std::process::Stdio::piped()) .stderr(std::process::Stdio::piped()) From 290f007b8ea9ceb243ff536dfabfdcb847980743 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Tue, 18 Feb 2025 10:43:33 -0500 Subject: [PATCH 506/583] Revert "feat(pageserver): repartition on L0-L1 boundary (#10548)" (#10870) This reverts commit 443c8d0b4bfead651ebbbade5dcb49c6cba00ee6. ## Problem We observe a massive amount of compaction errors. ## Summary of changes If the tenant did not write any L1 layers (i.e., they accumulate L0 layers where number of them is below L0 threshold), image creation will always fail. Therefore, it's not correct to simply use the disk_consistent_lsn or L0/L1 boundary for the image creation. --- pageserver/src/tenant.rs | 18 +- pageserver/src/tenant/timeline/compaction.rs | 156 ++++++++---------- .../regress/test_layers_from_future.py | 3 - 3 files changed, 69 insertions(+), 108 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 5a2c5c0c46..bab1a02527 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -7846,18 +7846,6 @@ mod tests { } tline.freeze_and_flush().await?; - // Force layers to L1 - tline - .compact( - &cancel, - { - let mut flags = EnumSet::new(); - flags.insert(CompactFlags::ForceL0Compaction); - flags - }, - &ctx, - ) - .await?; if iter % 5 == 0 { let (_, before_delta_file_accessed) = @@ -7870,7 +7858,6 @@ mod tests { let mut flags = EnumSet::new(); flags.insert(CompactFlags::ForceImageLayerCreation); flags.insert(CompactFlags::ForceRepartition); - flags.insert(CompactFlags::ForceL0Compaction); flags }, &ctx, @@ -8317,8 +8304,6 @@ mod tests { let cancel = CancellationToken::new(); - // Image layer creation happens on the disk_consistent_lsn so we need to force set it now. - tline.force_set_disk_consistent_lsn(Lsn(0x40)); tline .compact( &cancel, @@ -8332,7 +8317,8 @@ mod tests { ) .await .unwrap(); - // Image layers are created at repartition LSN + + // Image layers are created at last_record_lsn let images = tline .inspect_image_layers(Lsn(0x40), &ctx, io_concurrency.clone()) .await diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 6931f360a4..e1e3eabb90 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -692,21 +692,6 @@ impl Timeline { // Define partitioning schema if needed - let l0_l1_boundary_lsn = { - // We do the repartition on the L0-L1 boundary. All data below the boundary - // are compacted by L0 with low read amplification, thus making the `repartition` - // function run fast. - let guard = self.layers.read().await; - let l0_min_lsn = guard - .layer_map()? - .level0_deltas() - .iter() - .map(|l| l.get_lsn_range().start) - .min() - .unwrap_or(self.get_disk_consistent_lsn()); - l0_min_lsn.max(self.get_ancestor_lsn()) - }; - // 1. L0 Compact let l0_outcome = { let timer = self.metrics.compact_time_histo.start_timer(); @@ -733,86 +718,79 @@ impl Timeline { return Ok(CompactionOutcome::YieldForL0); } - if l0_l1_boundary_lsn < self.partitioning.read().1 { - // We never go backwards when repartition and create image layers. - info!("skipping image layer generation because repartition LSN is greater than L0-L1 boundary LSN."); - } else { - // 2. 
Repartition and create image layers if necessary - match self - .repartition( - l0_l1_boundary_lsn, - self.get_compaction_target_size(), - options.flags, - ctx, - ) - .await - { - Ok(((dense_partitioning, sparse_partitioning), lsn)) => { - // Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them - let image_ctx = RequestContextBuilder::extend(ctx) - .access_stats_behavior(AccessStatsBehavior::Skip) - .build(); + // 2. Repartition and create image layers if necessary + match self + .repartition( + self.get_last_record_lsn(), + self.get_compaction_target_size(), + options.flags, + ctx, + ) + .await + { + Ok(((dense_partitioning, sparse_partitioning), lsn)) => { + // Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them + let image_ctx = RequestContextBuilder::extend(ctx) + .access_stats_behavior(AccessStatsBehavior::Skip) + .build(); - let mut partitioning = dense_partitioning; - partitioning - .parts - .extend(sparse_partitioning.into_dense().parts); + let mut partitioning = dense_partitioning; + partitioning + .parts + .extend(sparse_partitioning.into_dense().parts); - // 3. Create new image layers for partitions that have been modified "enough". - let (image_layers, outcome) = self - .create_image_layers( - &partitioning, - lsn, - if options - .flags - .contains(CompactFlags::ForceImageLayerCreation) - { - ImageLayerCreationMode::Force - } else { - ImageLayerCreationMode::Try - }, - &image_ctx, - self.last_image_layer_creation_status - .load() - .as_ref() - .clone(), - !options.flags.contains(CompactFlags::NoYield), - ) - .await - .inspect_err(|err| { - if let CreateImageLayersError::GetVectoredError( - GetVectoredError::MissingKey(_), - ) = err - { - critical!("missing key during compaction: {err:?}"); - } - })?; + // 3. Create new image layers for partitions that have been modified "enough". + let (image_layers, outcome) = self + .create_image_layers( + &partitioning, + lsn, + if options + .flags + .contains(CompactFlags::ForceImageLayerCreation) + { + ImageLayerCreationMode::Force + } else { + ImageLayerCreationMode::Try + }, + &image_ctx, + self.last_image_layer_creation_status + .load() + .as_ref() + .clone(), + !options.flags.contains(CompactFlags::NoYield), + ) + .await + .inspect_err(|err| { + if let CreateImageLayersError::GetVectoredError( + GetVectoredError::MissingKey(_), + ) = err + { + critical!("missing key during compaction: {err:?}"); + } + })?; - self.last_image_layer_creation_status - .store(Arc::new(outcome.clone())); + self.last_image_layer_creation_status + .store(Arc::new(outcome.clone())); - self.upload_new_image_layers(image_layers)?; - if let LastImageLayerCreationStatus::Incomplete { .. } = outcome { - // Yield and do not do any other kind of compaction. - info!("skipping shard ancestor compaction due to pending image layer generation tasks (preempted by L0 compaction)."); - return Ok(CompactionOutcome::YieldForL0); - } + self.upload_new_image_layers(image_layers)?; + if let LastImageLayerCreationStatus::Incomplete { .. } = outcome { + // Yield and do not do any other kind of compaction. + info!("skipping shard ancestor compaction due to pending image layer generation tasks (preempted by L0 compaction)."); + return Ok(CompactionOutcome::YieldForL0); } - Err(err) => { - // no partitioning? This is normal, if the timeline was just created - // as an empty timeline. 
Also in unit tests, when we use the timeline - // as a simple key-value store, ignoring the datadir layout. Log the - // error but continue. - // - // Suppress error when it's due to cancellation - if !self.cancel.is_cancelled() && !err.is_cancelled() { - tracing::error!( - "could not compact, repartitioning keyspace failed: {err:?}" - ); - } + } + Err(err) => { + // no partitioning? This is normal, if the timeline was just created + // as an empty timeline. Also in unit tests, when we use the timeline + // as a simple key-value store, ignoring the datadir layout. Log the + // error but continue. + // + // Suppress error when it's due to cancellation + if !self.cancel.is_cancelled() && !err.is_cancelled() { + tracing::error!("could not compact, repartitioning keyspace failed: {err:?}"); } - }; - } + } + }; let partition_count = self.partitioning.read().0 .0.parts.len(); diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py index 3ac4ed1a3e..872d3dc4cf 100644 --- a/test_runner/regress/test_layers_from_future.py +++ b/test_runner/regress/test_layers_from_future.py @@ -20,9 +20,6 @@ from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind from fixtures.utils import query_scalar, wait_until -@pytest.mark.skip( - reason="We won't create future layers any more after https://github.com/neondatabase/neon/pull/10548" -) @pytest.mark.parametrize( "attach_mode", ["default_generation", "same_generation"], From 274cb13293f20e7206a5a6a88022c67838cd759f Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 18 Feb 2025 15:52:00 +0000 Subject: [PATCH 507/583] test_runner: fix mismatch versions tests on linux (#10869) ## Problem Tests with mixed-version binaries always use the latest binaries on CI ([an example](https://neon-github-public-dev.s3.amazonaws.com/reports/pr-10848/13378137061/index.html#suites/8fc5d1648d2225380766afde7c428d81/1ccefc4cfd4ef176/)): The versions of new `storage_broker` and old `pageserver` are the same: `b45254a5605f6fdafdf475cdd3e920fe00898543`. This affects only Linux, on macOS the version mixed correctly. ## Summary of changes - Use hardlinks instead of symlinks to create a directory with mixed-version binaries --- test_runner/fixtures/neon_fixtures.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index c4d4908568..db81e54c49 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -705,7 +705,7 @@ class NeonEnvBuilder: assert self.version_combination is not None, "version combination must be set" # Always use a newer version of `neon_local` - (self.mixdir / "neon_local").symlink_to(self.neon_binpath / "neon_local") + (self.mixdir / "neon_local").hardlink_to(self.neon_binpath / "neon_local") self.neon_local_binpath = self.mixdir for component, paths in COMPONENT_BINARIES.items(): @@ -716,7 +716,7 @@ class NeonEnvBuilder: ) for filename in paths: destination = self.mixdir / filename - destination.symlink_to(directory / filename) + destination.hardlink_to(directory / filename) self.neon_binpath = self.mixdir if self.version_combination["compute"] == "old": From f36ec5c84b06c2f930ce63e131e546df8a6c09cd Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 18 Feb 2025 17:56:43 +0200 Subject: [PATCH 508/583] chore(compute): Postgres 17.4, 16.8, 15.12 and 14.17 (#10868) Update all minor versions. No conflicts. 
Postgres repository PRs: - https://github.com/neondatabase/postgres/pull/584 - https://github.com/neondatabase/postgres/pull/583 - https://github.com/neondatabase/postgres/pull/582 - https://github.com/neondatabase/postgres/pull/581 --- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/postgres-v17 | 2 +- vendor/revisions.json | 16 ++++++++-------- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 62a86dfc91..6254ab9b44 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 62a86dfc91e0c35a72f2ea5e99e6969b830c0c26 +Subproject commit 6254ab9b4496c3e481bc037ae69d859bbc2bdd7d diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 80ed91ce25..81e2eef061 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 80ed91ce255c765d25be0bb4a02c942fe6311fbf +Subproject commit 81e2eef0616c65c2233c75b06f25766ae4c080c4 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 999cf81b10..9422247c58 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 999cf81b101ead40e597d5cd729458d8200f4537 +Subproject commit 9422247c582e7c1a08a4855d04af0874f8df2f34 diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index 4d3a722312..a8fea8b4be 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit 4d3a722312b496ff7378156caa6d41c2e70c30e4 +Subproject commit a8fea8b4be43039f0782347c88a9b9b25f50c9d8 diff --git a/vendor/revisions.json b/vendor/revisions.json index 888f09124e..72d97d7f6a 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,18 +1,18 @@ { "v17": [ - "17.3", - "4d3a722312b496ff7378156caa6d41c2e70c30e4" + "17.4", + "a8fea8b4be43039f0782347c88a9b9b25f50c9d8" ], "v16": [ - "16.7", - "999cf81b101ead40e597d5cd729458d8200f4537" + "16.8", + "9422247c582e7c1a08a4855d04af0874f8df2f34" ], "v15": [ - "15.11", - "80ed91ce255c765d25be0bb4a02c942fe6311fbf" + "15.12", + "81e2eef0616c65c2233c75b06f25766ae4c080c4" ], "v14": [ - "14.16", - "62a86dfc91e0c35a72f2ea5e99e6969b830c0c26" + "14.17", + "6254ab9b4496c3e481bc037ae69d859bbc2bdd7d" ] } From f9a063e2e9b75a60bea9d3a523497ae6992f8b50 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Tue, 18 Feb 2025 11:06:20 -0500 Subject: [PATCH 509/583] test(pageserver): fix test_pageserver_gc_compaction_idempotent (#10833) ## Problem ref https://github.com/neondatabase/neon/issues/10517 ## Summary of changes For some reasons the job split algorithm decides to have different image coverage range for two compactions before/after restart. So we remove the subcompaction key range and let it generate an image covering the full range, which should make the test more stable. Also slightly tuned the logging span. 
--------- Signed-off-by: Alex Chi Z --- pageserver/src/tenant.rs | 3 +++ pageserver/src/tenant/timeline/compaction.rs | 10 ++------- test_runner/regress/test_compaction.py | 22 +++++--------------- 3 files changed, 10 insertions(+), 25 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index bab1a02527..5d917da574 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3101,6 +3101,9 @@ impl Tenant { if let Some(queue) = queue { outcome = queue .iteration(cancel, ctx, &self.gc_block, &timeline) + .instrument( + info_span!("gc_compact_timeline", timeline_id = %timeline.timeline_id), + ) .await?; } } diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index e1e3eabb90..9e082d74b5 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -301,18 +301,12 @@ impl GcCompactionQueue { let mut guard = self.inner.lock().unwrap(); guard.gc_guards.insert(id, gc_guard); } - let _ = timeline - .compact_with_options(cancel, options, ctx) - .instrument(info_span!("scheduled_compact_timeline", %timeline.timeline_id)) - .await?; + let _ = timeline.compact_with_options(cancel, options, ctx).await?; self.notify_and_unblock(id); } } GcCompactionQueueItem::SubCompactionJob(options) => { - let _ = timeline - .compact_with_options(cancel, options, ctx) - .instrument(info_span!("scheduled_compact_timeline", %timeline.timeline_id)) - .await?; + let _ = timeline.compact_with_options(cancel, options, ctx).await?; } GcCompactionQueueItem::Notify(id) => { self.notify_and_unblock(id); diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index f10872590c..c091cd0869 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -236,9 +236,7 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder, with_b wait_until(compaction_finished, timeout=60) # ensure gc_compaction is scheduled and it's actually running (instead of skipping due to no layers picked) - env.pageserver.assert_log_contains( - "scheduled_compact_timeline.*picked .* layers for compaction" - ) + env.pageserver.assert_log_contains("gc_compact_timeline.*picked .* layers for compaction") log.info("Validating at workload end ...") workload.validate(env.pageserver.id) @@ -300,6 +298,8 @@ def test_pageserver_gc_compaction_idempotent( workload.churn_rows(row_count, env.pageserver.id) env.create_branch("child_branch") # so that we have a retain_lsn workload.churn_rows(row_count, env.pageserver.id) + env.create_branch("child_branch_2") # so that we have another retain_lsn + workload.churn_rows(row_count, env.pageserver.id) # compact 3 times if mode is before_restart n_compactions = 3 if compaction_mode == "before_restart" else 1 ps_http.timeline_compact( @@ -315,10 +315,6 @@ def test_pageserver_gc_compaction_idempotent( body={ "scheduled": True, "sub_compaction": True, - "compact_key_range": { - "start": "000000000000000000000000000000000000", - "end": "030000000000000000000000000000000000", - }, "sub_compaction_max_job_size_mb": 16, }, ) @@ -336,19 +332,13 @@ def test_pageserver_gc_compaction_idempotent( body={ "scheduled": True, "sub_compaction": True, - "compact_key_range": { - "start": "000000000000000000000000000000000000", - "end": "030000000000000000000000000000000000", - }, "sub_compaction_max_job_size_mb": 16, }, ) wait_until(compaction_finished, timeout=60) # ensure gc_compaction is scheduled and it's actually 
running (instead of skipping due to no layers picked) - env.pageserver.assert_log_contains( - "scheduled_compact_timeline.*picked .* layers for compaction" - ) + env.pageserver.assert_log_contains("gc_compact_timeline.*picked .* layers for compaction") # ensure we hit the duplicated layer key warning at least once: we did two compactions consecutively, # and the second one should have hit the duplicated layer key warning. @@ -466,9 +456,7 @@ def test_pageserver_gc_compaction_interrupt(neon_env_builder: NeonEnvBuilder): wait_until(compaction_finished, timeout=60) # ensure gc_compaction is scheduled and it's actually running (instead of skipping due to no layers picked) - env.pageserver.assert_log_contains( - "scheduled_compact_timeline.*picked .* layers for compaction" - ) + env.pageserver.assert_log_contains("gc_compact_timeline.*picked .* layers for compaction") log.info("Validating at workload end ...") workload.validate(env.pageserver.id) From ed98f6d57e9b1baab39f4ab25372193294d60bf7 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Tue, 18 Feb 2025 11:06:39 -0500 Subject: [PATCH 510/583] feat(pageserver): log lease request (#10832) ## Problem To investigate https://github.com/neondatabase/cloud/issues/23650 ## Summary of changes We log lease requests to see why there are clients accessing things below gc_cutoff. Signed-off-by: Alex Chi Z --- pageserver/src/page_service.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 53a6a7124d..0c8da6f2a8 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1799,6 +1799,13 @@ impl PageServerHandler { .as_millis() .to_string() }); + + info!( + "acquired lease for {} until {}", + lsn, + valid_until_str.as_deref().unwrap_or("") + ); + let bytes = valid_until_str.as_ref().map(|x| x.as_bytes()); pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor::text_col( From 1a69a8cba71a1f0d8cfaabf9bd4daf880b10ee8f Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 18 Feb 2025 16:09:06 +0000 Subject: [PATCH 511/583] storage: add APIs for warming up location after cold migrations (#10788) ## Problem We lack an API for warming up attached locations based on the heatmap contents. This is problematic in two places: 1. If we manually migrate and cut over while the secondary is still cold 2. When we re-attach a previously offloaded tenant ## Summary of changes https://github.com/neondatabase/neon/pull/10597 made heatmap generation additive across migrations, so we won't clobber it a after a cold migration. This allows us to implement: 1. An endpoint for downloading all missing heatmap layers on the pageserver: `/v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_heatmap_layers`. Only one such operation per timeline is allowed at any given time. The granularity is tenant shard. 2. An endpoint to the storage controller to trigger the downloads on the pageserver: `/v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_heatmap_layers`. This works both at tenant and tenant shard level. If an unsharded tenant id is provided, the operation is started on all shards, otherwise only the specified shard. 3. A storcon cli command. Again, tenant and tenant-shard level granularities are supported. Cplane will call into storcon and trigger the downloads for all shards. When we want to rescue a migration, we will use storcon cli targeting the specific tenant shard. 
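As a rough usage sketch (the endpoint path, the 202 response and the DELETE counterpart come from this patch; the base URL and the absence of auth are assumptions for illustration), triggering the download on a pageserver looks like the following; the storage controller variant accepts the same path and fans out to all shards when given an unsharded tenant id:

```
async fn trigger_heatmap_layer_download(
    http: &reqwest::Client,
    base: &str, // e.g. "http://127.0.0.1:9898" (assumed local pageserver)
    tenant_shard_id: &str,
    timeline_id: &str,
    concurrency: usize,
) -> anyhow::Result<()> {
    let url = format!(
        "{base}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/download_heatmap_layers?concurrency={concurrency}"
    );
    // The handler replies 202 Accepted once the background download task is spawned;
    // a DELETE on the same path stops and drains any in-flight downloads.
    let resp = http.post(url).send().await?;
    anyhow::ensure!(resp.status().is_success(), "unexpected status: {}", resp.status());
    Ok(())
}
```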
Related: https://github.com/neondatabase/neon/issues/10541 --- control_plane/storcon_cli/src/main.rs | 33 +++- libs/utils/src/shard.rs | 4 + pageserver/client/src/mgmt_api.rs | 20 +++ pageserver/src/http/openapi_spec.yml | 32 ++++ pageserver/src/http/routes.rs | 61 +++++++ pageserver/src/tenant/timeline.rs | 12 ++ .../timeline/heatmap_layers_downloader.rs | 162 ++++++++++++++++++ storage_controller/src/http.rs | 28 +++ storage_controller/src/pageserver_client.rs | 16 ++ storage_controller/src/service.rs | 56 ++++++ test_runner/fixtures/neon_fixtures.py | 8 + .../regress/test_pageserver_secondary.py | 20 ++- 12 files changed, 446 insertions(+), 6 deletions(-) create mode 100644 pageserver/src/tenant/timeline/heatmap_layers_downloader.rs diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 83faf6b4af..3c574efc63 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -22,7 +22,7 @@ use pageserver_api::{ }; use pageserver_client::mgmt_api::{self}; use reqwest::{Method, StatusCode, Url}; -use utils::id::{NodeId, TenantId}; +use utils::id::{NodeId, TenantId, TimelineId}; use pageserver_api::controller_api::{ NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy, @@ -239,6 +239,19 @@ enum Command { #[arg(long)] scheduling_policy: SkSchedulingPolicyArg, }, + /// Downloads any missing heatmap layers for all shard for a given timeline + DownloadHeatmapLayers { + /// Tenant ID or tenant shard ID. When an unsharded tenant ID is specified, + /// the operation is performed on all shards. When a sharded tenant ID is + /// specified, the operation is only performed on the specified shard. + #[arg(long)] + tenant_shard_id: TenantShardId, + #[arg(long)] + timeline_id: TimelineId, + /// Optional: Maximum download concurrency (default is 16) + #[arg(long)] + concurrency: Option, + }, } #[derive(Parser)] @@ -1247,6 +1260,24 @@ async fn main() -> anyhow::Result<()> { String::from(scheduling_policy) ); } + Command::DownloadHeatmapLayers { + tenant_shard_id, + timeline_id, + concurrency, + } => { + let mut path = format!( + "/v1/tenant/{}/timeline/{}/download_heatmap_layers", + tenant_shard_id, timeline_id, + ); + + if let Some(c) = concurrency { + path = format!("{path}?concurrency={c}"); + } + + storcon_client + .dispatch::<(), ()>(Method::POST, path, None) + .await?; + } } Ok(()) diff --git a/libs/utils/src/shard.rs b/libs/utils/src/shard.rs index 6352ea9f92..d98284f969 100644 --- a/libs/utils/src/shard.rs +++ b/libs/utils/src/shard.rs @@ -117,6 +117,10 @@ impl TenantShardId { ) } + pub fn range(&self) -> RangeInclusive { + RangeInclusive::new(*self, *self) + } + pub fn shard_slug(&self) -> impl std::fmt::Display + '_ { ShardSlug(self) } diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index da7ec5abce..bb0f64ca32 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -477,6 +477,26 @@ impl Client { self.request(Method::POST, &uri, ()).await.map(|_| ()) } + pub async fn timeline_download_heatmap_layers( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + concurrency: Option, + ) -> Result<()> { + let mut path = reqwest::Url::parse(&format!( + "{}/v1/tenant/{}/timeline/{}/download_heatmap_layers", + self.mgmt_api_endpoint, tenant_shard_id, timeline_id + )) + .expect("Cannot build URL"); + + if let Some(concurrency) = concurrency { + path.query_pairs_mut() + .append_pair("concurrency", &format!("{}", 
concurrency)); + } + + self.request(Method::POST, path, ()).await.map(|_| ()) + } + pub async fn tenant_reset(&self, tenant_shard_id: TenantShardId) -> Result<()> { let uri = format!( "{}/v1/tenant/{}/reset", diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 733115539a..12252739fd 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -824,6 +824,38 @@ paths: schema: $ref: "#/components/schemas/TenantConfigResponse" + /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/download_heatmap_layers: + parameters: + - name: tenant_shard_id + in: path + required: true + schema: + type: string + - name: timeline_id + in: path + required: true + schema: + type: string + - name: concurrency + description: Maximum number of concurrent downloads (capped at remote storage concurrency) + in: query + required: false + schema: + type: integer + post: + description: | + Download all layers in the specified timeline's heatmap. The `tenant_shard_id` parameter + may be used to target all shards of a tenant when the unsharded form is used, or a specific + tenant shard with the sharded form. + responses: + "200": + description: Success + delete: + description: Stop any on-going background downloads of heatmap layers for the specified timeline. + responses: + "200": + description: Success + /v1/utilization: get: description: | diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index a0c639a16d..329bf82bde 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1463,6 +1463,59 @@ async fn timeline_layer_scan_disposable_keys( ) } +async fn timeline_download_heatmap_layers_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + // Only used in the case where remote storage is not configured. + const DEFAULT_MAX_CONCURRENCY: usize = 100; + // A conservative default. 
+ const DEFAULT_CONCURRENCY: usize = 16; + + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + + let desired_concurrency = + parse_query_param(&request, "concurrency")?.unwrap_or(DEFAULT_CONCURRENCY); + + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + + let state = get_state(&request); + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; + + let max_concurrency = get_config(&request) + .remote_storage_config + .as_ref() + .map(|c| c.concurrency_limit()) + .unwrap_or(DEFAULT_MAX_CONCURRENCY); + let concurrency = std::cmp::min(max_concurrency, desired_concurrency); + + timeline.start_heatmap_layers_download(concurrency).await?; + + json_response(StatusCode::ACCEPTED, ()) +} + +async fn timeline_shutdown_download_heatmap_layers_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + + let state = get_state(&request); + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; + + timeline.stop_and_drain_heatmap_layers_download().await; + + json_response(StatusCode::OK, ()) +} + async fn layer_download_handler( request: Request, _cancel: CancellationToken, @@ -3626,6 +3679,14 @@ pub fn make_router( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer", |r| api_handler(r, layer_map_info_handler), ) + .post( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_heatmap_layers", + |r| api_handler(r, timeline_download_heatmap_layers_handler), + ) + .delete( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_heatmap_layers", + |r| api_handler(r, timeline_shutdown_download_heatmap_layers_handler), + ) .get( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_file_name", |r| api_handler(r, layer_download_handler), diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 277dce7761..94b4abb7e9 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4,6 +4,7 @@ pub mod delete; pub(crate) mod detach_ancestor; mod eviction_task; pub(crate) mod handle; +mod heatmap_layers_downloader; pub(crate) mod import_pgdata; mod init; pub mod layer_manager; @@ -467,6 +468,10 @@ pub struct Timeline { pub(crate) page_trace: ArcSwapOption>, previous_heatmap: ArcSwapOption, + + /// May host a background Tokio task which downloads all the layers from the current + /// heatmap on demand. + heatmap_layers_downloader: Mutex>, } pub(crate) enum PreviousHeatmap { @@ -2039,6 +2044,11 @@ impl Timeline { tracing::debug!("Cancelling CancellationToken"); self.cancel.cancel(); + // If we have a background task downloading heatmap layers stop it. + // The background downloads are sensitive to timeline cancellation (done above), + // so the drain will be immediate. + self.stop_and_drain_heatmap_layers_download().await; + // Ensure Prevent new page service requests from starting. 
self.handles.shutdown(); @@ -2752,6 +2762,8 @@ impl Timeline { page_trace: Default::default(), previous_heatmap: ArcSwapOption::from_pointee(previous_heatmap), + + heatmap_layers_downloader: Mutex::new(None), }; result.repartition_threshold = diff --git a/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs b/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs new file mode 100644 index 0000000000..0ba9753e85 --- /dev/null +++ b/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs @@ -0,0 +1,162 @@ +//! Timeline utility module to hydrate everything from the current heatmap. +//! +//! Provides utilities to spawn and abort a background task where the downloads happen. +//! See /v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_heatmap_layers. + +use futures::StreamExt; +use http_utils::error::ApiError; +use std::sync::{Arc, Mutex}; +use tokio_util::sync::CancellationToken; +use utils::sync::gate::Gate; + +use super::Timeline; + +// This status is not strictly necessary now, but gives us a nice place +// to store progress information if we ever wish to expose it. +pub(super) enum HeatmapLayersDownloadStatus { + InProgress, + Complete, +} + +pub(super) struct HeatmapLayersDownloader { + handle: tokio::task::JoinHandle<()>, + status: Arc>, + cancel: CancellationToken, + downloads_guard: Arc, +} + +impl HeatmapLayersDownloader { + fn new( + timeline: Arc, + concurrency: usize, + ) -> Result { + let tl_guard = timeline.gate.enter().map_err(|_| ApiError::Cancelled)?; + + let cancel = timeline.cancel.child_token(); + let downloads_guard = Arc::new(Gate::default()); + + let status = Arc::new(Mutex::new(HeatmapLayersDownloadStatus::InProgress)); + + let handle = tokio::task::spawn({ + let status = status.clone(); + let downloads_guard = downloads_guard.clone(); + let cancel = cancel.clone(); + + async move { + let _guard = tl_guard; + + scopeguard::defer! { + *status.lock().unwrap() = HeatmapLayersDownloadStatus::Complete; + } + + let Some(heatmap) = timeline.generate_heatmap().await else { + tracing::info!("Heatmap layers download failed to generate heatmap"); + return; + }; + + tracing::info!( + resident_size=%timeline.resident_physical_size(), + heatmap_layers=%heatmap.layers.len(), + "Starting heatmap layers download" + ); + + let stream = futures::stream::iter(heatmap.layers.into_iter().filter_map( + |layer| { + let tl = timeline.clone(); + let dl_guard = match downloads_guard.enter() { + Ok(g) => g, + Err(_) => { + // [`Self::shutdown`] was called. Don't spawn any more downloads. + return None; + } + }; + + Some(async move { + let _dl_guard = dl_guard; + + let res = tl.download_layer(&layer.name).await; + if let Err(err) = res { + if !err.is_cancelled() { + tracing::warn!(layer=%layer.name,"Failed to download heatmap layer: {err}") + } + } + }) + } + )).buffered(concurrency); + + tokio::select! { + _ = stream.collect::<()>() => { + tracing::info!( + resident_size=%timeline.resident_physical_size(), + "Heatmap layers download completed" + ); + }, + _ = cancel.cancelled() => { + tracing::info!("Heatmap layers download cancelled"); + } + } + } + }); + + Ok(Self { + status, + handle, + cancel, + downloads_guard, + }) + } + + fn is_complete(&self) -> bool { + matches!( + *self.status.lock().unwrap(), + HeatmapLayersDownloadStatus::Complete + ) + } + + /// Drive any in-progress downloads to completion and stop spawning any new ones. + /// + /// This has two callers and they behave differently + /// 1. 
[`Timeline::shutdown`]: the drain will be immediate since downloads themselves + /// are sensitive to timeline cancellation. + /// + /// 2. Endpoint handler in [`crate::http::routes`]: the drain will wait for any in-progress + /// downloads to complete. + async fn stop_and_drain(self) { + // Counterintuitive: close the guard before cancelling. + // Something needs to poll the already created download futures to completion. + // If we cancel first, then the underlying task exits and we lost + // the poller. + self.downloads_guard.close().await; + self.cancel.cancel(); + if let Err(err) = self.handle.await { + tracing::warn!("Failed to join heatmap layer downloader task: {err}"); + } + } +} + +impl Timeline { + pub(crate) async fn start_heatmap_layers_download( + self: &Arc, + concurrency: usize, + ) -> Result<(), ApiError> { + let mut locked = self.heatmap_layers_downloader.lock().unwrap(); + if locked.as_ref().map(|dl| dl.is_complete()).unwrap_or(true) { + let dl = HeatmapLayersDownloader::new(self.clone(), concurrency)?; + *locked = Some(dl); + Ok(()) + } else { + Err(ApiError::Conflict("Already running".to_string())) + } + } + + pub(crate) async fn stop_and_drain_heatmap_layers_download(&self) { + // This can race with the start of a new downloader and lead to a situation + // where one donloader is shutting down and another one is in-flight. + // The only impact is that we'd end up using more remote storage semaphore + // units than expected. + let downloader = self.heatmap_layers_downloader.lock().unwrap().take(); + if let Some(dl) = downloader { + dl.stop_and_drain().await; + } + } +} diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index e3e35a6303..8994721267 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -516,6 +516,24 @@ async fn handle_tenant_timeline_block_unblock_gc( json_response(StatusCode::OK, ()) } +async fn handle_tenant_timeline_download_heatmap_layers( + service: Arc, + req: Request, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?; + + check_permissions(&req, Scope::PageServerApi)?; + + let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + let concurrency: Option = parse_query_param(&req, "concurrency")?; + + service + .tenant_timeline_download_heatmap_layers(tenant_shard_id, timeline_id, concurrency) + .await?; + + json_response(StatusCode::OK, ()) +} + // For metric labels where we would like to include the approximate path, but exclude high-cardinality fields like query parameters // and tenant/timeline IDs. Since we are proxying to arbitrary paths, we don't have routing templates to // compare to, so we can just filter out our well known ID format with regexes. 
@@ -2078,6 +2096,16 @@ pub fn make_router( ) }, ) + .post( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_heatmap_layers", + |r| { + tenant_service_handler( + r, + handle_tenant_timeline_download_heatmap_layers, + RequestName("v1_tenant_timeline_download_heatmap_layers"), + ) + }, + ) // Tenant detail GET passthrough to shard zero: .get("/v1/tenant/:tenant_id", |r| { tenant_service_handler( diff --git a/storage_controller/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs index 141ff6f720..645cbdfce1 100644 --- a/storage_controller/src/pageserver_client.rs +++ b/storage_controller/src/pageserver_client.rs @@ -280,6 +280,22 @@ impl PageserverClient { ) } + pub(crate) async fn timeline_download_heatmap_layers( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + concurrency: Option, + ) -> Result<()> { + measured_request!( + "download_heatmap_layers", + crate::metrics::Method::Post, + &self.node_id_label, + self.inner + .timeline_download_heatmap_layers(tenant_shard_id, timeline_id, concurrency) + .await + ) + } + pub(crate) async fn get_utilization(&self) -> Result { measured_request!( "utilization", diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index d5713d49ee..5aa744f076 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -162,6 +162,7 @@ enum TenantOperations { TimelineDetachAncestor, TimelineGcBlockUnblock, DropDetached, + DownloadHeatmapLayers, } #[derive(Clone, strum_macros::Display)] @@ -3757,6 +3758,61 @@ impl Service { Ok(()) } + pub(crate) async fn tenant_timeline_download_heatmap_layers( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + concurrency: Option, + ) -> Result<(), ApiError> { + let _tenant_lock = trace_shared_lock( + &self.tenant_op_locks, + tenant_shard_id.tenant_id, + TenantOperations::DownloadHeatmapLayers, + ) + .await; + + let targets = { + let locked = self.inner.read().unwrap(); + let mut targets = Vec::new(); + + // If the request got an unsharded tenant id, then apply + // the operation to all shards. Otherwise, apply it to a specific shard. + let shards_range = if tenant_shard_id.is_unsharded() { + TenantShardId::tenant_range(tenant_shard_id.tenant_id) + } else { + tenant_shard_id.range() + }; + + for (tenant_shard_id, shard) in locked.tenants.range(shards_range) { + if let Some(node_id) = shard.intent.get_attached() { + let node = locked + .nodes + .get(node_id) + .expect("Pageservers may not be deleted while referenced"); + + targets.push((*tenant_shard_id, node.clone())); + } + } + targets + }; + + self.tenant_for_shards_api( + targets, + |tenant_shard_id, client| async move { + client + .timeline_download_heatmap_layers(tenant_shard_id, timeline_id, concurrency) + .await + }, + 1, + 1, + SHORT_RECONCILE_TIMEOUT, + &self.cancel, + ) + .await; + + Ok(()) + } + /// Helper for concurrently calling a pageserver API on a number of shards, such as timeline creation. /// /// On success, the returned vector contains exactly the same number of elements as the input `locations`. 
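For illustration, the new endpoint can be exercised directly against a pageserver as sketched below. The host, port, and IDs are placeholders rather than values from this patch, and an authorization header may be needed depending on the pageserver's auth configuration. The request returns immediately and the downloads proceed in a background task; concurrency defaults to 16 when the query parameter is omitted.

```bash
# Placeholder endpoint and IDs -- substitute real values.
PAGESERVER=http://localhost:9898
TENANT=1f359dd625e519a1a4e8d7509690f6fc   # unsharded tenant id; sharded tenants use the tenant shard id form
TIMELINE=de200bd42b49cc1814412c7e592dd6e9

# Start a background download of every layer in the current heatmap, capped at 8 concurrent downloads.
curl -X POST "$PAGESERVER/v1/tenant/$TENANT/timeline/$TIMELINE/download_heatmap_layers?concurrency=8"

# Stop and drain any in-progress heatmap layer downloads.
curl -X DELETE "$PAGESERVER/v1/tenant/$TENANT/timeline/$TIMELINE/download_heatmap_layers"
```

The same operation is also reachable through the storage controller route and the new `storcon_cli` subcommand added above, which fan out to all shards when an unsharded tenant ID is given.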
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index db81e54c49..12b096a2a0 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2467,6 +2467,14 @@ class NeonStorageController(MetricsGetter, LogUtils): response.raise_for_status() return [TenantShardId.parse(tid) for tid in response.json()["updated"]] + def download_heatmap_layers(self, tenant_shard_id: TenantShardId, timeline_id: TimelineId): + response = self.request( + "POST", + f"{self.api}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/download_heatmap_layers", + headers=self.headers(TokenScope.ADMIN), + ) + response.raise_for_status() + def __enter__(self) -> Self: return self diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 8a91a255d8..aa375604f4 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -974,12 +974,22 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder): # The new layer map should contain all the layers in the pre-migration one # and a new in memory layer - assert len(heatmap_before_migration["timelines"][0]["layers"]) + 1 == len( - heatmap_after_migration["timelines"][0]["layers"] + after_migration_heatmap_layers_count = len(heatmap_after_migration["timelines"][0]["layers"]) + assert ( + len(heatmap_before_migration["timelines"][0]["layers"]) + 1 + == after_migration_heatmap_layers_count ) - log.info( - f'Heatmap size after cold migration is {len(heatmap_after_migration["timelines"][0]["layers"])}' + log.info(f"Heatmap size after cold migration is {after_migration_heatmap_layers_count}") + + env.storage_controller.download_heatmap_layers( + TenantShardId(tenant_id, shard_number=0, shard_count=0), timeline_id ) - # TODO: Once we have an endpoint for rescuing the cold location, exercise it here. + def all_layers_downloaded(): + local_layers_count = len(ps_secondary.list_layers(tenant_id, timeline_id)) + + log.info(f"{local_layers_count=} {after_migration_heatmap_layers_count=}") + assert local_layers_count == after_migration_heatmap_layers_count + + wait_until(all_layers_downloaded) From 381115b68e8060e5601beeb300d723b9ad309fac Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Tue, 18 Feb 2025 16:32:32 +0000 Subject: [PATCH 512/583] Add pgaudit and pgauditlogtofile extensions (#10763) to compute image. This commit doesn't enable anything yet. It is a preparatory work for enabling audit logging in computes. 
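For context on where this is headed, enabling audit logging later would look roughly like the sketch below. This is illustrative only: nothing in this commit wires any of it up, the GUC values are examples rather than decided settings, and in a Neon compute these settings would presumably arrive via the compute spec rather than ad-hoc `ALTER SYSTEM` calls.

```sql
-- Illustrative sketch; not enabled by this commit.
-- Both libraries must be preloaded (takes effect only after a restart).
ALTER SYSTEM SET shared_preload_libraries = 'pgaudit, pgauditlogtofile';

-- Example pgaudit settings: audit DDL and role changes, skip bind parameters.
ALTER SYSTEM SET pgaudit.log = 'ddl, role';
ALTER SYSTEM SET pgaudit.log_parameter = off;
SELECT pg_reload_conf();

-- Object-level auditing additionally needs the extension created per database.
CREATE EXTENSION pgaudit;
```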
--- compute/compute-node.Dockerfile | 69 +++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 082dea6f1b..0491abe965 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1509,6 +1509,73 @@ WORKDIR /ext-src/pg_repack-src RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install + +######################################################################################### +# +# Layer "pgaudit" +# compile pgaudit extension +# +######################################################################################### + +FROM build-deps AS pgaudit-src +ARG PG_VERSION +WORKDIR /ext-src +RUN case "${PG_VERSION}" in \ + "v14") \ + export PGAUDIT_VERSION=1.6.2 \ + export PGAUDIT_CHECKSUM=1f350d70a0cbf488c0f2b485e3a5c9b11f78ad9e3cbb95ef6904afa1eb3187eb \ + ;; \ + "v15") \ + export PGAUDIT_VERSION=1.7.0 \ + export PGAUDIT_CHECKSUM=8f4a73e451c88c567e516e6cba7dc1e23bc91686bb6f1f77f8f3126d428a8bd8 \ + ;; \ + "v16") \ + export PGAUDIT_VERSION=16.0 \ + export PGAUDIT_CHECKSUM=d53ef985f2d0b15ba25c512c4ce967dce07b94fd4422c95bd04c4c1a055fe738 \ + ;; \ + "v17") \ + export PGAUDIT_VERSION=17.0 \ + export PGAUDIT_CHECKSUM=7d0d08d030275d525f36cd48b38c6455f1023da863385badff0cec44965bfd8c \ + ;; \ + *) \ + echo "pgaudit is not supported on this PostgreSQL version" && exit 1;; \ + esac && \ + wget https://github.com/pgaudit/pgaudit/archive/refs/tags/${PGAUDIT_VERSION}.tar.gz -O pgaudit.tar.gz && \ + echo "${PGAUDIT_CHECKSUM} pgaudit.tar.gz" | sha256sum --check && \ + mkdir pgaudit-src && cd pgaudit-src && tar xzf ../pgaudit.tar.gz --strip-components=1 -C . + +FROM pg-build AS pgaudit-build +COPY --from=pgaudit-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pgaudit-src +RUN make install USE_PGXS=1 -j $(getconf _NPROCESSORS_ONLN) + +######################################################################################### +# +# Layer "pgauditlogtofile" +# compile pgauditlogtofile extension +# +######################################################################################### + +FROM build-deps AS pgauditlogtofile-src +ARG PG_VERSION +WORKDIR /ext-src +RUN case "${PG_VERSION}" in \ + "v14" | "v15" | "v16" | "v17") \ + export PGAUDITLOGTOFILE_VERSION=v1.6.4 \ + export PGAUDITLOGTOFILE_CHECKSUM=ef801eb09c26aaa935c0dabd92c81eb9ebe338930daa9674d420a280c6bc2d70 \ + ;; \ + *) \ + echo "pgauditlogtofile is not supported on this PostgreSQL version" && exit 1;; \ + esac && \ + wget https://github.com/fmbiete/pgauditlogtofile/archive/refs/tags/${PGAUDITLOGTOFILE_VERSION}.tar.gz -O pgauditlogtofile.tar.gz && \ + echo "${PGAUDITLOGTOFILE_CHECKSUM} pgauditlogtofile.tar.gz" | sha256sum --check && \ + mkdir pgauditlogtofile-src && cd pgauditlogtofile-src && tar xzf ../pgauditlogtofile.tar.gz --strip-components=1 -C . 
+ +FROM pg-build AS pgauditlogtofile-build +COPY --from=pgauditlogtofile-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pgauditlogtofile-src +RUN make install USE_PGXS=1 -j $(getconf _NPROCESSORS_ONLN) + ######################################################################################### # # Layer "neon-ext-build" @@ -1604,6 +1671,8 @@ COPY --from=pg_partman-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_mooncake-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_duckdb-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_repack-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pgaudit-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pgauditlogtofile-build /usr/local/pgsql/ /usr/local/pgsql/ ######################################################################################### # From 9151d3a31899d2ce58732b85cf83f83393d7df74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?JC=20Gr=C3=BCnhage?= Date: Tue, 18 Feb 2025 18:20:03 +0100 Subject: [PATCH 513/583] feat(ci): notify storage oncall if deploy job fails on release branch (#10865) ## Problem If the deploy job on the release branch doesn't succeed, the preprod deployment will not have happened. It was requested that this triggers a notification in https://github.com/neondatabase/neon/issues/10662. ## Summary of changes If we're on the release branch and the deploy job doesn't end up in "success", notify storage oncall on slack. --- .github/actionlint.yml | 1 + .github/workflows/build_and_test.yml | 16 ++++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/.github/actionlint.yml b/.github/actionlint.yml index 2b96ce95da..5114517e7f 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -28,3 +28,4 @@ config-variables: - DEV_AWS_OIDC_ROLE_MANAGE_BENCHMARK_EC2_VMS_ARN - SLACK_ON_CALL_STORAGE_STAGING_STREAM - SLACK_CICD_CHANNEL_ID + - SLACK_STORAGE_CHANNEL_ID diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index bc773600ea..d9bf094aa9 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -1178,6 +1178,22 @@ jobs: exit 1 fi + notify-storage-release-deploy-failure: + needs: [ deploy ] + # We want this to run even if (transitive) dependencies are skipped, because deploy should really be successful on release branch workflow runs. + if: github.ref_name == 'release' && needs.deploy.result != 'success' && always() + runs-on: ubuntu-22.04 + steps: + - name: Post release-deploy failure to team-storage slack channel + uses: slackapi/slack-github-action@v2 + with: + method: chat.postMessage + token: ${{ secrets.SLACK_BOT_TOKEN }} + payload: | + channel: ${{ vars.SLACK_STORAGE_CHANNEL_ID }} + text: | + 🔴 @oncall-storage: deploy job on release branch had unexpected status "${{ needs.deploy.result }}" <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>. + # The job runs on `release` branch and copies compatibility data and Neon artifact from the last *release PR* to the latest directory promote-compatibility-data: needs: [ deploy ] From cb8060545d24b4abb3973f972ae371f45df12f8c Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 18 Feb 2025 18:49:01 +0100 Subject: [PATCH 514/583] pageserver: don't log noop image compaction (#10873) ## Problem We log image compaction stats even when no image compaction happened. This is logged every 10 seconds for every timeline. ## Summary of changes Only log when we actually performed any image compaction. 
--- pageserver/src/tenant/timeline.rs | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 94b4abb7e9..d02ab36e78 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -5167,14 +5167,16 @@ impl Timeline { .map(|l| l.metadata().file_size) .sum::(); - info!( - "created {} image layers ({} bytes) in {}s, processed {} out of {} partitions", - image_layers.len(), - total_layer_size, - duration.as_secs_f64(), - partition_processed, - total_partitions - ); + if !image_layers.is_empty() { + info!( + "created {} image layers ({} bytes) in {}s, processed {} out of {} partitions", + image_layers.len(), + total_layer_size, + duration.as_secs_f64(), + partition_processed, + total_partitions + ); + } Ok(( image_layers, From 538ea03f73e7359e475edb6715ebc369ccab12ea Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Tue, 18 Feb 2025 13:54:53 -0500 Subject: [PATCH 515/583] feat(pageserver): allow read path debug in getpagelsn API (#10748) ## Problem The usual workflow for me to debug read path errors in staging is: download the tenant to my laptop, import, and then run some read tests. With this patch, we can do this directly over staging pageservers. ## Summary of changes * Add a new `touchpagelsn` API that does a page read but does not return page info back. * Allow read from latest record LSN from get/touchpagelsn * Add read_debug config in the context. * The read path will read the context config to decide whether to enable read path tracing or not. Signed-off-by: Alex Chi Z --- pageserver/src/context.rs | 12 ++++++++ pageserver/src/http/routes.rs | 50 ++++++++++++++++++++++++------- pageserver/src/tenant/timeline.rs | 2 +- 3 files changed, 52 insertions(+), 12 deletions(-) diff --git a/pageserver/src/context.rs b/pageserver/src/context.rs index 8f2177fe5b..da9c095a15 100644 --- a/pageserver/src/context.rs +++ b/pageserver/src/context.rs @@ -98,6 +98,7 @@ pub struct RequestContext { download_behavior: DownloadBehavior, access_stats_behavior: AccessStatsBehavior, page_content_kind: PageContentKind, + read_path_debug: bool, } /// The kind of access to the page cache. 
@@ -155,6 +156,7 @@ impl RequestContextBuilder { download_behavior: DownloadBehavior::Download, access_stats_behavior: AccessStatsBehavior::Update, page_content_kind: PageContentKind::Unknown, + read_path_debug: false, }, } } @@ -168,6 +170,7 @@ impl RequestContextBuilder { download_behavior: original.download_behavior, access_stats_behavior: original.access_stats_behavior, page_content_kind: original.page_content_kind, + read_path_debug: original.read_path_debug, }, } } @@ -191,6 +194,11 @@ impl RequestContextBuilder { self } + pub(crate) fn read_path_debug(mut self, b: bool) -> Self { + self.inner.read_path_debug = b; + self + } + pub fn build(self) -> RequestContext { self.inner } @@ -291,4 +299,8 @@ impl RequestContext { pub(crate) fn page_content_kind(&self) -> PageContentKind { self.page_content_kind } + + pub(crate) fn read_path_debug(&self) -> bool { + self.read_path_debug + } } diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 329bf82bde..c2d5c3a933 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -68,6 +68,7 @@ use tokio_util::sync::CancellationToken; use tracing::*; use crate::config::PageServerConf; +use crate::context::RequestContextBuilder; use crate::context::{DownloadBehavior, RequestContext}; use crate::deletion_queue::DeletionQueueClient; use crate::pgdatadir_mapping::LsnForTimestamp; @@ -2571,14 +2572,30 @@ async fn deletion_queue_flush( } } -/// Try if `GetPage@Lsn` is successful, useful for manual debugging. async fn getpage_at_lsn_handler( + request: Request, + cancel: CancellationToken, +) -> Result, ApiError> { + getpage_at_lsn_handler_inner(false, request, cancel).await +} + +async fn touchpage_at_lsn_handler( + request: Request, + cancel: CancellationToken, +) -> Result, ApiError> { + getpage_at_lsn_handler_inner(true, request, cancel).await +} + +/// Try if `GetPage@Lsn` is successful, useful for manual debugging. +async fn getpage_at_lsn_handler_inner( + touch: bool, request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; - check_permission(&request, Some(tenant_shard_id.tenant_id))?; + // Require pageserver admin permission for this API instead of only tenant-level token. + check_permission(&request, None)?; let state = get_state(&request); struct Key(pageserver_api::key::Key); @@ -2593,22 +2610,29 @@ async fn getpage_at_lsn_handler( let key: Key = parse_query_param(&request, "key")? .ok_or_else(|| ApiError::BadRequest(anyhow!("missing 'key' query parameter")))?; - let lsn: Lsn = parse_query_param(&request, "lsn")? 
- .ok_or_else(|| ApiError::BadRequest(anyhow!("missing 'lsn' query parameter")))?; + let lsn: Option = parse_query_param(&request, "lsn")?; async { let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); + // Enable read path debugging + let ctx = RequestContextBuilder::extend(&ctx).read_path_debug(true).build(); let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?; + // Use last_record_lsn if no lsn is provided + let lsn = lsn.unwrap_or_else(|| timeline.get_last_record_lsn()); let page = timeline.get(key.0, lsn, &ctx).await?; - Result::<_, ApiError>::Ok( - Response::builder() - .status(StatusCode::OK) - .header(header::CONTENT_TYPE, "application/octet-stream") - .body(hyper::Body::from(page)) - .unwrap(), - ) + if touch { + json_response(StatusCode::OK, ()) + } else { + Result::<_, ApiError>::Ok( + Response::builder() + .status(StatusCode::OK) + .header(header::CONTENT_TYPE, "application/octet-stream") + .body(hyper::Body::from(page)) + .unwrap(), + ) + } } .instrument(info_span!("timeline_get", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) .await @@ -3743,6 +3767,10 @@ pub fn make_router( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/getpage", |r| testing_api_handler("getpage@lsn", r, getpage_at_lsn_handler), ) + .get( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/touchpage", + |r| api_handler(r, touchpage_at_lsn_handler), + ) .get( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/keyspace", |r| api_handler(r, timeline_collect_keyspace), diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index d02ab36e78..582825e890 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1298,7 +1298,7 @@ impl Timeline { reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, ) -> Result>, GetVectoredError> { - let read_path = if self.conf.enable_read_path_debugging { + let read_path = if self.conf.enable_read_path_debugging || ctx.read_path_debug() { Some(ReadPath::new(keyspace.clone(), lsn)) } else { None From 9d074db18db8c8a05df02f54563140e9cb2b7a63 Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Tue, 18 Feb 2025 20:54:21 +0100 Subject: [PATCH 516/583] Use link to cross-service-endpoint dashboard in allure reports and benchmarking workflow logs (#10874) ## Problem We have links to deprecated dashboards in our logs Example https://github.com/neondatabase/neon/actions/runs/13382454571/job/37401983608#step:8:348 ## Summary of changes Use link to cross service endpoint instead. 
Example: https://github.com/neondatabase/neon/actions/runs/13395407925/job/37413056148#step:7:345 --- test_runner/fixtures/neon_fixtures.py | 4 +- test_runner/fixtures/utils.py | 82 +++++++++++---------------- 2 files changed, 35 insertions(+), 51 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 12b096a2a0..58c5dbfd29 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -96,7 +96,7 @@ from fixtures.utils import ( ATTACHMENT_NAME_REGEX, COMPONENT_BINARIES, USE_LFC, - allure_add_grafana_links, + allure_add_grafana_link, assert_no_errors, get_dir_size, print_gc_result, @@ -3255,7 +3255,7 @@ def remote_pg( end_ms = int(datetime.utcnow().timestamp() * 1000) if is_neon: # Add 10s margin to the start and end times - allure_add_grafana_links( + allure_add_grafana_link( host, timeline_id, start_ms - 10_000, diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 2a59eab710..84d62fb877 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -312,62 +312,46 @@ def allure_attach_from_dir(dir: Path, preserve_database_files: bool = False): GRAFANA_URL = "https://neonprod.grafana.net" -GRAFANA_EXPLORE_URL = f"{GRAFANA_URL}/explore" -GRAFANA_TIMELINE_INSPECTOR_DASHBOARD_URL = f"{GRAFANA_URL}/d/8G011dlnk/timeline-inspector" -LOGS_STAGING_DATASOURCE_ID = "xHHYY0dVz" +GRAFANA_DASHBOARD_URL = f"{GRAFANA_URL}/d/cdya0okb81zwga/cross-service-endpoint-debugging" -def allure_add_grafana_links(host: str, timeline_id: TimelineId, start_ms: int, end_ms: int): - """Add links to server logs in Grafana to Allure report""" - links: dict[str, str] = {} - # We expect host to be in format like ep-divine-night-159320.us-east-2.aws.neon.build +def allure_add_grafana_link(host: str, timeline_id: TimelineId, start_ms: int, end_ms: int): + """ + Add a link to the cross-service endpoint debugging dashboard in Grafana to Allure report. + + Args: + host (str): The host string in the format 'ep-..'. + timeline_id (TimelineId): The timeline identifier for the Grafana dashboard. + (currently ignored but may be needed in future verions of the dashboard) + start_ms (int): The start time in milliseconds for the Grafana dashboard. + end_ms (int): The end time in milliseconds for the Grafana dashboard. 
+ + Example: + Given + host = '' + timeline_id = '996926d1f5ddbe7381b8840083f8fc9a' + + The generated link would be something like: + https://neonprod.grafana.net/d/cdya0okb81zwga/cross-service-endpoint-debugging?orgId=1&from=2025-02-17T21:10:00.000Z&to=2025-02-17T21:20:00.000Z&timezone=utc&var-env=dev%7Cstaging&var-input_endpoint_id=ep-holy-mouse-w2u462gi + + """ + # We expect host to be in format like ep-holy-mouse-w2u462gi.us-east-2.aws.neon.build endpoint_id, region_id, _ = host.split(".", 2) - expressions = { - "compute logs": f'{{app="compute-node-{endpoint_id}", neon_region="{region_id}"}}', - "k8s events": f'{{job="integrations/kubernetes/eventhandler"}} |~ "name=compute-node-{endpoint_id}-"', - "console logs": f'{{neon_service="console", neon_region="{region_id}"}} | json | endpoint_id = "{endpoint_id}"', - "proxy logs": f'{{neon_service="proxy-scram", neon_region="{region_id}"}}', + params = { + "orgId": 1, + "from": start_ms, + "to": end_ms, + "timezone": "utc", + "var-env": "dev|staging", + "var-input_endpoint_id": endpoint_id, } - params: dict[str, Any] = { - "datasource": LOGS_STAGING_DATASOURCE_ID, - "queries": [ - { - "expr": "", - "refId": "A", - "datasource": {"type": "loki", "uid": LOGS_STAGING_DATASOURCE_ID}, - "editorMode": "code", - "queryType": "range", - } - ], - "range": { - "from": str(start_ms), - "to": str(end_ms), - }, - } - for name, expr in expressions.items(): - params["queries"][0]["expr"] = expr - query_string = urlencode({"orgId": 1, "left": json.dumps(params)}) - links[name] = f"{GRAFANA_EXPLORE_URL}?{query_string}" + query_string = urlencode(params) + link = f"{GRAFANA_DASHBOARD_URL}?{query_string}" - timeline_qs = urlencode( - { - "orgId": 1, - "var-environment": "victoria-metrics-aws-dev", - "var-timeline_id": timeline_id, - "var-endpoint_id": endpoint_id, - "var-log_datasource": "grafanacloud-neonstaging-logs", - "from": start_ms, - "to": end_ms, - } - ) - link = f"{GRAFANA_TIMELINE_INSPECTOR_DASHBOARD_URL}?{timeline_qs}" - links["Timeline Inspector"] = link - - for name, link in links.items(): - allure.dynamic.link(link, name=name) - log.info(f"{name}: {link}") + allure.dynamic.link(link, name="Cross-Service Endpoint Debugging") + log.info(f"Cross-Service Endpoint Debugging: {link}") def start_in_background( From a4e3989c8da1bf0dc8a88b35a010055c29afd43f Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Tue, 18 Feb 2025 15:19:23 -0500 Subject: [PATCH 517/583] fix(pageserver): make repartition error critical (#10872) ## Problem Read errors during repartition should be a critical error. ## Summary of changes We only have one call site We have two call sites of `repartition` where one of them is during the initial image upload optimization and another is during image layer creation, so I added a `critical!` here instead of inside `collect_keyspace`. 
--------- Signed-off-by: Alex Chi Z --- pageserver/src/http/routes.rs | 1 + pageserver/src/tenant.rs | 6 ++++++ pageserver/src/tenant/tasks.rs | 3 +++ pageserver/src/tenant/timeline.rs | 9 +++++++-- pageserver/src/tenant/timeline/compaction.rs | 16 ++++++++++++++-- 5 files changed, 31 insertions(+), 4 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index c2d5c3a933..56a84a98a8 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -2395,6 +2395,7 @@ async fn timeline_checkpoint_handler( match e { CompactionError::ShuttingDown => ApiError::ShuttingDown, CompactionError::Offload(e) => ApiError::InternalServerError(anyhow::anyhow!(e)), + CompactionError::CollectKeySpaceError(e) => ApiError::InternalServerError(anyhow::anyhow!(e)), CompactionError::Other(e) => ApiError::InternalServerError(e) } )?; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 5d917da574..efb35625f2 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3150,6 +3150,12 @@ impl Tenant { // Offload failures don't trip the circuit breaker, since they're cheap to retry and // shouldn't block compaction. CompactionError::Offload(_) => {} + CompactionError::CollectKeySpaceError(err) => { + self.compaction_circuit_breaker + .lock() + .unwrap() + .fail(&CIRCUIT_BREAKERS_BROKEN, err); + } CompactionError::Other(err) => { self.compaction_circuit_breaker .lock() diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 029444e973..5e63f59fd8 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -287,6 +287,7 @@ fn log_compaction_error( sleep_duration: Duration, task_cancelled: bool, ) { + use crate::pgdatadir_mapping::CollectKeySpaceError; use crate::tenant::upload_queue::NotInitialized; use crate::tenant::PageReconstructError; use CompactionError::*; @@ -294,6 +295,8 @@ fn log_compaction_error( let level = match err { ShuttingDown => return, Offload(_) => Level::ERROR, + CollectKeySpaceError(CollectKeySpaceError::Cancelled) => Level::INFO, + CollectKeySpaceError(_) => Level::ERROR, _ if task_cancelled => Level::INFO, Other(err) => { let root_cause = err.root_cause(); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 582825e890..ea966d2b43 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1881,7 +1881,7 @@ impl Timeline { // Signal compaction failure to avoid L0 flush stalls when it's broken. match result { Ok(_) => self.compaction_failed.store(false, AtomicOrdering::Relaxed), - Err(CompactionError::Other(_)) => { + Err(CompactionError::Other(_)) | Err(CompactionError::CollectKeySpaceError(_)) => { self.compaction_failed.store(true, AtomicOrdering::Relaxed) } // Don't change the current value on offload failure or shutdown. We don't want to @@ -4604,7 +4604,10 @@ impl Timeline { )); } - let (dense_ks, sparse_ks) = self.collect_keyspace(lsn, ctx).await?; + let (dense_ks, sparse_ks) = self + .collect_keyspace(lsn, ctx) + .await + .map_err(CompactionError::CollectKeySpaceError)?; let dense_partitioning = dense_ks.partition(&self.shard_identity, partition_size); let sparse_partitioning = SparseKeyPartitioning { parts: vec![sparse_ks], @@ -5319,6 +5322,8 @@ pub(crate) enum CompactionError { #[error("Failed to offload timeline: {0}")] Offload(OffloadError), /// Compaction cannot be done right now; page reconstruction and so on. 
+ #[error("Failed to collect keyspace: {0}")] + CollectKeySpaceError(CollectKeySpaceError), #[error(transparent)] Other(anyhow::Error), } diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 9e082d74b5..4e4f906d78 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -11,7 +11,8 @@ use std::sync::Arc; use super::layer_manager::LayerManager; use super::{ CompactFlags, CompactOptions, CreateImageLayersError, DurationRecorder, GetVectoredError, - ImageLayerCreationMode, LastImageLayerCreationStatus, RecordedDuration, Timeline, + ImageLayerCreationMode, LastImageLayerCreationStatus, PageReconstructError, RecordedDuration, + Timeline, }; use anyhow::{anyhow, bail, Context}; @@ -31,6 +32,7 @@ use utils::id::TimelineId; use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}; use crate::page_cache; +use crate::pgdatadir_mapping::CollectKeySpaceError; use crate::statvfs::Statvfs; use crate::tenant::checks::check_valid_layermap; use crate::tenant::gc_block::GcBlock; @@ -781,7 +783,17 @@ impl Timeline { // // Suppress error when it's due to cancellation if !self.cancel.is_cancelled() && !err.is_cancelled() { - tracing::error!("could not compact, repartitioning keyspace failed: {err:?}"); + if let CompactionError::CollectKeySpaceError( + CollectKeySpaceError::Decode(_) + | CollectKeySpaceError::PageRead(PageReconstructError::MissingKey(_)), + ) = err + { + critical!("could not compact, repartitioning keyspace failed: {err:?}"); + } else { + tracing::error!( + "could not compact, repartitioning keyspace failed: {err:?}" + ); + } } } }; From 7199919f04887a993d07df5d556ffa0c2b5ee251 Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Wed, 19 Feb 2025 07:40:09 +0100 Subject: [PATCH 518/583] Fix the problems discovered in the upgrade test (#10826) ## Problem The nightly test discovered problems in the extensions upgrade test. 1. `PLv8` has different versions on PGv17 and PGv16 and a different test set, which was not implemented correctly [sample](https://github.com/neondatabase/neon/actions/runs/13382330475/job/37372930271) 2. The same for `semver` [sample](https://github.com/neondatabase/neon/actions/runs/13382330475/job/37372930017) 3. `pgtap` interfered with the other tests, e.g. tables, created by other extensions caused the tests to fail. ## Summary of changes The discovered problems were fixed. 1. The tests list for `PLv8` is now generated using the original Makefile 2. The patches for `semver` are now split for PGv16 and PGv17. 3. `pgtap` is being tested in a separate database now. 
--------- Co-authored-by: Mikhail Kot --- compute/compute-node.Dockerfile | 3 +++ ...st-upgrade.patch => test-upgrade-16.patch} | 0 .../pg_semver-src/test-upgrade-17.patch | 24 +++++++++++++++++++ .../ext-src/pg_semver-src/test-upgrade.sh | 3 ++- .../ext-src/pgtap-src/test-upgrade.patch | 13 ++++++++++ .../ext-src/pgtap-src/test-upgrade.sh | 3 +-- .../ext-src/plv8-src/test-upgrade.sh | 3 ++- docker-compose/test_extensions_upgrade.sh | 2 ++ 8 files changed, 47 insertions(+), 4 deletions(-) rename docker-compose/ext-src/pg_semver-src/{test-upgrade.patch => test-upgrade-16.patch} (100%) create mode 100644 docker-compose/ext-src/pg_semver-src/test-upgrade-17.patch diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 0491abe965..0b3001613d 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1847,11 +1847,14 @@ COPY --from=pg_partman-src /ext-src/ /ext-src/ #COPY --from=pg_repack-src /ext-src/ /ext-src/ COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh +RUN apt-get update && apt-get install -y libtap-parser-sourcehandler-pgtap-perl\ + && apt clean && rm -rf /ext-src/*.tar.gz /var/lib/apt/lists/* ENV PATH=/usr/local/pgsql/bin:$PATH ENV PGHOST=compute ENV PGPORT=55433 ENV PGUSER=cloud_admin ENV PGDATABASE=postgres +ENV PG_VERSION=${PG_VERSION:?} ######################################################################################### # diff --git a/docker-compose/ext-src/pg_semver-src/test-upgrade.patch b/docker-compose/ext-src/pg_semver-src/test-upgrade-16.patch similarity index 100% rename from docker-compose/ext-src/pg_semver-src/test-upgrade.patch rename to docker-compose/ext-src/pg_semver-src/test-upgrade-16.patch diff --git a/docker-compose/ext-src/pg_semver-src/test-upgrade-17.patch b/docker-compose/ext-src/pg_semver-src/test-upgrade-17.patch new file mode 100644 index 0000000000..2d0bf280db --- /dev/null +++ b/docker-compose/ext-src/pg_semver-src/test-upgrade-17.patch @@ -0,0 +1,24 @@ +diff --git a/test/sql/base.sql b/test/sql/base.sql +index 53adb30..2eed91b 100644 +--- a/test/sql/base.sql ++++ b/test/sql/base.sql +@@ -2,7 +2,6 @@ + BEGIN; + + \i test/pgtap-core.sql +-CREATE EXTENSION semver; + + SELECT plan(334); + --SELECT * FROM no_plan(); +diff --git a/test/sql/corpus.sql b/test/sql/corpus.sql +index c0fe98e..39cdd2e 100644 +--- a/test/sql/corpus.sql ++++ b/test/sql/corpus.sql +@@ -4,7 +4,6 @@ BEGIN; + -- Test the SemVer corpus from https://regex101.com/r/Ly7O1x/3/. + + \i test/pgtap-core.sql +-CREATE EXTENSION semver; + + SELECT plan(76); + --SELECT * FROM no_plan(); diff --git a/docker-compose/ext-src/pg_semver-src/test-upgrade.sh b/docker-compose/ext-src/pg_semver-src/test-upgrade.sh index e1541f272a..18b2848fd1 100755 --- a/docker-compose/ext-src/pg_semver-src/test-upgrade.sh +++ b/docker-compose/ext-src/pg_semver-src/test-upgrade.sh @@ -1,6 +1,7 @@ #!/bin/sh set -ex cd "$(dirname ${0})" -patch -p1 Date: Wed, 19 Feb 2025 09:43:53 +0100 Subject: [PATCH 519/583] add a variant to ingest benchmark with shard-splitting disabled (#10876) ## Problem we measure ingest performance for a few variants (stripe-sizes, pre-sharded, shard-splitted). However some phenomena (e.g. related to L0 compaction) in PS can be better observed and optimized with un-sharded tenants. 
## Summary of changes - Allow to create projects with a policy that disables sharding (`{"scheduling": "Essential"}`) - add a variant to ingest_benchmark that uses that policy for the new project ## Test run https://github.com/neondatabase/neon/actions/runs/13396325970 --- .../actions/neon-project-create/action.yml | 22 ++++++++++++++++++- .github/workflows/ingest_benchmark.yml | 10 +++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml index c9f6b0832e..a393aa6106 100644 --- a/.github/actions/neon-project-create/action.yml +++ b/.github/actions/neon-project-create/action.yml @@ -19,7 +19,11 @@ inputs: default: '[1, 1]' # settings below only needed if you want the project to be sharded from the beginning shard_split_project: - description: 'by default new projects are not shard-split, specify true to shard-split' + description: 'by default new projects are not shard-split initiailly, but only when shard-split threshold is reached, specify true to explicitly shard-split initially' + required: false + default: 'false' + disable_sharding: + description: 'by default new projects use storage controller default policy to shard-split when shard-split threshold is reached, specify true to explicitly disable sharding' required: false default: 'false' admin_api_key: @@ -107,6 +111,21 @@ runs: -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer ${ADMIN_API_KEY}" \ -d "{\"new_shard_count\": $SHARD_COUNT, \"new_stripe_size\": $STRIPE_SIZE}" fi + if [ "${DISABLE_SHARDING}" = "true" ]; then + # determine tenant ID + TENANT_ID=`${PSQL} ${dsn} -t -A -c "SHOW neon.tenant_id"` + + echo "Explicitly disabling shard-splitting for project ${project_id} with tenant_id ${TENANT_ID}" + + echo "Sending PUT request to https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/storage/proxy/control/v1/tenant/${TENANT_ID}/policy" + echo "with body {\"scheduling\": \"Essential\"}" + + # we need an ADMIN API KEY to invoke storage controller API for shard splitting (bash -u above checks that the variable is set) + curl -X PUT \ + "https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/storage/proxy/control/v1/tenant/${TENANT_ID}/policy" \ + -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer ${ADMIN_API_KEY}" \ + -d "{\"scheduling\": \"Essential\"}" + fi env: API_HOST: ${{ inputs.api_host }} @@ -116,6 +135,7 @@ runs: MIN_CU: ${{ fromJSON(inputs.compute_units)[0] }} MAX_CU: ${{ fromJSON(inputs.compute_units)[1] }} SHARD_SPLIT_PROJECT: ${{ inputs.shard_split_project }} + DISABLE_SHARDING: ${{ inputs.disable_sharding }} ADMIN_API_KEY: ${{ inputs.admin_api_key }} SHARD_COUNT: ${{ inputs.shard_count }} STRIPE_SIZE: ${{ inputs.stripe_size }} diff --git a/.github/workflows/ingest_benchmark.yml b/.github/workflows/ingest_benchmark.yml index 7b303fa37a..c20c5890f9 100644 --- a/.github/workflows/ingest_benchmark.yml +++ b/.github/workflows/ingest_benchmark.yml @@ -32,18 +32,27 @@ jobs: - target_project: new_empty_project_stripe_size_2048 stripe_size: 2048 # 16 MiB postgres_version: 16 + disable_sharding: false - target_project: new_empty_project_stripe_size_32768 stripe_size: 32768 # 256 MiB # note that this is different from null because using null will shard_split the project only if it reaches the threshold # while here it is sharded from the beginning with a shard size of 256 MiB + disable_sharding: false postgres_version: 16 - target_project: 
new_empty_project stripe_size: null # run with neon defaults which will shard split only when reaching the threshold + disable_sharding: false postgres_version: 16 - target_project: new_empty_project stripe_size: null # run with neon defaults which will shard split only when reaching the threshold + disable_sharding: false postgres_version: 17 - target_project: large_existing_project stripe_size: null # cannot re-shared or choose different stripe size for existing, already sharded project + disable_sharding: false + postgres_version: 16 + - target_project: new_empty_project_unsharded + stripe_size: null # run with neon defaults which will shard split only when reaching the threshold + disable_sharding: true postgres_version: 16 max-parallel: 1 # we want to run each stripe size sequentially to be able to compare the results permissions: @@ -96,6 +105,7 @@ jobs: admin_api_key: ${{ secrets.NEON_STAGING_ADMIN_API_KEY }} shard_count: 8 stripe_size: ${{ matrix.stripe_size }} + disable_sharding: ${{ matrix.disable_sharding }} - name: Initialize Neon project if: ${{ startsWith(matrix.target_project, 'new_empty_project') }} From aa115a774cb4e2245399c9994a42ea05a1fd9ddd Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Wed, 19 Feb 2025 10:01:02 +0100 Subject: [PATCH 520/583] storcon: eagerly attempt autosplits (#10849) ## Problem Autosplits are crucial for bulk ingest performance. However, autosplits were only attempted when there was no other pending work. This could cause e.g. mass AZ affinity violations following Pageserver restarts to starve out autosplits for hours. Resolves #10762. ## Summary of changes Always attempt autosplits in the background reconciliation loop, regardless of other pending work. --- storage_controller/src/service.rs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 5aa744f076..dd4d93dc84 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -1031,12 +1031,11 @@ impl Service { let reconciles_spawned = self.reconcile_all(); if reconciles_spawned == 0 { // Run optimizer only when we didn't find any other work to do - let optimizations = self.optimize_all().await; - if optimizations == 0 { - // Run new splits only when no optimizations are pending - self.autosplit_tenants().await; - } + self.optimize_all().await; } + // Always attempt autosplits. Sharding is crucial for bulk ingest performance, so we + // must be responsive when new projects begin ingesting and reach the threshold. + self.autosplit_tenants().await; } _ = self.reconcilers_cancel.cancelled() => return } From e52e93797fbb930da8cf7139e150d748920182f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?JC=20Gr=C3=BCnhage?= Date: Wed, 19 Feb 2025 13:34:41 +0100 Subject: [PATCH 521/583] refactor(ci): use variables for AWS account IDs (#10886) ## Problem Our AWS account IDs are copy-pasted all over the place. A wrong paste might only be caught late if we hardcode them, but will get flagged instantly by actionlint if we access them from github actions variables. Resolves https://github.com/neondatabase/neon/issues/10787, follow-up for https://github.com/neondatabase/neon/pull/10613. ## Summary of changes Access AWS account IDs using Github Actions variables. 
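As a side note, the variables referenced here have to exist at the repository (or organization) level for the workflows to resolve them. One way to define them, assuming the GitHub CLI's `gh variable set` command and repository-level scoping, is sketched below; the placeholders stand for the account IDs that were previously hardcoded.

```bash
# Assumed setup, not part of this change.
gh variable set NEON_DEV_AWS_ACCOUNT_ID  --body "<dev account id>"  --repo neondatabase/neon
gh variable set NEON_PROD_AWS_ACCOUNT_ID --body "<prod account id>" --repo neondatabase/neon
gh variable set AWS_ECR_REGION           --body "eu-central-1"      --repo neondatabase/neon
```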
--- .github/actionlint.yml | 3 +++ .../workflows/_push-to-container-registry.yml | 2 +- .github/workflows/build_and_test.yml | 23 +++++++++++-------- .../build_and_test_with_sanitizers.yml | 2 +- .github/workflows/pin-build-tools-image.yml | 6 ++--- scripts/generate_image_maps.py | 7 ++++-- 6 files changed, 26 insertions(+), 17 deletions(-) diff --git a/.github/actionlint.yml b/.github/actionlint.yml index 5114517e7f..1e6c2d0aa2 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -29,3 +29,6 @@ config-variables: - SLACK_ON_CALL_STORAGE_STAGING_STREAM - SLACK_CICD_CHANNEL_ID - SLACK_STORAGE_CHANNEL_ID + - NEON_DEV_AWS_ACCOUNT_ID + - NEON_PROD_AWS_ACCOUNT_ID + - AWS_ECR_REGION diff --git a/.github/workflows/_push-to-container-registry.yml b/.github/workflows/_push-to-container-registry.yml index 3c97c8a67a..c938f62ad5 100644 --- a/.github/workflows/_push-to-container-registry.yml +++ b/.github/workflows/_push-to-container-registry.yml @@ -2,7 +2,7 @@ name: Push images to Container Registry on: workflow_call: inputs: - # Example: {"docker.io/neondatabase/neon:13196061314":["369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:13196061314","neoneastus2.azurecr.io/neondatabase/neon:13196061314"]} + # Example: {"docker.io/neondatabase/neon:13196061314":["${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/neon:13196061314","neoneastus2.azurecr.io/neondatabase/neon:13196061314"]} image-map: description: JSON map of images, mapping from a source image to an array of target images that should be pushed. required: true diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index d9bf094aa9..f08280e112 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -68,7 +68,7 @@ jobs: tag: needs: [ check-permissions ] runs-on: [ self-hosted, small ] - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned + container: ${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/base:pinned outputs: build-tag: ${{steps.build-tag.outputs.tag}} @@ -859,14 +859,17 @@ jobs: BRANCH: "${{ github.ref_name }}" DEV_ACR: "${{ vars.AZURE_DEV_REGISTRY_NAME }}" PROD_ACR: "${{ vars.AZURE_PROD_REGISTRY_NAME }}" + DEV_AWS: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}" + PROD_AWS: "${{ vars.NEON_PROD_AWS_ACCOUNT_ID }}" + AWS_REGION: "${{ vars.AWS_ECR_REGION }}" push-neon-image-dev: needs: [ generate-image-maps, neon-image ] uses: ./.github/workflows/_push-to-container-registry.yml with: image-map: '${{ needs.generate-image-maps.outputs.neon-dev }}' - aws-region: eu-central-1 - aws-account-ids: "369495373322" + aws-region: ${{ vars.AWS_ECR_REGION }} + aws-account-ids: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}" azure-client-id: ${{ vars.AZURE_DEV_CLIENT_ID }} azure-subscription-id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }} azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} @@ -881,8 +884,8 @@ jobs: uses: ./.github/workflows/_push-to-container-registry.yml with: image-map: '${{ needs.generate-image-maps.outputs.compute-dev }}' - aws-region: eu-central-1 - aws-account-ids: "369495373322" + aws-region: ${{ vars.AWS_ECR_REGION }} + aws-account-ids: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}" azure-client-id: ${{ vars.AZURE_DEV_CLIENT_ID }} azure-subscription-id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }} azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} @@ -898,8 +901,8 @@ jobs: uses: ./.github/workflows/_push-to-container-registry.yml with: image-map: '${{ needs.generate-image-maps.outputs.neon-prod }}' - 
aws-region: eu-central-1 - aws-account-ids: "093970136003" + aws-region: ${{ vars.AWS_ECR_REGION }} + aws-account-ids: "${{ vars.NEON_PROD_AWS_ACCOUNT_ID }}" azure-client-id: ${{ vars.AZURE_PROD_CLIENT_ID }} azure-subscription-id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }} azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} @@ -915,8 +918,8 @@ jobs: uses: ./.github/workflows/_push-to-container-registry.yml with: image-map: '${{ needs.generate-image-maps.outputs.compute-prod }}' - aws-region: eu-central-1 - aws-account-ids: "093970136003" + aws-region: ${{ vars.AWS_ECR_REGION }} + aws-account-ids: "${{ vars.NEON_PROD_AWS_ACCOUNT_ID }}" azure-client-id: ${{ vars.AZURE_PROD_CLIENT_ID }} azure-subscription-id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }} azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} @@ -1029,7 +1032,7 @@ jobs: statuses: write contents: write runs-on: [ self-hosted, small ] - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest + container: ${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/ansible:latest steps: - uses: actions/checkout@v4 diff --git a/.github/workflows/build_and_test_with_sanitizers.yml b/.github/workflows/build_and_test_with_sanitizers.yml index 2bc938509f..e40b02b5d2 100644 --- a/.github/workflows/build_and_test_with_sanitizers.yml +++ b/.github/workflows/build_and_test_with_sanitizers.yml @@ -27,7 +27,7 @@ env: jobs: tag: runs-on: [ self-hosted, small ] - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned + container: ${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/base:pinned outputs: build-tag: ${{steps.build-tag.outputs.tag}} diff --git a/.github/workflows/pin-build-tools-image.yml b/.github/workflows/pin-build-tools-image.yml index 626de2b0e0..8861c1f093 100644 --- a/.github/workflows/pin-build-tools-image.yml +++ b/.github/workflows/pin-build-tools-image.yml @@ -78,7 +78,7 @@ jobs: - name: Configure AWS credentials uses: aws-actions/configure-aws-credentials@v4 with: - aws-region: eu-central-1 + aws-region: ${{ vars.AWS_ECR_REGION }} role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} role-duration-seconds: 3600 @@ -104,12 +104,12 @@ jobs: tags=() tags+=("-t" "neondatabase/build-tools:${TO_TAG}-${debian_version}") - tags+=("-t" "369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG}-${debian_version}") + tags+=("-t" "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/build-tools:${TO_TAG}-${debian_version}") tags+=("-t" "neoneastus2.azurecr.io/neondatabase/build-tools:${TO_TAG}-${debian_version}") if [ "${debian_version}" == "${DEFAULT_DEBIAN_VERSION}" ]; then tags+=("-t" "neondatabase/build-tools:${TO_TAG}") - tags+=("-t" "369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG}") + tags+=("-t" "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/build-tools:${TO_TAG}") tags+=("-t" "neoneastus2.azurecr.io/neondatabase/build-tools:${TO_TAG}") fi diff --git a/scripts/generate_image_maps.py b/scripts/generate_image_maps.py index a2f553d290..915eb33673 100644 --- a/scripts/generate_image_maps.py +++ b/scripts/generate_image_maps.py @@ -6,6 +6,9 @@ build_tag = os.environ["BUILD_TAG"] branch = os.environ["BRANCH"] dev_acr = os.environ["DEV_ACR"] prod_acr = os.environ["PROD_ACR"] +dev_aws = os.environ["DEV_AWS"] +prod_aws = os.environ["PROD_AWS"] +aws_region = os.environ["AWS_REGION"] components = { "neon": ["neon"], @@ -24,11 +27,11 @@ components = { registries = { "dev": [ 
"docker.io/neondatabase", - "369495373322.dkr.ecr.eu-central-1.amazonaws.com", + f"{dev_aws}.dkr.ecr.{aws_region}.amazonaws.com", f"{dev_acr}.azurecr.io/neondatabase", ], "prod": [ - "093970136003.dkr.ecr.eu-central-1.amazonaws.com", + f"{prod_aws}.dkr.ecr.{aws_region}.amazonaws.com", f"{prod_acr}.azurecr.io/neondatabase", ], } From 2d96134a4ee963cb9fd76ad46d8613ee1204654a Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 19 Feb 2025 16:09:01 +0200 Subject: [PATCH 522/583] Remove unused dependencies (#10887) Per cargo machete. --- Cargo.lock | 7 ------- compute_tools/Cargo.toml | 3 --- libs/proxy/tokio-postgres2/Cargo.toml | 5 +---- libs/utils/Cargo.toml | 1 - 4 files changed, 1 insertion(+), 15 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 12c12bc771..1cab85adb3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1316,7 +1316,6 @@ dependencies = [ "flate2", "futures", "http 1.1.0", - "jsonwebtoken", "metrics", "nix 0.27.1", "notify", @@ -1326,7 +1325,6 @@ dependencies = [ "opentelemetry_sdk", "postgres", "postgres_initdb", - "prometheus", "regex", "remote_storage", "reqwest", @@ -1345,7 +1343,6 @@ dependencies = [ "tower 0.5.2", "tower-http", "tracing", - "tracing-opentelemetry", "tracing-subscriber", "tracing-utils", "url", @@ -7021,14 +7018,11 @@ dependencies = [ name = "tokio-postgres2" version = "0.1.0" dependencies = [ - "async-trait", - "byteorder", "bytes", "fallible-iterator", "futures-util", "log", "parking_lot 0.12.1", - "percent-encoding", "phf", "pin-project-lite", "postgres-protocol2", @@ -7615,7 +7609,6 @@ dependencies = [ "hex", "hex-literal", "humantime", - "inferno 0.12.0", "jsonwebtoken", "metrics", "nix 0.27.1", diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 81dcf99560..c276996df5 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -25,7 +25,6 @@ fail.workspace = true flate2.workspace = true futures.workspace = true http.workspace = true -jsonwebtoken.workspace = true metrics.workspace = true nix.workspace = true notify.workspace = true @@ -48,13 +47,11 @@ tokio-postgres.workspace = true tokio-util.workspace = true tokio-stream.workspace = true tracing.workspace = true -tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true tracing-utils.workspace = true thiserror.workspace = true url.workspace = true uuid.workspace = true -prometheus.workspace = true walkdir.workspace = true postgres_initdb.workspace = true diff --git a/libs/proxy/tokio-postgres2/Cargo.toml b/libs/proxy/tokio-postgres2/Cargo.toml index ade0ffc9f6..161c6b8309 100644 --- a/libs/proxy/tokio-postgres2/Cargo.toml +++ b/libs/proxy/tokio-postgres2/Cargo.toml @@ -5,18 +5,15 @@ edition = "2021" license = "MIT/Apache-2.0" [dependencies] -async-trait.workspace = true bytes.workspace = true -byteorder.workspace = true fallible-iterator.workspace = true futures-util = { workspace = true, features = ["sink"] } log = "0.4" parking_lot.workspace = true -percent-encoding = "2.0" pin-project-lite.workspace = true phf = "0.11" postgres-protocol2 = { path = "../postgres-protocol2" } postgres-types2 = { path = "../postgres-types2" } tokio = { workspace = true, features = ["io-util", "time", "net"] } tokio-util = { workspace = true, features = ["codec"] } -serde = { workspace = true, features = ["derive"] } \ No newline at end of file +serde = { workspace = true, features = ["derive"] } diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index e9611a0f12..62e0f4cfba 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -24,7 
+24,6 @@ diatomic-waker.workspace = true git-version.workspace = true hex = { workspace = true, features = ["serde"] } humantime.workspace = true -inferno.workspace = true fail.workspace = true futures = { workspace = true } jsonwebtoken.workspace = true From 0453eaf65c9328a49720db2af9747a6a8df01872 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Wed, 19 Feb 2025 15:12:05 +0100 Subject: [PATCH 523/583] pageserver: reduce default `compaction_upper_limit` to 20 (#10889) ## Problem We've seen the previous default of 50 cause OOMs. Compacting many L0 layers at once now has limited benefit, since the cost is mostly linear anyway. This is already being reduced to 20 in production settings. ## Summary of changes Reduce `DEFAULT_COMPACTION_UPPER_LIMIT` to 20. Once released, let's remove the config overrides. --- libs/pageserver_api/src/config.rs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index e64052c73d..0f33bcf45b 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -544,10 +544,11 @@ pub mod tenant_conf_defaults { pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s"; pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10; - // This value needs to be tuned to avoid OOM. We have 3/4 of the total CPU threads to do background works, that's 16*3/4=9 on - // most of our pageservers. Compaction ~50 layers requires about 2GB memory (could be reduced later by optimizing L0 hole - // calculation to avoid loading all keys into the memory). So with this config, we can get a maximum peak compaction usage of 18GB. - pub const DEFAULT_COMPACTION_UPPER_LIMIT: usize = 50; + // This value needs to be tuned to avoid OOM. We have 3/4*CPUs threads for L0 compaction, that's + // 3/4*16=9 on most of our pageservers. Compacting 20 layers requires about 1 GB memory (could + // be reduced later by optimizing L0 hole calculation to avoid loading all keys into memory). So + // with this config, we can get a maximum peak compaction usage of 9 GB. + pub const DEFAULT_COMPACTION_UPPER_LIMIT: usize = 20; pub const DEFAULT_COMPACTION_L0_FIRST: bool = false; pub const DEFAULT_COMPACTION_L0_SEMAPHORE: bool = true; From 3720cf1c5aed3aa8b50cd8d5c85572a51a01e766 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Wed, 19 Feb 2025 15:20:51 +0100 Subject: [PATCH 524/583] storcon: use jemalloc (#10892) ## Problem We'd like to enable CPU/heap profiling for storcon. This requires jemalloc. ## Summary of changes Use jemalloc as the global allocator, and enable heap sampling for profiling. 
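For readers unfamiliar with the jemalloc setup, the pattern applied in the diff below can be sketched standalone roughly like this (crate and symbol names are taken from the patch itself; this is an illustrative sketch, not a drop-in replacement for the real `main.rs`):

```rust
// Minimal sketch: route all allocations through jemalloc and enable heap sampling.
use tikv_jemallocator::Jemalloc;

#[global_allocator]
static GLOBAL: Jemalloc = Jemalloc;

// jemalloc reads its configuration from an exported `malloc_conf` symbol at startup.
// `lg_prof_sample:21` samples one allocation stack roughly every 2^21 bytes (2 MiB).
#[allow(non_upper_case_globals)]
#[export_name = "malloc_conf"]
pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0";

fn main() {
    // Any allocation from here on goes through jemalloc and may be sampled
    // into the heap profile used for profiling.
    let buf: Vec<u8> = vec![0; 4 * 1024 * 1024];
    println!("allocated {} bytes via jemalloc", buf.len());
}
```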
--- Cargo.lock | 1 + storage_controller/Cargo.toml | 1 + storage_controller/src/main.rs | 10 ++++++++++ 3 files changed, 12 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index 1cab85adb3..12232eaece 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6465,6 +6465,7 @@ dependencies = [ "strum", "strum_macros", "thiserror 1.0.69", + "tikv-jemallocator", "tokio", "tokio-postgres", "tokio-postgres-rustls", diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index a93bbdeaaf..73dc1a5c10 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -34,6 +34,7 @@ reqwest = { workspace = true, features = ["stream"] } routerify.workspace = true safekeeper_api.workspace = true safekeeper_client.workspace = true +tikv-jemallocator.workspace = true regex.workspace = true rustls-native-certs.workspace = true serde.workspace = true diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index ea6bc38e89..9a9958f7a6 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -27,6 +27,16 @@ use utils::{project_build_tag, project_git_version, tcp_listener}; project_git_version!(GIT_VERSION); project_build_tag!(BUILD_TAG); +#[global_allocator] +static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; + +/// Configure jemalloc to profile heap allocations by sampling stack traces every 2 MB (1 << 21). +/// This adds roughly 3% overhead for allocations on average, which is acceptable considering +/// performance-sensitive code will avoid allocations as far as possible anyway. +#[allow(non_upper_case_globals)] +#[export_name = "malloc_conf"] +pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0"; + #[derive(Parser)] #[command(author, version, about, long_about = None)] #[command(arg_required_else_help(true))] From aab5482fd5fc43b0c092e22c0cab0e86b8655673 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Wed, 19 Feb 2025 15:43:29 +0100 Subject: [PATCH 525/583] storcon: add CPU/heap profiling endpoints (#10894) Adds CPU/heap profiling for storcon. Also fixes allowlists to match on the path only, since profiling endpoints take query parameters. Requires #10892 for heap profiling. 
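To illustrate why the allowlist comparison has to switch from the full URI string to `Uri::path()` once these endpoints take query parameters, here is a small self-contained sketch (the `seconds=5` query parameter is made up for the example and is not taken from the patch):

```rust
use http::Uri;

fn main() {
    // Hypothetical request to the heap profiling endpoint with a query string.
    let uri: Uri = "/profile/heap?seconds=5".parse().unwrap();

    const ALLOWLIST: &[&str] = &["/status", "/ready", "/metrics", "/profile/cpu", "/profile/heap"];

    // Comparing the full URI string fails as soon as query parameters are present...
    assert!(!ALLOWLIST.contains(&uri.to_string().as_str()));
    // ...while comparing only the path still matches, which is what the patch switches to.
    assert!(ALLOWLIST.contains(&uri.path()));

    println!("path-based allowlist match: {}", uri.path());
}
```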
--- storage_controller/src/http.rs | 62 +++++++++++++++++++++++----------- 1 file changed, 42 insertions(+), 20 deletions(-) diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 8994721267..1cc61a12e8 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -9,7 +9,10 @@ use crate::service::{LeadershipStatus, Service, RECONCILE_TIMEOUT, STARTUP_RECON use anyhow::Context; use futures::Future; use http_utils::{ - endpoint::{self, auth_middleware, check_permission_with, request_span}, + endpoint::{ + self, auth_middleware, check_permission_with, profile_cpu_handler, profile_heap_handler, + request_span, + }, error::ApiError, failpoints::failpoints_handler, json::{json_request, json_response}, @@ -54,7 +57,7 @@ pub struct HttpState { service: Arc, auth: Option>, neon_metrics: NeonMetrics, - allowlist_routes: Vec, + allowlist_routes: &'static [&'static str], } impl HttpState { @@ -63,15 +66,17 @@ impl HttpState { auth: Option>, build_info: BuildInfo, ) -> Self { - let allowlist_routes = ["/status", "/ready", "/metrics"] - .iter() - .map(|v| v.parse().unwrap()) - .collect::>(); Self { service, auth, neon_metrics: NeonMetrics::new(build_info), - allowlist_routes, + allowlist_routes: &[ + "/status", + "/ready", + "/metrics", + "/profile/cpu", + "/profile/heap", + ], } } } @@ -1416,23 +1421,26 @@ pub fn prologue_leadership_status_check_middleware< let state = get_state(&req); let leadership_status = state.service.get_leadership_status(); - enum AllowedRoutes<'a> { + enum AllowedRoutes { All, - Some(Vec<&'a str>), + Some(&'static [&'static str]), } let allowed_routes = match leadership_status { LeadershipStatus::Leader => AllowedRoutes::All, LeadershipStatus::SteppedDown => AllowedRoutes::All, - LeadershipStatus::Candidate => { - AllowedRoutes::Some(["/ready", "/status", "/metrics"].to_vec()) - } + LeadershipStatus::Candidate => AllowedRoutes::Some(&[ + "/ready", + "/status", + "/metrics", + "/profile/cpu", + "/profile/heap", + ]), }; - let uri = req.uri().to_string(); match allowed_routes { AllowedRoutes::All => Ok(req), - AllowedRoutes::Some(allowed) if allowed.contains(&uri.as_str()) => Ok(req), + AllowedRoutes::Some(allowed) if allowed.contains(&req.uri().path()) => Ok(req), _ => { tracing::info!( "Request {} not allowed due to current leadership state", @@ -1541,7 +1549,8 @@ enum ForwardOutcome { /// Potentially forward the request to the current storage controler leader. /// More specifically we forward when: -/// 1. Request is not one of ["/control/v1/step_down", "/status", "/ready", "/metrics"] +/// 1. Request is not one of: +/// ["/control/v1/step_down", "/status", "/ready", "/metrics", "/profile/cpu", "/profile/heap"] /// 2. Current instance is in [`LeadershipStatus::SteppedDown`] state /// 3. There is a leader in the database to forward to /// 4. Leader from step (3) is not the current instance @@ -1562,10 +1571,17 @@ enum ForwardOutcome { /// Hence, if we are in the edge case scenario the leader persisted in the database is the /// stepped down instance that received the request. Condition (4) above covers this scenario. 
async fn maybe_forward(req: Request) -> ForwardOutcome { - const NOT_FOR_FORWARD: [&str; 4] = ["/control/v1/step_down", "/status", "/ready", "/metrics"]; + const NOT_FOR_FORWARD: &[&str] = &[ + "/control/v1/step_down", + "/status", + "/ready", + "/metrics", + "/profile/cpu", + "/profile/heap", + ]; - let uri = req.uri().to_string(); - let uri_for_forward = !NOT_FOR_FORWARD.contains(&uri.as_str()); + let uri = req.uri(); + let uri_for_forward = !NOT_FOR_FORWARD.contains(&uri.path()); // Fast return before trying to take any Service locks, if we will never forward anyway if !uri_for_forward { @@ -1765,7 +1781,7 @@ pub fn make_router( if auth.is_some() { router = router.middleware(auth_middleware(|request| { let state = get_state(request); - if state.allowlist_routes.contains(request.uri()) { + if state.allowlist_routes.contains(&request.uri().path()) { None } else { state.auth.as_deref() @@ -1778,13 +1794,19 @@ pub fn make_router( .get("/metrics", |r| { named_request_span(r, measured_metrics_handler, RequestName("metrics")) }) - // Non-prefixed generic endpoints (status, metrics) + // Non-prefixed generic endpoints (status, metrics, profiling) .get("/status", |r| { named_request_span(r, handle_status, RequestName("status")) }) .get("/ready", |r| { named_request_span(r, handle_ready, RequestName("ready")) }) + .get("/profile/cpu", |r| { + named_request_span(r, profile_cpu_handler, RequestName("profile_cpu")) + }) + .get("/profile/heap", |r| { + named_request_span(r, profile_heap_handler, RequestName("profile_heap")) + }) // Upcalls for the pageserver: point the pageserver's `control_plane_api` config to this prefix .post("/upcall/v1/re-attach", |r| { named_request_span(r, handle_re_attach, RequestName("upcall_v1_reattach")) From 1f9511dbd9570c90efca17e4322987db1e209014 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Wed, 19 Feb 2025 10:10:12 -0500 Subject: [PATCH 526/583] feat(pageserver): yield image creation to L0 compactions across timelines (#10877) ## Problem A simpler version of https://github.com/neondatabase/neon/pull/10812 ## Summary of changes Image layer creation will be preempted by L0 accumulated on other timelines. We stop image layer generation if there's a pending L0 compaction request. --------- Signed-off-by: Alex Chi Z --- pageserver/src/tenant/timeline.rs | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index ea966d2b43..48c208d5d7 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -22,6 +22,7 @@ use chrono::{DateTime, Utc}; use compaction::CompactionOutcome; use enumset::EnumSet; use fail::fail_point; +use futures::FutureExt; use futures::{stream::FuturesUnordered, StreamExt}; use handle::ShardTimelineId; use layer_manager::Shutdown; @@ -5128,20 +5129,26 @@ impl Timeline { // image layer generation taking too long time and blocking L0 compaction. So in this // mode, we also inspect the current number of L0 layers and skip image layer generation // if there are too many of them. 
- let num_of_l0_layers = { - let layers = self.layers.read().await; - layers.layer_map()?.level0_deltas().len() - }; let image_preempt_threshold = self.get_image_creation_preempt_threshold() * self.get_compaction_threshold(); - if image_preempt_threshold != 0 && num_of_l0_layers >= image_preempt_threshold { - tracing::info!( - "preempt image layer generation at {lsn} when processing partition {}..{}: too many L0 layers {}", - partition.start().unwrap(), partition.end().unwrap(), num_of_l0_layers - ); - last_partition_processed = Some(partition.clone()); - all_generated = false; - break; + // TODO: currently we do not respect `get_image_creation_preempt_threshold` and always yield + // when there is a single timeline with more than L0 threshold L0 layers. As long as the + // `get_image_creation_preempt_threshold` is set to a value greater than 0, we will yield for L0 compaction. + if image_preempt_threshold != 0 { + let should_yield = self + .l0_compaction_trigger + .notified() + .now_or_never() + .is_some(); + if should_yield { + tracing::info!( + "preempt image layer generation at {lsn} when processing partition {}..{}: too many L0 layers", + partition.start().unwrap(), partition.end().unwrap() + ); + last_partition_processed = Some(partition.clone()); + all_generated = false; + break; + } } } } From 9ba2a87e69880f1bad63bcf3cd433eee054919dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 19 Feb 2025 17:57:11 +0100 Subject: [PATCH 527/583] storcon: sk heartbeat fixes (#10891) This PR does the following things: * The initial heartbeat round blocks the storage controller from becoming online again. If all safekeepers are unresponsive, this can cause storage controller startup to be very slow. The original intent of #10583 was that heartbeats don't affect normal functionality of the storage controller. So add a short timeout to prevent it from impeding storcon functionality. * Fix the URL of the utilization endpoint. * Don't send heartbeats to safekeepers which are decomissioned. 
Part of https://github.com/neondatabase/neon/issues/9011 context: https://neondb.slack.com/archives/C033RQ5SPDH/p1739966807592589 --- safekeeper/client/src/mgmt_api.rs | 2 +- storage_controller/src/heartbeater.rs | 8 +++++++- storage_controller/src/safekeeper.rs | 16 ++++++++++++---- storage_controller/src/service.rs | 8 +++++--- test_runner/regress/test_storage_controller.py | 14 +++++++++++--- 5 files changed, 36 insertions(+), 12 deletions(-) diff --git a/safekeeper/client/src/mgmt_api.rs b/safekeeper/client/src/mgmt_api.rs index 40e5afc4aa..5c305769dd 100644 --- a/safekeeper/client/src/mgmt_api.rs +++ b/safekeeper/client/src/mgmt_api.rs @@ -137,7 +137,7 @@ impl Client { } pub async fn utilization(&self) -> Result { - let uri = format!("{}/v1/utilization/", self.mgmt_api_endpoint); + let uri = format!("{}/v1/utilization", self.mgmt_api_endpoint); let resp = self.get(&uri).await?; resp.json().await.map_err(Error::ReceiveBody) } diff --git a/storage_controller/src/heartbeater.rs b/storage_controller/src/heartbeater.rs index 6f110d3294..1f20326398 100644 --- a/storage_controller/src/heartbeater.rs +++ b/storage_controller/src/heartbeater.rs @@ -10,7 +10,10 @@ use std::{ }; use tokio_util::sync::CancellationToken; -use pageserver_api::{controller_api::NodeAvailability, models::PageserverUtilization}; +use pageserver_api::{ + controller_api::{NodeAvailability, SkSchedulingPolicy}, + models::PageserverUtilization, +}; use thiserror::Error; use utils::{id::NodeId, logging::SecretString}; @@ -311,6 +314,9 @@ impl HeartBeat for HeartbeaterTask Self { + let scheduling_policy = SkSchedulingPolicy::from_str(&skp.scheduling_policy).unwrap(); Self { cancel, listen_http_addr: skp.host.clone(), @@ -31,6 +33,7 @@ impl Safekeeper { id: NodeId(skp.id as u64), skp, availability: SafekeeperState::Offline, + scheduling_policy, } } pub(crate) fn base_url(&self) -> String { @@ -46,6 +49,13 @@ impl Safekeeper { pub(crate) fn set_availability(&mut self, availability: SafekeeperState) { self.availability = availability; } + pub(crate) fn scheduling_policy(&self) -> SkSchedulingPolicy { + self.scheduling_policy + } + pub(crate) fn set_scheduling_policy(&mut self, scheduling_policy: SkSchedulingPolicy) { + self.scheduling_policy = scheduling_policy; + self.skp.scheduling_policy = String::from(scheduling_policy); + } /// Perform an operation (which is given a [`SafekeeperClient`]) with retries pub(crate) async fn with_client_retries( &self, @@ -129,10 +139,8 @@ impl Safekeeper { self.id.0 ); } - self.skp = crate::persistence::SafekeeperPersistence::from_upsert( - record, - SkSchedulingPolicy::from_str(&self.skp.scheduling_policy).unwrap(), - ); + self.skp = + crate::persistence::SafekeeperPersistence::from_upsert(record, self.scheduling_policy); self.listen_http_port = http_port as u16; self.listen_http_addr = host; } diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index dd4d93dc84..f47dd72579 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -819,7 +819,9 @@ impl Service { .heartbeater_ps .heartbeat(Arc::new(nodes_to_heartbeat)) .await; - let res_sk = self.heartbeater_sk.heartbeat(all_sks).await; + // Put a small, but reasonable timeout to get the initial heartbeats of the safekeepers to avoid a storage controller downtime + const SK_TIMEOUT: Duration = Duration::from_secs(5); + let res_sk = tokio::time::timeout(SK_TIMEOUT, self.heartbeater_sk.heartbeat(all_sks)).await; let mut online_nodes = HashMap::new(); if let Ok(deltas) = res_ps { @@ 
-837,7 +839,7 @@ impl Service { } let mut online_sks = HashMap::new(); - if let Ok(deltas) = res_sk { + if let Ok(Ok(deltas)) = res_sk { for (node_id, status) in deltas.0 { match status { SafekeeperState::Available { @@ -7960,7 +7962,7 @@ impl Service { let sk = safekeepers .get_mut(&node_id) .ok_or(DatabaseError::Logical("Not found".to_string()))?; - sk.skp.scheduling_policy = String::from(scheduling_policy); + sk.set_scheduling_policy(scheduling_policy); locked.safekeepers = Arc::new(safekeepers); } diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 88d30308f7..1d95312140 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -3238,12 +3238,17 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder): newest_info = target.get_safekeeper(inserted["id"]) assert newest_info assert newest_info["scheduling_policy"] == "Pause" - target.safekeeper_scheduling_policy(inserted["id"], "Decomissioned") + target.safekeeper_scheduling_policy(inserted["id"], "Active") newest_info = target.get_safekeeper(inserted["id"]) assert newest_info - assert newest_info["scheduling_policy"] == "Decomissioned" + assert newest_info["scheduling_policy"] == "Active" # Ensure idempotency - target.safekeeper_scheduling_policy(inserted["id"], "Decomissioned") + target.safekeeper_scheduling_policy(inserted["id"], "Active") + newest_info = target.get_safekeeper(inserted["id"]) + assert newest_info + assert newest_info["scheduling_policy"] == "Active" + # change back to paused again + target.safekeeper_scheduling_policy(inserted["id"], "Pause") def storcon_heartbeat(): assert env.storage_controller.log_contains( @@ -3252,6 +3257,9 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder): wait_until(storcon_heartbeat) + # Now decomission it + target.safekeeper_scheduling_policy(inserted["id"], "Decomissioned") + def eq_safekeeper_records(a: dict[str, Any], b: dict[str, Any]) -> bool: compared = [dict(a), dict(b)] From 0b3db74c44f0309b0ae6721ae721a71358dc8bc1 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Wed, 19 Feb 2025 18:11:12 +0100 Subject: [PATCH 528/583] libs: remove unnecessary regex in `pprof::symbolize` (#10893) `pprof::symbolize()` used a regex to strip the Rust monomorphization suffix from generic methods. However, the `backtrace` crate can do this itself if formatted with the `:#` flag. Also tighten up the code a bit. --- libs/http-utils/src/pprof.rs | 37 ++++++++++++++---------------------- 1 file changed, 14 insertions(+), 23 deletions(-) diff --git a/libs/http-utils/src/pprof.rs b/libs/http-utils/src/pprof.rs index dd57f9ed4b..fe1cc10838 100644 --- a/libs/http-utils/src/pprof.rs +++ b/libs/http-utils/src/pprof.rs @@ -2,7 +2,6 @@ use anyhow::bail; use flate2::write::{GzDecoder, GzEncoder}; use flate2::Compression; use itertools::Itertools as _; -use once_cell::sync::Lazy; use pprof::protos::{Function, Line, Location, Message as _, Profile}; use regex::Regex; @@ -58,38 +57,30 @@ pub fn symbolize(mut profile: Profile) -> anyhow::Result { // Resolve the line and function for each location. backtrace::resolve(loc.address as *mut c_void, |symbol| { - let Some(symname) = symbol.name() else { + let Some(symbol_name) = symbol.name() else { return; }; - let mut name = symname.to_string(); - // Strip the Rust monomorphization suffix from the symbol name. 
- static SUFFIX_REGEX: Lazy = - Lazy::new(|| Regex::new("::h[0-9a-f]{16}$").expect("invalid regex")); - if let Some(m) = SUFFIX_REGEX.find(&name) { - name.truncate(m.start()); - } - - let function_id = match functions.get(&name) { - Some(function) => function.id, - None => { - let id = functions.len() as u64 + 1; - let system_name = String::from_utf8_lossy(symname.as_bytes()); + let function_name = format!("{symbol_name:#}"); + let functions_len = functions.len(); + let function_id = functions + .entry(function_name) + .or_insert_with_key(|function_name| { + let function_id = functions_len as u64 + 1; + let system_name = String::from_utf8_lossy(symbol_name.as_bytes()); let filename = symbol .filename() .map(|path| path.to_string_lossy()) .unwrap_or(Cow::Borrowed("")); - let function = Function { - id, - name: string_id(&name), + Function { + id: function_id, + name: string_id(function_name), system_name: string_id(&system_name), filename: string_id(&filename), ..Default::default() - }; - functions.insert(name, function); - id - } - }; + } + }) + .id; loc.line.push(Line { function_id, line: symbol.lineno().unwrap_or(0) as i64, From aad817d80678714d131973cc3c747be1b2c9c8a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?JC=20Gr=C3=BCnhage?= Date: Wed, 19 Feb 2025 18:26:09 +0100 Subject: [PATCH 529/583] refactor(ci): use reusable push-to-container-registry workflow for pinning the build-tools image (#10890) ## Problem Pinning build tools still replicated the ACR/ECR/Docker Hub login and pushing, even though we have a reusable workflow for this. Was mentioned as a TODO in https://github.com/neondatabase/neon/pull/10613. ## Summary of changes Reuse `_push-to-container-registry.yml` for pinning the build-tools images. --- .github/workflows/build_and_test.yml | 2 +- .github/workflows/pin-build-tools-image.yml | 92 ++++++++------------- 2 files changed, 36 insertions(+), 58 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index f08280e112..8f3392ceea 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -1293,7 +1293,7 @@ jobs: done pin-build-tools-image: - needs: [ build-build-tools-image, push-compute-image-prod, push-neon-image-prod, build-and-test-locally ] + needs: [ build-build-tools-image, test-images, build-and-test-locally ] if: github.ref_name == 'main' uses: ./.github/workflows/pin-build-tools-image.yml with: diff --git a/.github/workflows/pin-build-tools-image.yml b/.github/workflows/pin-build-tools-image.yml index 8861c1f093..b305b662ee 100644 --- a/.github/workflows/pin-build-tools-image.yml +++ b/.github/workflows/pin-build-tools-image.yml @@ -33,10 +33,6 @@ concurrency: # No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. 
permissions: {} -env: - FROM_TAG: ${{ inputs.from-tag }} - TO_TAG: pinned - jobs: check-manifests: runs-on: ubuntu-22.04 @@ -46,11 +42,14 @@ jobs: steps: - name: Check if we really need to pin the image id: check-manifests + env: + FROM_TAG: ${{ inputs.from-tag }} + TO_TAG: pinned run: | - docker manifest inspect neondatabase/build-tools:${FROM_TAG} > ${FROM_TAG}.json - docker manifest inspect neondatabase/build-tools:${TO_TAG} > ${TO_TAG}.json + docker manifest inspect "docker.io/neondatabase/build-tools:${FROM_TAG}" > "${FROM_TAG}.json" + docker manifest inspect "docker.io/neondatabase/build-tools:${TO_TAG}" > "${TO_TAG}.json" - if diff ${FROM_TAG}.json ${TO_TAG}.json; then + if diff "${FROM_TAG}.json" "${TO_TAG}.json"; then skip=true else skip=false @@ -64,55 +63,34 @@ jobs: # use format(..) to catch both inputs.force = true AND inputs.force = 'true' if: needs.check-manifests.outputs.skip == 'false' || format('{0}', inputs.force) == 'true' - runs-on: ubuntu-22.04 - permissions: - id-token: write # for `azure/login` and aws auth + id-token: write # Required for aws/azure login - steps: - - uses: docker/login-action@v3 - with: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - - - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@v4 - with: - aws-region: ${{ vars.AWS_ECR_REGION }} - role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - role-duration-seconds: 3600 - - - name: Login to Amazon Dev ECR - uses: aws-actions/amazon-ecr-login@v2 - - - name: Azure login - uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1 - with: - client-id: ${{ secrets.AZURE_DEV_CLIENT_ID }} - tenant-id: ${{ secrets.AZURE_TENANT_ID }} - subscription-id: ${{ secrets.AZURE_DEV_SUBSCRIPTION_ID }} - - - name: Login to ACR - run: | - az acr login --name=neoneastus2 - - - name: Tag build-tools with `${{ env.TO_TAG }}` in Docker Hub, ECR, and ACR - env: - DEFAULT_DEBIAN_VERSION: bookworm - run: | - for debian_version in bullseye bookworm; do - tags=() - - tags+=("-t" "neondatabase/build-tools:${TO_TAG}-${debian_version}") - tags+=("-t" "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/build-tools:${TO_TAG}-${debian_version}") - tags+=("-t" "neoneastus2.azurecr.io/neondatabase/build-tools:${TO_TAG}-${debian_version}") - - if [ "${debian_version}" == "${DEFAULT_DEBIAN_VERSION}" ]; then - tags+=("-t" "neondatabase/build-tools:${TO_TAG}") - tags+=("-t" "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/build-tools:${TO_TAG}") - tags+=("-t" "neoneastus2.azurecr.io/neondatabase/build-tools:${TO_TAG}") - fi - - docker buildx imagetools create "${tags[@]}" \ - neondatabase/build-tools:${FROM_TAG}-${debian_version} - done + uses: ./.github/workflows/_push-to-container-registry.yml + with: + image-map: | + { + "docker.io/neondatabase/build-tools:${{ inputs.from-tag }}-bullseye": [ + "docker.io/neondatabase/build-tools:pinned-bullseye", + "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/build-tools:pinned-bullseye", + "${{ vars.AZURE_DEV_REGISTRY_NAME }}.azurecr.io/neondatabase/build-tools:pinned-bullseye" + ], + "docker.io/neondatabase/build-tools:${{ inputs.from-tag }}-bookworm": [ + "docker.io/neondatabase/build-tools:pinned-bookworm", + "docker.io/neondatabase/build-tools:pinned", + "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/build-tools:pinned-bookworm", + "${{ 
vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/build-tools:pinned", + "${{ vars.AZURE_DEV_REGISTRY_NAME }}.azurecr.io/neondatabase/build-tools:pinned-bookworm", + "${{ vars.AZURE_DEV_REGISTRY_NAME }}.azurecr.io/neondatabase/build-tools:pinned" + ] + } + aws-region: ${{ vars.AWS_ECR_REGION }} + aws-account-ids: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}" + azure-client-id: ${{ vars.AZURE_DEV_CLIENT_ID }} + azure-subscription-id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }} + azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} + acr-registry-name: ${{ vars.AZURE_DEV_REGISTRY_NAME }} + secrets: + aws-role-to-assume: "${{ vars.DEV_AWS_OIDC_ROLE_ARN }}" + docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} From f148d71d9bf344230159a941cb20a6b804acec9e Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 19 Feb 2025 19:30:17 +0000 Subject: [PATCH 530/583] test: disable background heatmap uploads and downloads in cold migration test (#10895) ## Problem Background heatmap uploads and downloads were blocking the ones done manually by the test. ## Summary of changes Disable Background heatmap uploads and downloads for the cold migration test. The test does them explicitly. --- test_runner/regress/test_pageserver_secondary.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index aa375604f4..602d493ae6 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -903,6 +903,9 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder): remote_storage_kind=RemoteStorageKind.MOCK_S3, ) + tenant_conf = TENANT_CONF.copy() + tenant_conf["heatmap_period"] = "0s" + env = neon_env_builder.init_configs() env.start() @@ -910,7 +913,7 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder): tenant_id = TenantId.generate() timeline_id = TimelineId.generate() - env.create_tenant(tenant_id, timeline_id, conf=TENANT_CONF, placement_policy='{"Attached":1}') + env.create_tenant(tenant_id, timeline_id, conf=tenant_conf, placement_policy='{"Attached":1}') env.storage_controller.reconcile_until_idle() From 787b98f8f2d67b1322a260e50a0afa3af9ed5ac9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 19 Feb 2025 21:45:22 +0100 Subject: [PATCH 531/583] storcon: log all safekeepers marked as offline (#10898) Doing this to help debugging offline safekeepers. 
Part of https://github.com/neondatabase/neon/issues/9011 --- storage_controller/src/heartbeater.rs | 8 +++++++- storage_controller/src/safekeeper.rs | 2 +- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/storage_controller/src/heartbeater.rs b/storage_controller/src/heartbeater.rs index 1f20326398..57e9fd0f75 100644 --- a/storage_controller/src/heartbeater.rs +++ b/storage_controller/src/heartbeater.rs @@ -346,7 +346,13 @@ impl HeartBeat for HeartbeaterTask SafekeeperState::Offline, + Err(e) => { + tracing::info!( + "Marking safekeeper {} at as offline: {e}", + sk.base_url() + ); + SafekeeperState::Offline + } }; Some((*node_id, status)) diff --git a/storage_controller/src/safekeeper.rs b/storage_controller/src/safekeeper.rs index b85b4de1e8..53cd8a908b 100644 --- a/storage_controller/src/safekeeper.rs +++ b/storage_controller/src/safekeeper.rs @@ -112,7 +112,7 @@ impl Safekeeper { warn_threshold, max_retries, &format!( - "Call to node {} ({}:{}) management API", + "Call to safekeeper {} ({}:{}) management API", self.id, self.listen_http_addr, self.listen_http_port ), cancel, From bb7e244a429742283ceff9b53f0ffab98a8d5ba3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 20 Feb 2025 00:04:05 +0100 Subject: [PATCH 532/583] storcon: fix heartbeats timing out causing a panic (#10902) Fix an issue caused by PR https://github.com/neondatabase/neon/pull/10891: we introduced the concept of timeouts for heartbeats, where we would hang up on the other side of the oneshot channel if a timeout happened (future gets cancelled, receiver is dropped). This hang-up would make the heartbeat task panic when it did obtain the response, as we unwrap the result of the result sending operation. The panic would lead to the heartbeat task panicking itself, which, according to the logs, is the last sign of life we see from that process invocation. I'm not sure what brings down the process; in theory tokio [should continue](https://docs.rs/tokio/latest/tokio/runtime/enum.UnhandledPanic.html#variant.Ignore). Alternative to #10901.
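The failure mode is easy to reproduce in isolation: once the receiving half of a `tokio::sync::oneshot` channel is dropped (standing in here for the heartbeat future that got cancelled by the timeout), `send` returns an `Err`, so unwrapping that result panics. A minimal sketch of the bug and of the fix applied in the diff below:

```rust
use tokio::sync::oneshot;

#[tokio::main]
async fn main() {
    // The requesting side would normally await the reply, but a timeout drops it early.
    let (tx, rx) = oneshot::channel::<&'static str>();
    drop(rx); // simulates `tokio::time::timeout(..)` giving up on the heartbeat reply

    // Before the fix: the heartbeat task unwrapped the send result and panicked here.
    // tx.send("heartbeat result").unwrap(); // would panic: the receiver was dropped

    // After the fix: `is_closed()` lets the task skip requests whose reply channel is
    // already gone, and the send result is ignored instead of unwrapped.
    assert!(tx.is_closed());
    let _ = tx.send("heartbeat result"); // returns Err, which we deliberately ignore

    println!("send to a dropped receiver is ignored instead of panicking");
}
```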
--- storage_controller/src/heartbeater.rs | 7 ++++++- storage_controller/src/service.rs | 19 +++++++++++-------- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/storage_controller/src/heartbeater.rs b/storage_controller/src/heartbeater.rs index 57e9fd0f75..52b6110667 100644 --- a/storage_controller/src/heartbeater.rs +++ b/storage_controller/src/heartbeater.rs @@ -140,8 +140,13 @@ where request = self.receiver.recv() => { match request { Some(req) => { + if req.reply.is_closed() { + // Prevent a possibly infinite buildup of the receiver channel, if requests arrive faster than we can handle them + continue; + } let res = self.heartbeat(req.servers).await; - req.reply.send(res).unwrap(); + // Ignore the return value in order to not panic if the heartbeat function's future was cancelled + _ = req.reply.send(res); }, None => { return; } } diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index f47dd72579..fc6d2f3d29 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -815,13 +815,12 @@ impl Service { }; tracing::info!("Sending initial heartbeats..."); - let res_ps = self - .heartbeater_ps - .heartbeat(Arc::new(nodes_to_heartbeat)) - .await; // Put a small, but reasonable timeout to get the initial heartbeats of the safekeepers to avoid a storage controller downtime const SK_TIMEOUT: Duration = Duration::from_secs(5); - let res_sk = tokio::time::timeout(SK_TIMEOUT, self.heartbeater_sk.heartbeat(all_sks)).await; + let (res_ps, res_sk) = tokio::join!( + self.heartbeater_ps.heartbeat(Arc::new(nodes_to_heartbeat)), + tokio::time::timeout(SK_TIMEOUT, self.heartbeater_sk.heartbeat(all_sks)) + ); let mut online_nodes = HashMap::new(); if let Ok(deltas) = res_ps { @@ -1064,8 +1063,12 @@ impl Service { locked.safekeepers.clone() }; - let res_ps = self.heartbeater_ps.heartbeat(nodes).await; - let res_sk = self.heartbeater_sk.heartbeat(safekeepers).await; + const SK_TIMEOUT: Duration = Duration::from_secs(3); + let (res_ps, res_sk) = tokio::join!( + self.heartbeater_ps.heartbeat(nodes), + tokio::time::timeout(SK_TIMEOUT, self.heartbeater_sk.heartbeat(safekeepers)) + ); + if let Ok(deltas) = res_ps { let mut to_handle = Vec::default(); @@ -1167,7 +1170,7 @@ impl Service { } } } - if let Ok(deltas) = res_sk { + if let Ok(Ok(deltas)) = res_sk { let mut locked = self.inner.write().unwrap(); let mut safekeepers = (*locked.safekeepers).clone(); for (id, state) in deltas.0 { From a6d8640d6f5c73f491648a2ab8373563c0d88bf6 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 20 Feb 2025 08:38:55 +0200 Subject: [PATCH 533/583] Persist pg_stat information in pageserver (#6560) ## Problem Statistic is saved in local file and so lost on compute restart. 
Persist in in page server using the same AUX file mechanism used for replication slots See more about motivation in https://neondb.slack.com/archives/C04DGM6SMTM/p1703077676522789 ## Summary of changes Persist postal file using AUX mechanism Postgres PRs: https://github.com/neondatabase/postgres/pull/547 https://github.com/neondatabase/postgres/pull/446 https://github.com/neondatabase/postgres/pull/445 Related to #6684 and #6228 Co-authored-by: Konstantin Knizhnik --- libs/postgres_ffi/src/lib.rs | 2 +- libs/postgres_ffi/src/xlog_utils.rs | 42 +++++++++- pageserver/ctl/src/key.rs | 1 + pageserver/src/aux_file.rs | 4 + pageserver/src/basebackup.rs | 58 ++++++++----- pageserver/src/pgdatadir_mapping.rs | 9 +- .../src/tenant/storage_layer/delta_layer.rs | 8 +- .../src/tenant/storage_layer/image_layer.rs | 8 +- pageserver/src/walingest.rs | 44 ++++++++++ test_runner/regress/test_broken_timeline.py | 4 +- test_runner/regress/test_pgstat.py | 83 +++++++++++++++++++ test_runner/regress/test_timeline_archive.py | 2 + vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/postgres-v17 | 2 +- vendor/revisions.json | 6 +- 16 files changed, 238 insertions(+), 39 deletions(-) create mode 100644 test_runner/regress/test_pgstat.py diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index 0239b56d9c..301bc2f16e 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -278,7 +278,7 @@ pub fn generate_pg_control( checkpoint_bytes: &[u8], lsn: Lsn, pg_version: u32, -) -> anyhow::Result<(Bytes, u64)> { +) -> anyhow::Result<(Bytes, u64, bool)> { dispatch_pgversion!( pg_version, pgv::xlog_utils::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn), diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 852b20eace..14fb1f2a1f 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -124,23 +124,59 @@ pub fn normalize_lsn(lsn: Lsn, seg_sz: usize) -> Lsn { } } +/// Generate a pg_control file, for a basebackup for starting up Postgres at the given LSN +/// +/// 'pg_control_bytes' and 'checkpoint_bytes' are the contents of those keys persisted in +/// the pageserver. They use the same format as the PostgreSQL control file and the +/// checkpoint record, but see walingest.rs for how exactly they are kept up to date. +/// 'lsn' is the LSN at which we're starting up. +/// +/// Returns: +/// - pg_control file contents +/// - system_identifier, extracted from the persisted information +/// - true, if we're starting up from a "clean shutdown", i.e. if there was a shutdown +/// checkpoint at the given LSN pub fn generate_pg_control( pg_control_bytes: &[u8], checkpoint_bytes: &[u8], lsn: Lsn, -) -> anyhow::Result<(Bytes, u64)> { +) -> anyhow::Result<(Bytes, u64, bool)> { let mut pg_control = ControlFileData::decode(pg_control_bytes)?; let mut checkpoint = CheckPoint::decode(checkpoint_bytes)?; // Generate new pg_control needed for bootstrap + // + // NB: In the checkpoint struct that we persist in the pageserver, we have a different + // convention for the 'redo' field than in PostgreSQL: On a shutdown checkpoint, + // 'redo' points the *end* of the checkpoint WAL record. On PostgreSQL, it points to + // the beginning. Furthermore, on an online checkpoint, 'redo' is set to 0. + // + // We didn't always have this convention however, and old persisted records will have + // old REDO values that point to some old LSN. 
+ // + // The upshot is that if 'redo' is equal to the "current" LSN, there was a shutdown + // checkpoint record at that point in WAL, with no new WAL records after it. That case + // can be treated as starting from a clean shutdown. All other cases are treated as + // non-clean shutdown. In Neon, we don't do WAL replay at startup in either case, so + // that distinction doesn't matter very much. As of this writing, it only affects + // whether the persisted pg_stats information can be used or not. + // + // In the Checkpoint struct in the returned pg_control file, the redo pointer is + // always set to the LSN we're starting at, to hint that no WAL replay is required. + // (There's some neon-specific code in Postgres startup to make that work, though. + // Just setting the redo pointer is not sufficient.) + let was_shutdown = Lsn(checkpoint.redo) == lsn; checkpoint.redo = normalize_lsn(lsn, WAL_SEGMENT_SIZE).0; - //save new values in pg_control + // We use DBState_DB_SHUTDOWNED even if it was not a clean shutdown. The + // neon-specific code at postgres startup ignores the state stored in the control + // file, similar to archive recovery in standalone PostgreSQL. Similarly, the + // checkPoint pointer is ignored, so just set it to 0. pg_control.checkPoint = 0; pg_control.checkPointCopy = checkpoint; pg_control.state = DBState_DB_SHUTDOWNED; - Ok((pg_control.encode(), pg_control.system_identifier)) + Ok((pg_control.encode(), pg_control.system_identifier, was_shutdown)) } pub fn get_current_timestamp() -> TimestampTz { diff --git a/pageserver/ctl/src/key.rs b/pageserver/ctl/src/key.rs index af4b5a21ab..c7f0719c41 100644 --- a/pageserver/ctl/src/key.rs +++ b/pageserver/ctl/src/key.rs @@ -345,6 +345,7 @@ impl AuxFileV2 { AuxFileV2::Recognized("pg_logical/replorigin_checkpoint", hash) } (2, 1) => AuxFileV2::Recognized("pg_replslot/", hash), + (3, 1) => AuxFileV2::Recognized("pg_stat/pgstat.stat", hash), (1, 0xff) => AuxFileV2::OtherWithPrefix("pg_logical/", hash), (0xff, 0xff) => AuxFileV2::Other(hash), _ => return None, diff --git a/pageserver/src/aux_file.rs b/pageserver/src/aux_file.rs index 5e527b7d61..5cc20a70b2 100644 --- a/pageserver/src/aux_file.rs +++ b/pageserver/src/aux_file.rs @@ -39,6 +39,7 @@ fn aux_hash_to_metadata_key(dir_level1: u8, dir_level2: u8, data: &[u8]) -> Key const AUX_DIR_PG_LOGICAL: u8 = 0x01; const AUX_DIR_PG_REPLSLOT: u8 = 0x02; +const AUX_DIR_PG_STAT: u8 = 0x03; const AUX_DIR_PG_UNKNOWN: u8 = 0xFF; /// Encode the aux file into a fixed-size key. @@ -53,6 +54,7 @@ const AUX_DIR_PG_UNKNOWN: u8 = 0xFF; /// * pg_logical/replorigin_checkpoint -> 0x0103 /// * pg_logical/others -> 0x01FF /// * pg_replslot/ -> 0x0201 +/// * pg_stat/pgstat.stat -> 0x0301 /// * others -> 0xFFFF /// /// If you add new AUX files to this function, please also add a test case to `test_encoding_portable`. 
@@ -75,6 +77,8 @@ pub fn encode_aux_file_key(path: &str) -> Key { aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0xFF, fname.as_bytes()) } else if let Some(fname) = path.strip_prefix("pg_replslot/") { aux_hash_to_metadata_key(AUX_DIR_PG_REPLSLOT, 0x01, fname.as_bytes()) + } else if let Some(fname) = path.strip_prefix("pg_stat/") { + aux_hash_to_metadata_key(AUX_DIR_PG_STAT, 0x01, fname.as_bytes()) } else { if cfg!(debug_assertions) { warn!( diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index e03b1bbe96..99b0775316 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -264,6 +264,31 @@ where async fn send_tarball(mut self) -> Result<(), BasebackupError> { // TODO include checksum + // Construct the pg_control file from the persisted checkpoint and pg_control + // information. But we only add this to the tarball at the end, so that if the + // writing is interrupted half-way through, the resulting incomplete tarball will + // be missing the pg_control file, which prevents PostgreSQL from starting up on + // it. With proper error handling, you should never try to start up from an + // incomplete basebackup in the first place, of course, but this is a nice little + // extra safety measure. + let checkpoint_bytes = self + .timeline + .get_checkpoint(self.lsn, self.ctx) + .await + .context("failed to get checkpoint bytes")?; + let pg_control_bytes = self + .timeline + .get_control_file(self.lsn, self.ctx) + .await + .context("failed to get control bytes")?; + let (pg_control_bytes, system_identifier, was_shutdown) = + postgres_ffi::generate_pg_control( + &pg_control_bytes, + &checkpoint_bytes, + self.lsn, + self.timeline.pg_version, + )?; + let lazy_slru_download = self.timeline.get_lazy_slru_download() && !self.full_backup; let pgversion = self.timeline.pg_version; @@ -401,6 +426,10 @@ where // In future we will not generate AUX record for "pg_logical/replorigin_checkpoint" at all, // but now we should handle (skip) it for backward compatibility. continue; + } else if path == "pg_stat/pgstat.stat" && !was_shutdown { + // Drop statistic in case of abnormal termination, i.e. if we're not starting from the exact LSN + // of a shutdown checkpoint. + continue; } let header = new_tar_header(&path, content.len() as u64)?; self.ar @@ -462,8 +491,9 @@ where ))) }); - // Generate pg_control and bootstrap WAL segment. - self.add_pgcontrol_file().await?; + // Last, add the pg_control file and bootstrap WAL segment. + self.add_pgcontrol_file(pg_control_bytes, system_identifier) + .await?; self.ar .finish() .await @@ -671,7 +701,11 @@ where // Add generated pg_control file and bootstrap WAL segment. // Also send zenith.signal file with extra bootstrap data. 
// - async fn add_pgcontrol_file(&mut self) -> Result<(), BasebackupError> { + async fn add_pgcontrol_file( + &mut self, + pg_control_bytes: Bytes, + system_identifier: u64, + ) -> Result<(), BasebackupError> { // add zenith.signal file let mut zenith_signal = String::new(); if self.prev_record_lsn == Lsn(0) { @@ -694,24 +728,6 @@ where .await .map_err(|e| BasebackupError::Client(e, "add_pgcontrol_file,zenith.signal"))?; - let checkpoint_bytes = self - .timeline - .get_checkpoint(self.lsn, self.ctx) - .await - .context("failed to get checkpoint bytes")?; - let pg_control_bytes = self - .timeline - .get_control_file(self.lsn, self.ctx) - .await - .context("failed get control bytes")?; - - let (pg_control_bytes, system_identifier) = postgres_ffi::generate_pg_control( - &pg_control_bytes, - &checkpoint_bytes, - self.lsn, - self.timeline.pg_version, - )?; - //send pg_control let header = new_tar_header("global/pg_control", pg_control_bytes.len() as u64)?; self.ar diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index ae2762bd1e..d0e2dab042 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -45,7 +45,7 @@ use std::ops::ControlFlow; use std::ops::Range; use strum::IntoEnumIterator; use tokio_util::sync::CancellationToken; -use tracing::{debug, trace, warn}; +use tracing::{debug, info, trace, warn}; use utils::bin_ser::DeserializeError; use utils::pausable_failpoint; use utils::{bin_ser::BeSer, lsn::Lsn}; @@ -2264,6 +2264,13 @@ impl DatadirModification<'_> { self.tline.aux_file_size_estimator.on_add(content.len()); new_files.push((path, content)); } + // Compute may request delete of old version of pgstat AUX file if new one exceeds size limit. + // Compute doesn't know if previous version of this file exists or not, so + // attempt to delete non-existing file can cause this message. + // To avoid false alarms, log it as info rather than warning. + (None, true) if path.starts_with("pg_stat/") => { + info!("removing non-existing pg_stat file: {}", path) + } (None, true) => warn!("removing non-existing aux file: {}", path), } let new_val = aux_file::encode_file_value(&new_files)?; diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 885c50425f..7ba0e3679f 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -51,8 +51,7 @@ use camino::{Utf8Path, Utf8PathBuf}; use futures::StreamExt; use itertools::Itertools; use pageserver_api::config::MaxVectoredReadBytes; -use pageserver_api::key::DBDIR_KEY; -use pageserver_api::key::{Key, KEY_SIZE}; +use pageserver_api::key::{Key, DBDIR_KEY, KEY_SIZE}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::ImageCompressionAlgorithm; use pageserver_api::shard::TenantShardId; @@ -967,7 +966,10 @@ impl DeltaLayerInner { .as_slice() .iter() .filter_map(|(_, blob_meta)| { - if blob_meta.key.is_rel_dir_key() || blob_meta.key == DBDIR_KEY { + if blob_meta.key.is_rel_dir_key() + || blob_meta.key == DBDIR_KEY + || blob_meta.key.is_aux_file_key() + { // The size of values for these keys is unbounded and can // grow very large in pathological cases. 
None diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index c49281dc45..dc611bd6e1 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -48,8 +48,7 @@ use camino::{Utf8Path, Utf8PathBuf}; use hex; use itertools::Itertools; use pageserver_api::config::MaxVectoredReadBytes; -use pageserver_api::key::DBDIR_KEY; -use pageserver_api::key::{Key, KEY_SIZE}; +use pageserver_api::key::{Key, DBDIR_KEY, KEY_SIZE}; use pageserver_api::keyspace::KeySpace; use pageserver_api::shard::{ShardIdentity, TenantShardId}; use pageserver_api::value::Value; @@ -603,7 +602,10 @@ impl ImageLayerInner { .as_slice() .iter() .filter_map(|(_, blob_meta)| { - if blob_meta.key.is_rel_dir_key() || blob_meta.key == DBDIR_KEY { + if blob_meta.key.is_rel_dir_key() + || blob_meta.key == DBDIR_KEY + || blob_meta.key.is_aux_file_key() + { // The size of values for these keys is unbounded and can // grow very large in pathological cases. None diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 04edb3e3f4..45c87353a7 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -1180,6 +1180,50 @@ impl WalIngest { } else { cp.oldestActiveXid = xlog_checkpoint.oldestActiveXid; } + // NB: We abuse the Checkpoint.redo field: + // + // - In PostgreSQL, the Checkpoint struct doesn't store the information + // of whether this is an online checkpoint or a shutdown checkpoint. It's + // stored in the XLOG info field of the WAL record, shutdown checkpoints + // use record type XLOG_CHECKPOINT_SHUTDOWN and online checkpoints use + // XLOG_CHECKPOINT_ONLINE. We don't store the original WAL record headers + // in the pageserver, however. + // + // - In PostgreSQL, the Checkpoint.redo field stores the *start* of the + // checkpoint record, if it's a shutdown checkpoint. But when we are + // starting from a shutdown checkpoint, the basebackup LSN is the *end* + // of the shutdown checkpoint WAL record. That makes it difficult to + // correctly detect whether we're starting from a shutdown record or + // not. + // + // To address both of those issues, we store 0 in the redo field if it's + // an online checkpoint record, and the record's *end* LSN if it's a + // shutdown checkpoint. We don't need the original redo pointer in neon, + // because we don't perform WAL replay at startup anyway, so we can get + // away with abusing the redo field like this. + // + // XXX: Ideally, we would persist the extra information in a more + // explicit format, rather than repurpose the fields of the Postgres + // struct like this. However, we already have persisted data like this, + // so we need to maintain backwards compatibility. + // + // NB: We didn't originally have this convention, so there are still old + // persisted records that didn't do this. Before, we didn't update the + // persisted redo field at all. That means that old records have a bogus + // redo pointer that points to some old value, from the checkpoint record + // that was originally imported from the data directory. If it was a + // project created in Neon, that means it points to the first checkpoint + // after initdb. That's OK for our purposes: all such old checkpoints are + // treated as old online checkpoints when the basebackup is created. + cp.redo = if info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN { + // Store the *end* LSN of the checkpoint record. 
Or to be precise, + // the start LSN of the *next* record, i.e. if the record ends + // exactly at page boundary, the redo LSN points to just after the + // page header on the next page. + lsn.into() + } else { + Lsn::INVALID.into() + }; // Write a new checkpoint key-value pair on every checkpoint record, even // if nothing really changed. Not strictly required, but it seems nice to diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index 124e62999a..d49686b57c 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -29,6 +29,8 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder): ".*failed to load metadata.*", ".*load failed.*load local timeline.*", ".*: layer load failed, assuming permanent failure:.*", + ".*failed to get checkpoint bytes.*", + ".*failed to get control bytes.*", ] ) @@ -75,7 +77,7 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder): # (We don't check layer file contents on startup, when loading the timeline) # # This will change when we implement checksums for layers - with pytest.raises(Exception, match="get_values_reconstruct_data for layer ") as err: + with pytest.raises(Exception, match="failed to get checkpoint bytes") as err: pg1.start() log.info( f"As expected, compute startup failed for timeline {tenant1}/{timeline1} with corrupt layers: {err}" diff --git a/test_runner/regress/test_pgstat.py b/test_runner/regress/test_pgstat.py new file mode 100644 index 0000000000..c31e5ef7f8 --- /dev/null +++ b/test_runner/regress/test_pgstat.py @@ -0,0 +1,83 @@ +import pytest +from fixtures.neon_fixtures import NeonEnv +from fixtures.pg_version import PgVersion + + +# +# Test that pgstat statistic is preserved across sessions +# +def test_pgstat(neon_simple_env: NeonEnv): + env = neon_simple_env + if env.pg_version == PgVersion.V14: + pytest.skip("PG14 doesn't support pgstat statistic persistence") + + n = 10000 + endpoint = env.endpoints.create_start( + "main", config_lines=["neon_pgstat_file_size_limit=100kB", "autovacuum=off"] + ) + + con = endpoint.connect() + cur = con.cursor() + + cur.execute("create table t(x integer)") + cur.execute(f"insert into t values (generate_series(1,{n}))") + cur.execute("vacuum analyze t") + cur.execute("select sum(x) from t") + cur.execute("update t set x=x+1") + + cur.execute("select pg_stat_force_next_flush()") + + cur.execute( + "select seq_scan,seq_tup_read,n_tup_ins,n_tup_upd,n_live_tup,n_dead_tup, vacuum_count,analyze_count from pg_stat_user_tables" + ) + rec = cur.fetchall()[0] + assert rec == (2, n * 2, n, n, n * 2, n, 1, 1) + + endpoint.stop() + endpoint.start() + + con = endpoint.connect() + cur = con.cursor() + + cur.execute( + "select seq_scan,seq_tup_read,n_tup_ins,n_tup_upd,n_live_tup,n_dead_tup, vacuum_count,analyze_count from pg_stat_user_tables" + ) + rec = cur.fetchall()[0] + assert rec == (2, n * 2, n, n, n * 2, n, 1, 1) + + cur.execute("update t set x=x+1") + + # stop without checkpoint + endpoint.stop(mode="immediate") + endpoint.start() + + con = endpoint.connect() + cur = con.cursor() + + cur.execute( + "select seq_scan,seq_tup_read,n_tup_ins,n_tup_upd,n_live_tup,n_dead_tup, vacuum_count,analyze_count from pg_stat_user_tables" + ) + rec = cur.fetchall()[0] + # pgstat information should be discarded in case of abnormal termination + assert rec == (0, 0, 0, 0, 0, 0, 0, 0) + + cur.execute("select sum(x) from t") + + # create more relations to increase size of statistics + for i in range(1, 1000): + 
cur.execute(f"create table t{i}(pk integer primary key)") + + cur.execute("select pg_stat_force_next_flush()") + + endpoint.stop() + endpoint.start() + + con = endpoint.connect() + cur = con.cursor() + + cur.execute( + "select seq_scan,seq_tup_read,n_tup_ins,n_tup_upd,n_live_tup,n_dead_tup, vacuum_count,analyze_count from pg_stat_user_tables" + ) + rec = cur.fetchall()[0] + # pgstat information is not restored because its size exeeds 100k threshold + assert rec == (0, 0, 0, 0, 0, 0, 0, 0) diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index 2706ddf2f0..c17840d31c 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -823,6 +823,8 @@ def test_timeline_retain_lsn( [ ".*initial size calculation failed: PageRead.MissingKey.could not find data for key.*", ".*page_service_conn_main.*could not find data for key.*", + ".*failed to get checkpoint bytes.*", + ".*failed to get control bytes.*", ] ) if offload_child is None or "no-restart" not in offload_child: diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 81e2eef061..023f1020ec 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 81e2eef0616c65c2233c75b06f25766ae4c080c4 +Subproject commit 023f1020ecb07af3bb0ddbf4622e1a3c3fa276a4 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 9422247c58..6cb8d22079 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 9422247c582e7c1a08a4855d04af0874f8df2f34 +Subproject commit 6cb8d22079570b50fcaff29124d40807c1e63a82 diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index a8fea8b4be..59b2fe851f 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit a8fea8b4be43039f0782347c88a9b9b25f50c9d8 +Subproject commit 59b2fe851f8e0595f6c830b90ee766f4f1c17a0f diff --git a/vendor/revisions.json b/vendor/revisions.json index 72d97d7f6a..3379cf1ba8 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,15 +1,15 @@ { "v17": [ "17.4", - "a8fea8b4be43039f0782347c88a9b9b25f50c9d8" + "59b2fe851f8e0595f6c830b90ee766f4f1c17a0f" ], "v16": [ "16.8", - "9422247c582e7c1a08a4855d04af0874f8df2f34" + "6cb8d22079570b50fcaff29124d40807c1e63a82" ], "v15": [ "15.12", - "81e2eef0616c65c2233c75b06f25766ae4c080c4" + "023f1020ecb07af3bb0ddbf4622e1a3c3fa276a4" ], "v14": [ "14.17", From 1d9346f8b746a5f4e6b5b9d1099bab9ebf6581d1 Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Thu, 20 Feb 2025 11:05:01 +0100 Subject: [PATCH 534/583] Add pg_repack test (#10638) ## Problem We don't test `pg_repack` ## Summary of changes The test for `pg_repack` is added --- compute/compute-node.Dockerfile | 5 +- compute/patches/pg_repack.patch | 72 +++++++++++++++++++ docker-compose/docker_compose_test.sh | 9 +-- .../ext-src/pg_repack-src/test-upgrade.sh | 5 ++ docker-compose/test_extensions_upgrade.sh | 3 +- 5 files changed, 84 insertions(+), 10 deletions(-) create mode 100644 compute/patches/pg_repack.patch create mode 100755 docker-compose/ext-src/pg_repack-src/test-upgrade.sh diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 0b3001613d..19633064a6 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1844,7 +1844,10 @@ COPY --from=pg_semver-src /ext-src/ /ext-src/ COPY --from=pg_ivm-src /ext-src/ /ext-src/ COPY --from=pg_partman-src /ext-src/ /ext-src/ #COPY --from=pg_mooncake-src /ext-src/ 
/ext-src/ -#COPY --from=pg_repack-src /ext-src/ /ext-src/ +COPY --from=pg_repack-src /ext-src/ /ext-src/ +COPY --from=pg_repack-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY compute/patches/pg_repack.patch /ext-src +RUN cd /ext-src/pg_repack-src && patch -p1 OK + \! pg_repack --dbname=contrib_regression --table=tbl_cluster --no-superuser-check + INFO: repacking table "public.tbl_cluster" + -- => ERROR +-\! pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper ++\! PGPASSWORD=NoSuPeRpAsSwOrD pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper + ERROR: pg_repack failed with error: You must be a superuser to use pg_repack + -- => ERROR +-\! pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check ++\! PGPASSWORD=NoSuPeRpAsSwOrD pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check + ERROR: pg_repack failed with error: ERROR: permission denied for schema repack + LINE 1: select repack.version(), repack.version_sql() + ^ + GRANT ALL ON ALL TABLES IN SCHEMA repack TO nosuper; + GRANT USAGE ON SCHEMA repack TO nosuper; + -- => ERROR +-\! pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check ++\! PGPASSWORD=NoSuPeRpAsSwOrD pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check + INFO: repacking table "public.tbl_cluster" + ERROR: query failed: ERROR: current transaction is aborted, commands ignored until end of transaction block + DETAIL: query was: RESET lock_timeout +diff --git a/regress/sql/nosuper.sql b/regress/sql/nosuper.sql +index 072f0fa..dbe60f8 100644 +--- a/regress/sql/nosuper.sql ++++ b/regress/sql/nosuper.sql +@@ -4,19 +4,19 @@ + SET client_min_messages = error; + DROP ROLE IF EXISTS nosuper; + SET client_min_messages = warning; +-CREATE ROLE nosuper WITH LOGIN; ++CREATE ROLE nosuper WITH LOGIN PASSWORD 'NoSuPeRpAsSwOrD'; + -- => OK + \! pg_repack --dbname=contrib_regression --table=tbl_cluster --no-superuser-check + -- => ERROR +-\! pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper ++\! PGPASSWORD=NoSuPeRpAsSwOrD pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper + -- => ERROR +-\! pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check ++\! PGPASSWORD=NoSuPeRpAsSwOrD pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check + + GRANT ALL ON ALL TABLES IN SCHEMA repack TO nosuper; + GRANT USAGE ON SCHEMA repack TO nosuper; + + -- => ERROR +-\! pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check ++\! 
PGPASSWORD=NoSuPeRpAsSwOrD pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check + + REVOKE ALL ON ALL TABLES IN SCHEMA repack FROM nosuper; + REVOKE USAGE ON SCHEMA repack FROM nosuper; diff --git a/docker-compose/docker_compose_test.sh b/docker-compose/docker_compose_test.sh index dd520d4986..5b3cfc74eb 100755 --- a/docker-compose/docker_compose_test.sh +++ b/docker-compose/docker_compose_test.sh @@ -81,15 +81,8 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do [ $EXT_SUCCESS -eq 0 ] && FAILED=$(tail -1 testout.txt | awk '{for(i=1;i<=NF;i++){print "/ext-src/"$i;}}') [ $CONTRIB_SUCCESS -eq 0 ] && CONTRIB_FAILED=$(tail -1 testout_contrib.txt | awk '{for(i=0;i<=NF;i++){print "/postgres/contrib/"$i;}}') for d in $FAILED $CONTRIB_FAILED; do - dn="$(basename $d)" - rm -rf $dn - mkdir $dn - docker cp $TEST_CONTAINER_NAME:$d/regression.diffs $dn || [ $? -eq 1 ] - docker cp $TEST_CONTAINER_NAME:$d/regression.out $dn || [ $? -eq 1 ] - cat $dn/regression.out $dn/regression.diffs || true - rm -rf $dn + docker exec $TEST_CONTAINER_NAME bash -c 'for file in $(find '"$d"' -name regression.diffs -o -name regression.out); do cat $file; done' || [ $? -eq 1 ] done - rm -rf $FAILED exit 1 fi fi diff --git a/docker-compose/ext-src/pg_repack-src/test-upgrade.sh b/docker-compose/ext-src/pg_repack-src/test-upgrade.sh new file mode 100755 index 0000000000..5021eb4027 --- /dev/null +++ b/docker-compose/ext-src/pg_repack-src/test-upgrade.sh @@ -0,0 +1,5 @@ +#!/bin/sh +set -ex +cd "$(dirname ${0})" +PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress +${PG_REGRESS} --use-existing --inputdir=./regress --bindir='/usr/local/pgsql/bin' --dbname=contrib_regression repack-setup repack-run error-on-invalid-idx no-error-on-invalid-idx after-schema repack-check nosuper get_order_by trigger diff --git a/docker-compose/test_extensions_upgrade.sh b/docker-compose/test_extensions_upgrade.sh index 4a9024569b..06d351b496 100755 --- a/docker-compose/test_extensions_upgrade.sh +++ b/docker-compose/test_extensions_upgrade.sh @@ -43,7 +43,8 @@ EXTENSIONS='[ {"extname": "semver", "extdir": "pg_semver-src"}, {"extname": "pg_ivm", "extdir": "pg_ivm-src"}, {"extname": "pgjwt", "extdir": "pgjwt-src"}, -{"extname": "pgtap", "extdir": "pgtap-src"} +{"extname": "pgtap", "extdir": "pgtap-src"}, +{"extname": "pg_repack", "extdir": "pg_repack-src"} ]' EXTNAMES=$(echo ${EXTENSIONS} | jq -r '.[].extname' | paste -sd ' ' -) TAG=${NEWTAG} docker compose --profile test-extensions up --quiet-pull --build -d From f7edcf12e320f5854d93cc21c5852bd2bf0433ce Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 20 Feb 2025 13:08:30 +0100 Subject: [PATCH 535/583] pageserver: downgrade ephemeral layer roll wait message (#10883) We already log a message for this during the L0 flush, so the additional message is mostly noise. 
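Editor's note (not part of the patch): a minimal, self-contained sketch of why downgrading this line quiets it by default — with the usual `info`-level filter, `debug!` events are dropped at the subscriber while the existing flush-path `info!` log stays visible. It assumes the `tracing` and `tracing-subscriber` crates as dependencies, and the message strings are illustrative stand-ins rather than the pageserver's actual log text.

```rust
use tracing::{debug, info, Level};

fn main() {
    // Install a subscriber with the common default verbosity: INFO and above.
    // Anything emitted at DEBUG is filtered out before it reaches the output.
    tracing_subscriber::fmt().with_max_level(Level::INFO).init();

    let l0_count = 42;

    // After this patch the backpressure wait is logged at DEBUG, so under the
    // default filter it no longer appears in the logs.
    debug!("layer roll waiting for flush due to compaction backpressure at {l0_count} L0 layers");

    // The L0 flush path still reports the situation once at INFO (stand-in text),
    // which is why the extra per-wait message was considered noise.
    info!("flushing in-memory layer due to L0 backpressure ({l0_count} L0 layers)");
}
```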
--- pageserver/src/tenant/timeline.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 48c208d5d7..bc6131b378 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -6614,7 +6614,7 @@ impl TimelineWriter<'_> { if let Some(wait_threshold) = wait_threshold { if l0_count >= wait_threshold { - info!("layer roll waiting for flush due to compaction backpressure at {l0_count} L0 layers"); + debug!("layer roll waiting for flush due to compaction backpressure at {l0_count} L0 layers"); self.tl.wait_flush_completion(flush_id).await?; } } From 07bee600374ccd486c69370d0972d9035964fe68 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 20 Feb 2025 13:08:54 +0100 Subject: [PATCH 536/583] pageserver: make compaction walredo errors critical (#10884) Mark walredo errors as critical too. Also pull the pattern matching out into the outer `match`. Follows #10872. --- pageserver/src/tenant/timeline.rs | 6 --- pageserver/src/tenant/timeline/compaction.rs | 42 ++++++++++---------- 2 files changed, 20 insertions(+), 28 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index bc6131b378..b9425d2777 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -5344,12 +5344,6 @@ impl From for CompactionError { } } -impl CompactionError { - pub fn is_cancelled(&self) -> bool { - matches!(self, CompactionError::ShuttingDown) - } -} - impl From for CompactionError { fn from(err: CollectKeySpaceError) -> Self { match err { diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 4e4f906d78..58a87dbd5f 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -26,7 +26,7 @@ use pageserver_api::models::CompactInfoResponse; use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId}; use serde::Serialize; use tokio_util::sync::CancellationToken; -use tracing::{debug, info, info_span, trace, warn, Instrument}; +use tracing::{debug, error, info, info_span, trace, warn, Instrument}; use utils::critical; use utils::id::TimelineId; @@ -775,27 +775,25 @@ impl Timeline { return Ok(CompactionOutcome::YieldForL0); } } - Err(err) => { - // no partitioning? This is normal, if the timeline was just created - // as an empty timeline. Also in unit tests, when we use the timeline - // as a simple key-value store, ignoring the datadir layout. Log the - // error but continue. - // - // Suppress error when it's due to cancellation - if !self.cancel.is_cancelled() && !err.is_cancelled() { - if let CompactionError::CollectKeySpaceError( - CollectKeySpaceError::Decode(_) - | CollectKeySpaceError::PageRead(PageReconstructError::MissingKey(_)), - ) = err - { - critical!("could not compact, repartitioning keyspace failed: {err:?}"); - } else { - tracing::error!( - "could not compact, repartitioning keyspace failed: {err:?}" - ); - } - } - } + + // Suppress errors when cancelled. + Err(_) if self.cancel.is_cancelled() => {} + Err(CompactionError::ShuttingDown) => {} + + // Alert on critical errors that indicate data corruption. 
+ Err( + err @ CompactionError::CollectKeySpaceError( + CollectKeySpaceError::Decode(_) + | CollectKeySpaceError::PageRead( + PageReconstructError::MissingKey(_) | PageReconstructError::WalRedo(_), + ), + ), + ) => critical!("could not compact, repartitioning keyspace failed: {err:?}"), + + // Log other errors. No partitioning? This is normal, if the timeline was just created + // as an empty timeline. Also in unit tests, when we use the timeline as a simple + // key-value store, ignoring the datadir layout. Log the error but continue. + Err(err) => error!("could not compact, repartitioning keyspace failed: {err:?}"), }; let partition_count = self.partitioning.read().0 .0.parts.len(); From 7c7180a79dbda2764d883392a73950acf114b63f Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 20 Feb 2025 17:14:16 +0000 Subject: [PATCH 537/583] Fix deadlock in drop_subscriptions_before_start (#10806) ALTER SUBSCRIPTION requires AccessExclusive lock which conflicts with iteration over pg_subscription when multiple databases are present and operations are applied concurrently. Fix by explicitly locking pg_subscription in the beginning of the transaction in each database. ## Problem https://github.com/neondatabase/cloud/issues/24292 --- compute_tools/src/sql/drop_subscriptions.sql | 1 + control_plane/src/endpoint.rs | 46 ++++- libs/compute_api/src/spec.rs | 8 +- .../regress/test_subscriber_branching.py | 173 +++++++++++++++++- 4 files changed, 220 insertions(+), 8 deletions(-) diff --git a/compute_tools/src/sql/drop_subscriptions.sql b/compute_tools/src/sql/drop_subscriptions.sql index dfb925e48e..03e8e158fa 100644 --- a/compute_tools/src/sql/drop_subscriptions.sql +++ b/compute_tools/src/sql/drop_subscriptions.sql @@ -2,6 +2,7 @@ DO $$ DECLARE subname TEXT; BEGIN + LOCK TABLE pg_subscription IN ACCESS EXCLUSIVE MODE; FOR subname IN SELECT pg_subscription.subname FROM pg_subscription WHERE subdbid = (SELECT oid FROM pg_database WHERE datname = {datname_str}) LOOP EXECUTE format('ALTER SUBSCRIPTION %I DISABLE;', subname); EXECUTE format('ALTER SUBSCRIPTION %I SET (slot_name = NONE);', subname); diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index c3c8229c38..c16b3cb017 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -59,6 +59,7 @@ use nix::sys::signal::Signal; use pageserver_api::shard::ShardStripeSize; use reqwest::header::CONTENT_TYPE; use serde::{Deserialize, Serialize}; +use tracing::debug; use url::Host; use utils::id::{NodeId, TenantId, TimelineId}; @@ -81,8 +82,10 @@ pub struct EndpointConf { internal_http_port: u16, pg_version: u32, skip_pg_catalog_updates: bool, + reconfigure_concurrency: usize, drop_subscriptions_before_start: bool, features: Vec, + cluster: Option, } // @@ -179,7 +182,9 @@ impl ComputeControlPlane { // we also skip catalog updates in the cloud. 
skip_pg_catalog_updates, drop_subscriptions_before_start, + reconfigure_concurrency: 1, features: vec![], + cluster: None, }); ep.create_endpoint_dir()?; @@ -196,7 +201,9 @@ impl ComputeControlPlane { pg_version, skip_pg_catalog_updates, drop_subscriptions_before_start, + reconfigure_concurrency: 1, features: vec![], + cluster: None, })?, )?; std::fs::write( @@ -261,8 +268,11 @@ pub struct Endpoint { skip_pg_catalog_updates: bool, drop_subscriptions_before_start: bool, + reconfigure_concurrency: usize, // Feature flags features: Vec, + // Cluster settings + cluster: Option, } #[derive(PartialEq, Eq)] @@ -302,6 +312,8 @@ impl Endpoint { let conf: EndpointConf = serde_json::from_slice(&std::fs::read(entry.path().join("endpoint.json"))?)?; + debug!("serialized endpoint conf: {:?}", conf); + Ok(Endpoint { pg_address: SocketAddr::new(IpAddr::from(Ipv4Addr::LOCALHOST), conf.pg_port), external_http_address: SocketAddr::new( @@ -319,8 +331,10 @@ impl Endpoint { tenant_id: conf.tenant_id, pg_version: conf.pg_version, skip_pg_catalog_updates: conf.skip_pg_catalog_updates, + reconfigure_concurrency: conf.reconfigure_concurrency, drop_subscriptions_before_start: conf.drop_subscriptions_before_start, features: conf.features, + cluster: conf.cluster, }) } @@ -607,7 +621,7 @@ impl Endpoint { }; // Create spec file - let spec = ComputeSpec { + let mut spec = ComputeSpec { skip_pg_catalog_updates: self.skip_pg_catalog_updates, format_version: 1.0, operation_uuid: None, @@ -640,7 +654,7 @@ impl Endpoint { Vec::new() }, settings: None, - postgresql_conf: Some(postgresql_conf), + postgresql_conf: Some(postgresql_conf.clone()), }, delta_operations: None, tenant_id: Some(self.tenant_id), @@ -653,9 +667,35 @@ impl Endpoint { pgbouncer_settings: None, shard_stripe_size: Some(shard_stripe_size), local_proxy_config: None, - reconfigure_concurrency: 1, + reconfigure_concurrency: self.reconfigure_concurrency, drop_subscriptions_before_start: self.drop_subscriptions_before_start, }; + + // this strange code is needed to support respec() in tests + if self.cluster.is_some() { + debug!("Cluster is already set in the endpoint spec, using it"); + spec.cluster = self.cluster.clone().unwrap(); + + debug!("spec.cluster {:?}", spec.cluster); + + // fill missing fields again + if create_test_user { + spec.cluster.roles.push(Role { + name: PgIdent::from_str("test").unwrap(), + encrypted_password: None, + options: None, + }); + spec.cluster.databases.push(Database { + name: PgIdent::from_str("neondb").unwrap(), + owner: PgIdent::from_str("test").unwrap(), + options: None, + restrict_conn: false, + invalid: false, + }); + } + spec.cluster.postgresql_conf = Some(postgresql_conf); + } + let spec_path = self.endpoint_path().join("spec.json"); std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?; diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 767a34bcbc..8fffae92fb 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -252,7 +252,7 @@ pub enum ComputeMode { Replica, } -#[derive(Clone, Debug, Default, Deserialize, Serialize)] +#[derive(Clone, Debug, Default, Deserialize, Serialize, PartialEq, Eq)] pub struct Cluster { pub cluster_id: Option, pub name: Option, @@ -283,7 +283,7 @@ pub struct DeltaOp { /// Rust representation of Postgres role info with only those fields /// that matter for us. 
-#[derive(Clone, Debug, Deserialize, Serialize)] +#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)] pub struct Role { pub name: PgIdent, pub encrypted_password: Option, @@ -292,7 +292,7 @@ pub struct Role { /// Rust representation of Postgres database info with only those fields /// that matter for us. -#[derive(Clone, Debug, Deserialize, Serialize)] +#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)] pub struct Database { pub name: PgIdent, pub owner: PgIdent, @@ -308,7 +308,7 @@ pub struct Database { /// Common type representing both SQL statement params with or without value, /// like `LOGIN` or `OWNER username` in the `CREATE/ALTER ROLE`, and config /// options like `wal_level = logical`. -#[derive(Clone, Debug, Deserialize, Serialize)] +#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)] pub struct GenericOption { pub name: String, pub value: Option, diff --git a/test_runner/regress/test_subscriber_branching.py b/test_runner/regress/test_subscriber_branching.py index 849d4f024d..6175643389 100644 --- a/test_runner/regress/test_subscriber_branching.py +++ b/test_runner/regress/test_subscriber_branching.py @@ -1,9 +1,10 @@ from __future__ import annotations +import threading import time from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv +from fixtures.neon_fixtures import NeonEnv, logical_replication_sync from fixtures.utils import query_scalar, wait_until @@ -239,3 +240,173 @@ def test_subscriber_branching(neon_simple_env: NeonEnv): res = scur_postgres.fetchall() assert len(res) == 1 assert str(sub_child_2_timeline_id) == res[0][0] + + +def test_multiple_subscription_branching(neon_simple_env: NeonEnv): + """ + Test that compute_ctl can handle concurrent deletion of subscriptions in a multiple databases + """ + env = neon_simple_env + + NUMBER_OF_DBS = 5 + + # Create and start endpoint so that neon_local put all the generated + # stuff into the spec.json file. + endpoint = env.endpoints.create_start( + "main", + config_lines=[ + "max_replication_slots = 10", + "max_logical_replication_workers=10", + "max_worker_processes=10", + ], + ) + + TEST_DB_NAMES = [ + { + "name": "neondb", + "owner": "cloud_admin", + }, + { + "name": "publisher_db", + "owner": "cloud_admin", + }, + ] + + for i in range(NUMBER_OF_DBS): + TEST_DB_NAMES.append( + { + "name": f"db{i}", + "owner": "cloud_admin", + } + ) + + # Update the spec.json file to create the databases + # and reconfigure the endpoint to apply the changes. 
+ endpoint.respec_deep( + **{ + "skip_pg_catalog_updates": False, + "cluster": { + "databases": TEST_DB_NAMES, + }, + } + ) + endpoint.reconfigure() + + connstr = endpoint.connstr(dbname="publisher_db").replace("'", "''") + + # create table, replication and subscription for each of the databases + with endpoint.cursor(dbname="publisher_db") as publisher_cursor: + for i in range(NUMBER_OF_DBS): + publisher_cursor.execute(f"CREATE TABLE t{i}(a int)") + publisher_cursor.execute(f"CREATE PUBLICATION mypub{i} FOR TABLE t{i}") + publisher_cursor.execute( + f"select pg_catalog.pg_create_logical_replication_slot('mysub{i}', 'pgoutput');" + ) + publisher_cursor.execute(f"INSERT INTO t{i} VALUES ({i})") + + with endpoint.cursor(dbname=f"db{i}") as cursor: + cursor.execute(f"CREATE TABLE t{i}(a int)") + cursor.execute( + f"CREATE SUBSCRIPTION mysub{i} CONNECTION '{connstr}' PUBLICATION mypub{i} WITH (create_slot = false) " + ) + + # wait for the subscription to be active + for i in range(NUMBER_OF_DBS): + logical_replication_sync( + endpoint, + endpoint, + f"mysub{i}", + sub_dbname=f"db{i}", + pub_dbname="publisher_db", + ) + + # Check that replication is working + for i in range(NUMBER_OF_DBS): + with endpoint.cursor(dbname=f"db{i}") as cursor: + cursor.execute(f"SELECT * FROM t{i}") + rows = cursor.fetchall() + assert len(rows) == 1 + assert rows[0][0] == i + + last_insert_lsn = query_scalar(cursor, "select pg_current_wal_insert_lsn();") + + def start_publisher_workload(table_num: int, duration: int): + start = time.time() + with endpoint.cursor(dbname="publisher_db") as cur: + while time.time() - start < duration: + cur.execute(f"INSERT INTO t{i} SELECT FROM generate_series(1,1000)") + + LOAD_DURATION = 5 + threads = [ + threading.Thread(target=start_publisher_workload, args=(i, LOAD_DURATION)) + for i in range(NUMBER_OF_DBS) + ] + + for thread in threads: + thread.start() + + sub_child_1_timeline_id = env.create_branch( + "subscriber_child_1", + ancestor_branch_name="main", + ancestor_start_lsn=last_insert_lsn, + ) + + sub_child_1 = env.endpoints.create("subscriber_child_1") + + sub_child_1.respec( + skip_pg_catalog_updates=False, + reconfigure_concurrency=5, + drop_subscriptions_before_start=True, + cluster={ + "databases": TEST_DB_NAMES, + "roles": [], + }, + ) + + sub_child_1.start() + + # ensure that subscription deletion happened on this timeline + with sub_child_1.cursor() as scur_postgres: + scur_postgres.execute("SELECT timeline_id from neon.drop_subscriptions_done") + res = scur_postgres.fetchall() + log.info(f"res = {res}") + assert len(res) == 1 + assert str(sub_child_1_timeline_id) == res[0][0] + + # ensure that there are no subscriptions in the databases + for i in range(NUMBER_OF_DBS): + with sub_child_1.cursor(dbname=f"db{i}") as cursor: + cursor.execute("SELECT * FROM pg_catalog.pg_subscription") + res = cursor.fetchall() + assert len(res) == 0 + + # ensure that there are no unexpected rows in the tables + cursor.execute(f"SELECT * FROM t{i}") + rows = cursor.fetchall() + assert len(rows) == 1 + assert rows[0][0] == i + + for thread in threads: + thread.join() + + # ensure that logical replication is still working in main endpoint + # wait for it to catch up + for i in range(NUMBER_OF_DBS): + logical_replication_sync( + endpoint, + endpoint, + f"mysub{i}", + sub_dbname=f"db{i}", + pub_dbname="publisher_db", + ) + + # verify that the data is the same in publisher and subscriber tables + with endpoint.cursor(dbname="publisher_db") as publisher_cursor: + for i in 
range(NUMBER_OF_DBS): + with endpoint.cursor(dbname=f"db{i}") as cursor: + publisher_cursor.execute(f"SELECT count(*) FROM t{i}") + cursor.execute(f"SELECT count(*) FROM t{i}") + pub_res = publisher_cursor.fetchone() + sub_res = cursor.fetchone() + log.info(f"for table t{i}: pub_res = {pub_res}, sub_res = {sub_res}") + assert pub_res == sub_res From e808e9432af8ec6809cf97de577ff4e2a466fd02 Mon Sep 17 00:00:00 2001 From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com> Date: Thu, 20 Feb 2025 21:16:04 +0400 Subject: [PATCH 538/583] storcon: use https for pageservers (#10759) ## Problem Storage controller uses unsecure http for pageserver API. Closes: https://github.com/neondatabase/cloud/issues/23734 Closes: https://github.com/neondatabase/cloud/issues/24091 ## Summary of changes - Add an optional `listen_https_port` field to storage controller's Node state and its API (RegisterNode/ListNodes/etc). - Allow updating `listen_https_port` on node registration to gradually add https port for all nodes. - Add `use_https_pageserver_api` CLI option to storage controller to enable https. - Pageserver doesn't support https for now and always reports `https_port=None`. This will be addressed in follow-up PR. --- control_plane/storcon_cli/src/main.rs | 5 ++ libs/pageserver_api/src/controller_api.rs | 3 + pageserver/src/controller_upcall_client.rs | 1 + .../down.sql | 1 + .../up.sql | 1 + storage_controller/src/main.rs | 5 ++ storage_controller/src/node.rs | 60 +++++++++++-- storage_controller/src/persistence.rs | 40 ++++++++- storage_controller/src/scheduler.rs | 5 +- storage_controller/src/schema.rs | 1 + storage_controller/src/service.rs | 85 +++++++++++++++---- test_runner/fixtures/neon_fixtures.py | 2 + .../regress/test_storage_controller.py | 53 ++++++++++++ 13 files changed, 231 insertions(+), 31 deletions(-) create mode 100644 storage_controller/migrations/2025-02-11-144848_pageserver_use_https/down.sql create mode 100644 storage_controller/migrations/2025-02-11-144848_pageserver_use_https/up.sql diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 3c574efc63..953ade83ad 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -47,6 +47,9 @@ enum Command { listen_http_addr: String, #[arg(long)] listen_http_port: u16, + #[arg(long)] + listen_https_port: Option, + #[arg(long)] availability_zone_id: String, }, @@ -394,6 +397,7 @@ async fn main() -> anyhow::Result<()> { listen_pg_port, listen_http_addr, listen_http_port, + listen_https_port, availability_zone_id, } => { storcon_client @@ -406,6 +410,7 @@ async fn main() -> anyhow::Result<()> { listen_pg_port, listen_http_addr, listen_http_port, + listen_https_port, availability_zone_id: AvailabilityZone(availability_zone_id), }), ) diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index 42f6e47e63..f94bfab581 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -57,6 +57,7 @@ pub struct NodeRegisterRequest { pub listen_http_addr: String, pub listen_http_port: u16, + pub listen_https_port: Option, pub availability_zone_id: AvailabilityZone, } @@ -105,6 +106,7 @@ pub struct TenantLocateResponseShard { pub listen_http_addr: String, pub listen_http_port: u16, + pub listen_https_port: Option, } #[derive(Serialize, Deserialize)] @@ -148,6 +150,7 @@ pub struct NodeDescribeResponse { pub listen_http_addr: String, pub listen_http_port: u16, + pub 
listen_https_port: Option, pub listen_pg_addr: String, pub listen_pg_port: u16, diff --git a/pageserver/src/controller_upcall_client.rs b/pageserver/src/controller_upcall_client.rs index d41bfd9021..4990f17b40 100644 --- a/pageserver/src/controller_upcall_client.rs +++ b/pageserver/src/controller_upcall_client.rs @@ -173,6 +173,7 @@ impl ControlPlaneGenerationsApi for ControllerUpcallClient { listen_pg_port: m.postgres_port, listen_http_addr: m.http_host, listen_http_port: m.http_port, + listen_https_port: None, // TODO: Support https. availability_zone_id: az_id.expect("Checked above"), }) } diff --git a/storage_controller/migrations/2025-02-11-144848_pageserver_use_https/down.sql b/storage_controller/migrations/2025-02-11-144848_pageserver_use_https/down.sql new file mode 100644 index 0000000000..0f051d3ac3 --- /dev/null +++ b/storage_controller/migrations/2025-02-11-144848_pageserver_use_https/down.sql @@ -0,0 +1 @@ +ALTER TABLE nodes DROP listen_https_port; diff --git a/storage_controller/migrations/2025-02-11-144848_pageserver_use_https/up.sql b/storage_controller/migrations/2025-02-11-144848_pageserver_use_https/up.sql new file mode 100644 index 0000000000..172237d477 --- /dev/null +++ b/storage_controller/migrations/2025-02-11-144848_pageserver_use_https/up.sql @@ -0,0 +1 @@ +ALTER TABLE nodes ADD listen_https_port INTEGER; diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 9a9958f7a6..be074d269d 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -126,6 +126,10 @@ struct Cli { #[arg(long)] long_reconcile_threshold: Option, + + // Flag to use https for requests to pageserver API. + #[arg(long, default_value = "false")] + use_https_pageserver_api: bool, } enum StrictMode { @@ -321,6 +325,7 @@ async fn async_main() -> anyhow::Result<()> { address_for_peers: args.address_for_peers, start_as_candidate: args.start_as_candidate, http_service_port: args.listen.port() as i32, + use_https_pageserver_api: args.use_https_pageserver_api, }; // Validate that we can connect to the database diff --git a/storage_controller/src/node.rs b/storage_controller/src/node.rs index f5c2d329e0..3762d13c10 100644 --- a/storage_controller/src/node.rs +++ b/storage_controller/src/node.rs @@ -1,5 +1,6 @@ use std::{str::FromStr, time::Duration}; +use anyhow::anyhow; use pageserver_api::{ controller_api::{ AvailabilityZone, NodeAvailability, NodeDescribeResponse, NodeRegisterRequest, @@ -32,12 +33,16 @@ pub(crate) struct Node { listen_http_addr: String, listen_http_port: u16, + listen_https_port: Option, listen_pg_addr: String, listen_pg_port: u16, availability_zone_id: AvailabilityZone, + // Flag from storcon's config to use https for pageserver admin API. + // Invariant: if |true|, listen_https_port should contain a value. + use_https: bool, // This cancellation token means "stop any RPCs in flight to this node, and don't start // any more". It is not related to process shutdown. 
#[serde(skip)] @@ -56,7 +61,16 @@ pub(crate) enum AvailabilityTransition { impl Node { pub(crate) fn base_url(&self) -> String { - format!("http://{}:{}", self.listen_http_addr, self.listen_http_port) + if self.use_https { + format!( + "https://{}:{}", + self.listen_http_addr, + self.listen_https_port + .expect("https port should be specified if use_https is on") + ) + } else { + format!("http://{}:{}", self.listen_http_addr, self.listen_http_port) + } } pub(crate) fn get_id(&self) -> NodeId { @@ -82,11 +96,20 @@ impl Node { self.id == register_req.node_id && self.listen_http_addr == register_req.listen_http_addr && self.listen_http_port == register_req.listen_http_port + // Note: listen_https_port may change. See [`Self::need_update`] for mode details. + // && self.listen_https_port == register_req.listen_https_port && self.listen_pg_addr == register_req.listen_pg_addr && self.listen_pg_port == register_req.listen_pg_port && self.availability_zone_id == register_req.availability_zone_id } + // Do we need to update an existing record in DB on this registration request? + pub(crate) fn need_update(&self, register_req: &NodeRegisterRequest) -> bool { + // listen_https_port is checked here because it may change during migration to https. + // After migration, this check may be moved to registration_match. + self.listen_https_port != register_req.listen_https_port + } + /// For a shard located on this node, populate a response object /// with this node's address information. pub(crate) fn shard_location(&self, shard_id: TenantShardId) -> TenantLocateResponseShard { @@ -95,6 +118,7 @@ impl Node { node_id: self.id, listen_http_addr: self.listen_http_addr.clone(), listen_http_port: self.listen_http_port, + listen_https_port: self.listen_https_port, listen_pg_addr: self.listen_pg_addr.clone(), listen_pg_port: self.listen_pg_port, } @@ -175,25 +199,34 @@ impl Node { } } + #[allow(clippy::too_many_arguments)] pub(crate) fn new( id: NodeId, listen_http_addr: String, listen_http_port: u16, + listen_https_port: Option, listen_pg_addr: String, listen_pg_port: u16, availability_zone_id: AvailabilityZone, - ) -> Self { - Self { + use_https: bool, + ) -> anyhow::Result { + if use_https && listen_https_port.is_none() { + return Err(anyhow!("https is enabled, but node has no https port")); + } + + Ok(Self { id, listen_http_addr, listen_http_port, + listen_https_port, listen_pg_addr, listen_pg_port, scheduling: NodeSchedulingPolicy::Active, availability: NodeAvailability::Offline, availability_zone_id, + use_https, cancel: CancellationToken::new(), - } + }) } pub(crate) fn to_persistent(&self) -> NodePersistence { @@ -202,14 +235,19 @@ impl Node { scheduling_policy: self.scheduling.into(), listen_http_addr: self.listen_http_addr.clone(), listen_http_port: self.listen_http_port as i32, + listen_https_port: self.listen_https_port.map(|x| x as i32), listen_pg_addr: self.listen_pg_addr.clone(), listen_pg_port: self.listen_pg_port as i32, availability_zone_id: self.availability_zone_id.0.clone(), } } - pub(crate) fn from_persistent(np: NodePersistence) -> Self { - Self { + pub(crate) fn from_persistent(np: NodePersistence, use_https: bool) -> anyhow::Result { + if use_https && np.listen_https_port.is_none() { + return Err(anyhow!("https is enabled, but node has no https port")); + } + + Ok(Self { id: NodeId(np.node_id as u64), // At startup we consider a node offline until proven otherwise. 
availability: NodeAvailability::Offline, @@ -217,11 +255,13 @@ impl Node { .expect("Bad scheduling policy in DB"), listen_http_addr: np.listen_http_addr, listen_http_port: np.listen_http_port as u16, + listen_https_port: np.listen_https_port.map(|x| x as u16), listen_pg_addr: np.listen_pg_addr, listen_pg_port: np.listen_pg_port as u16, availability_zone_id: AvailabilityZone(np.availability_zone_id), + use_https, cancel: CancellationToken::new(), - } + }) } /// Wrapper for issuing requests to pageserver management API: takes care of generic @@ -285,8 +325,9 @@ impl Node { warn_threshold, max_retries, &format!( - "Call to node {} ({}:{}) management API", - self.id, self.listen_http_addr, self.listen_http_port + "Call to node {} ({}) management API", + self.id, + self.base_url(), ), cancel, ) @@ -302,6 +343,7 @@ impl Node { availability_zone_id: self.availability_zone_id.0.clone(), listen_http_addr: self.listen_http_addr.clone(), listen_http_port: self.listen_http_port, + listen_https_port: self.listen_https_port, listen_pg_addr: self.listen_pg_addr.clone(), listen_pg_port: self.listen_pg_port, } diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 67b60eadf3..459c11add9 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -375,18 +375,23 @@ impl Persistence { Ok(nodes) } - pub(crate) async fn update_node( + pub(crate) async fn update_node( &self, input_node_id: NodeId, - input_scheduling: NodeSchedulingPolicy, - ) -> DatabaseResult<()> { + values: V, + ) -> DatabaseResult<()> + where + V: diesel::AsChangeset + Clone + Send + Sync, + V::Changeset: diesel::query_builder::QueryFragment + Send, // valid Postgres SQL + { use crate::schema::nodes::dsl::*; let updated = self .with_measured_conn(DatabaseOperation::UpdateNode, move |conn| { + let values = values.clone(); Box::pin(async move { let updated = diesel::update(nodes) .filter(node_id.eq(input_node_id.0 as i64)) - .set((scheduling_policy.eq(String::from(input_scheduling)),)) + .set(values) .execute(conn) .await?; Ok(updated) @@ -403,6 +408,32 @@ impl Persistence { } } + pub(crate) async fn update_node_scheduling_policy( + &self, + input_node_id: NodeId, + input_scheduling: NodeSchedulingPolicy, + ) -> DatabaseResult<()> { + use crate::schema::nodes::dsl::*; + self.update_node( + input_node_id, + scheduling_policy.eq(String::from(input_scheduling)), + ) + .await + } + + pub(crate) async fn update_node_on_registration( + &self, + input_node_id: NodeId, + input_https_port: Option, + ) -> DatabaseResult<()> { + use crate::schema::nodes::dsl::*; + self.update_node( + input_node_id, + listen_https_port.eq(input_https_port.map(|x| x as i32)), + ) + .await + } + /// At startup, load the high level state for shards, such as their config + policy. This will /// be enriched at runtime with state discovered on pageservers. /// @@ -1452,6 +1483,7 @@ pub(crate) struct NodePersistence { pub(crate) listen_pg_addr: String, pub(crate) listen_pg_port: i32, pub(crate) availability_zone_id: String, + pub(crate) listen_https_port: Option, } /// Tenant metadata health status that are stored durably. 
diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs index 106a7b2699..44936d018a 100644 --- a/storage_controller/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -930,13 +930,16 @@ pub(crate) mod test_utils { NodeId(i), format!("httphost-{i}"), 80 + i as u16, + None, format!("pghost-{i}"), 5432 + i as u16, az_iter .next() .cloned() .unwrap_or(AvailabilityZone("test-az".to_string())), - ); + false, + ) + .unwrap(); node.set_availability(NodeAvailability::Active(test_utilization::simple(0, 0))); assert!(node.is_available()); node diff --git a/storage_controller/src/schema.rs b/storage_controller/src/schema.rs index 14c30c296d..361253bd19 100644 --- a/storage_controller/src/schema.rs +++ b/storage_controller/src/schema.rs @@ -26,6 +26,7 @@ diesel::table! { listen_pg_addr -> Varchar, listen_pg_port -> Int4, availability_zone_id -> Varchar, + listen_https_port -> Nullable, } } diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index fc6d2f3d29..25a1cb4252 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -399,6 +399,8 @@ pub struct Config { pub http_service_port: i32, pub long_reconcile_threshold: Duration, + + pub use_https_pageserver_api: bool, } impl From for ApiError { @@ -1401,8 +1403,8 @@ impl Service { .list_nodes() .await? .into_iter() - .map(Node::from_persistent) - .collect::>(); + .map(|x| Node::from_persistent(x, config.use_https_pageserver_api)) + .collect::>>()?; let nodes: HashMap = nodes.into_iter().map(|n| (n.get_id(), n)).collect(); tracing::info!("Loaded {} nodes from database.", nodes.len()); metrics::METRICS_REGISTRY @@ -1501,10 +1503,13 @@ impl Service { NodeId(node_id as u64), "".to_string(), 123, + None, "".to_string(), 123, AvailabilityZone("test_az".to_string()), - ); + false, + ) + .unwrap(); scheduler.node_upsert(&node); } @@ -5907,8 +5912,10 @@ impl Service { ) .await; + #[derive(PartialEq)] enum RegistrationStatus { - Matched, + UpToDate, + NeedUpdate, Mismatched, New, } @@ -5917,7 +5924,11 @@ impl Service { let locked = self.inner.read().unwrap(); if let Some(node) = locked.nodes.get(®ister_req.node_id) { if node.registration_match(®ister_req) { - RegistrationStatus::Matched + if node.need_update(®ister_req) { + RegistrationStatus::NeedUpdate + } else { + RegistrationStatus::UpToDate + } } else { RegistrationStatus::Mismatched } @@ -5927,9 +5938,9 @@ impl Service { }; match registration_status { - RegistrationStatus::Matched => { + RegistrationStatus::UpToDate => { tracing::info!( - "Node {} re-registered with matching address", + "Node {} re-registered with matching address and is up to date", register_req.node_id ); @@ -5947,7 +5958,7 @@ impl Service { "Node is already registered with different address".to_string(), )); } - RegistrationStatus::New => { + RegistrationStatus::New | RegistrationStatus::NeedUpdate => { // fallthrough } } @@ -5976,6 +5987,16 @@ impl Service { )); } + if self.config.use_https_pageserver_api && register_req.listen_https_port.is_none() { + return Err(ApiError::PreconditionFailed( + format!( + "Node {} has no https port, but use_https is enabled", + register_req.node_id + ) + .into(), + )); + } + // Ordering: we must persist the new node _before_ adding it to in-memory state. // This ensures that before we use it for anything or expose it via any external // API, it is guaranteed to be available after a restart. 
@@ -5983,13 +6004,29 @@ impl Service { register_req.node_id, register_req.listen_http_addr, register_req.listen_http_port, + register_req.listen_https_port, register_req.listen_pg_addr, register_req.listen_pg_port, register_req.availability_zone_id.clone(), + self.config.use_https_pageserver_api, ); + let new_node = match new_node { + Ok(new_node) => new_node, + Err(error) => return Err(ApiError::InternalServerError(error)), + }; - // TODO: idempotency if the node already exists in the database - self.persistence.insert_node(&new_node).await?; + match registration_status { + RegistrationStatus::New => self.persistence.insert_node(&new_node).await?, + RegistrationStatus::NeedUpdate => { + self.persistence + .update_node_on_registration( + register_req.node_id, + register_req.listen_https_port, + ) + .await? + } + _ => unreachable!("Other statuses have been processed earlier"), + } let mut locked = self.inner.write().unwrap(); let mut new_nodes = (*locked.nodes).clone(); @@ -6004,12 +6041,24 @@ impl Service { .storage_controller_pageserver_nodes .set(locked.nodes.len() as i64); - tracing::info!( - "Registered pageserver {} ({}), now have {} pageservers", - register_req.node_id, - register_req.availability_zone_id, - locked.nodes.len() - ); + match registration_status { + RegistrationStatus::New => { + tracing::info!( + "Registered pageserver {} ({}), now have {} pageservers", + register_req.node_id, + register_req.availability_zone_id, + locked.nodes.len() + ); + } + RegistrationStatus::NeedUpdate => { + tracing::info!( + "Re-registered and updated node {} ({})", + register_req.node_id, + register_req.availability_zone_id, + ); + } + _ => unreachable!("Other statuses have been processed earlier"), + } Ok(()) } @@ -6027,7 +6076,9 @@ impl Service { if let Some(scheduling) = scheduling { // Scheduling is a persistent part of Node: we must write updates to the database before // applying them in memory - self.persistence.update_node(node_id, scheduling).await?; + self.persistence + .update_node_scheduling_policy(node_id, scheduling) + .await?; } // If we're activating a node, then before setting it active we must reconcile any shard locations diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 58c5dbfd29..36af522535 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1630,6 +1630,7 @@ def neon_env_builder( class PageserverPort: pg: int http: int + https: int | None = None class LogUtils: @@ -1886,6 +1887,7 @@ class NeonStorageController(MetricsGetter, LogUtils): "node_id": int(node.id), "listen_http_addr": "localhost", "listen_http_port": node.service_port.http, + "listen_https_port": node.service_port.https, "listen_pg_addr": "localhost", "listen_pg_port": node.service_port.pg, "availability_zone_id": node.az_id, diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 1d95312140..7e895422d2 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -3764,3 +3764,56 @@ def test_storage_controller_node_flap_detach_race( assert len(locs) == 1, f"{shard} has {len(locs)} attached locations" wait_until(validate_locations, timeout=10) + + +def test_update_node_on_registration(neon_env_builder: NeonEnvBuilder): + """ + Check that storage controller handles node_register requests with updated fields correctly. + 1. Run storage controller and register 1 pageserver without https port. + 2. 
Register the same pageserver with https port. Check that port has been updated. + 3. Restart the storage controller. Check that https port is persistent. + 4. Register the same pageserver without https port again (rollback). Check that port has been removed. + """ + neon_env_builder.num_pageservers = 1 + env = neon_env_builder.init_configs() + + env.storage_controller.start() + env.storage_controller.wait_until_ready() + + pageserver = env.pageservers[0] + + # Step 1. Register pageserver without https port. + env.storage_controller.node_register(pageserver) + env.storage_controller.consistency_check() + + nodes = env.storage_controller.node_list() + assert len(nodes) == 1 + assert nodes[0]["listen_https_port"] is None + + # Step 2. Register pageserver with https port. + pageserver.service_port.https = 1234 + env.storage_controller.node_register(pageserver) + env.storage_controller.consistency_check() + + nodes = env.storage_controller.node_list() + assert len(nodes) == 1 + assert nodes[0]["listen_https_port"] == 1234 + + # Step 3. Restart storage controller. + env.storage_controller.stop() + env.storage_controller.start() + env.storage_controller.wait_until_ready() + env.storage_controller.consistency_check() + + nodes = env.storage_controller.node_list() + assert len(nodes) == 1 + assert nodes[0]["listen_https_port"] == 1234 + + # Step 4. Register pageserver with no https port again. + pageserver.service_port.https = None + env.storage_controller.node_register(pageserver) + env.storage_controller.consistency_check() + + nodes = env.storage_controller.node_list() + assert len(nodes) == 1 + assert nodes[0]["listen_https_port"] is None From f7474d3f4142d1a05dda0719b19037358f717bae Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Thu, 20 Feb 2025 11:31:42 -0600 Subject: [PATCH 539/583] Remove forward compatibility hacks related to compute HTTP servers (#10797) These hacks were added to appease the forward compatibility tests and can be removed. Signed-off-by: Tristan Partin --- compute_tools/src/bin/compute_ctl.rs | 15 ++++++--------- control_plane/src/endpoint.rs | 14 +++++--------- 2 files changed, 11 insertions(+), 18 deletions(-) diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index a8803ec793..9193f06b3b 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -112,16 +112,13 @@ struct Cli { /// outside the compute will talk to the compute through this port. Keep /// the previous name for this argument around for a smoother release /// with the control plane. - /// - /// TODO: Remove the alias after the control plane release which teaches the - /// control plane about the renamed argument. - #[arg(long, alias = "http-port", default_value_t = 3080)] + #[arg(long, default_value_t = 3080)] pub external_http_port: u16, - /// The port to bind the internal listening HTTP server to. Clients like + /// The port to bind the internal listening HTTP server to. Clients include /// the neon extension (for installing remote extensions) and local_proxy. 
- #[arg(long)] - pub internal_http_port: Option, + #[arg(long, default_value_t = 3081)] + pub internal_http_port: u16, #[arg(short = 'D', long, value_name = "DATADIR")] pub pgdata: String, @@ -359,7 +356,7 @@ fn wait_spec( pgbin: cli.pgbin.clone(), pgversion: get_pg_version_string(&cli.pgbin), external_http_port: cli.external_http_port, - internal_http_port: cli.internal_http_port.unwrap_or(cli.external_http_port + 1), + internal_http_port: cli.internal_http_port, live_config_allowed, state: Mutex::new(new_state), state_changed: Condvar::new(), @@ -383,7 +380,7 @@ fn wait_spec( // The internal HTTP server could be launched later, but there isn't much // sense in waiting. - Server::Internal(cli.internal_http_port.unwrap_or(cli.external_http_port + 1)).launch(&compute); + Server::Internal(cli.internal_http_port).launch(&compute); if !spec_set { // No spec provided, hang waiting for it. diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index c16b3cb017..c22ff20c70 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -713,18 +713,14 @@ impl Endpoint { println!("Also at '{}'", conn_str); } let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl")); - //cmd.args([ - // "--external-http-port", - // &self.external_http_address.port().to_string(), - //]) - //.args([ - // "--internal-http-port", - // &self.internal_http_address.port().to_string(), - //]) cmd.args([ - "--http-port", + "--external-http-port", &self.external_http_address.port().to_string(), ]) + .args([ + "--internal-http-port", + &self.internal_http_address.port().to_string(), + ]) .args(["--pgdata", self.pgdata().to_str().unwrap()]) .args(["--connstr", &conn_str]) .args([ From d571553d8aff2e9f16c8dbc1ad59370e27418eeb Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Thu, 20 Feb 2025 11:31:52 -0600 Subject: [PATCH 540/583] Remove hacks in compute_ctl related to compute ID (#10751) --- compute_tools/src/bin/compute_ctl.rs | 16 +----------- control_plane/src/endpoint.rs | 26 +++++++++---------- .../compute_wrapper/shell/compute.sh | 1 + 3 files changed, 14 insertions(+), 29 deletions(-) diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 9193f06b3b..1cdae718fe 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -41,7 +41,6 @@ use std::process::exit; use std::str::FromStr; use std::sync::atomic::Ordering; use std::sync::{mpsc, Arc, Condvar, Mutex, RwLock}; -use std::time::SystemTime; use std::{thread, time::Duration}; use anyhow::{Context, Result}; @@ -86,19 +85,6 @@ fn parse_remote_ext_config(arg: &str) -> Result { } } -/// Generate a compute ID if one is not supplied. This exists to keep forward -/// compatibility tests working, but will be removed in a future iteration. 
-fn generate_compute_id() -> String { - let now = SystemTime::now(); - - format!( - "compute-{}", - now.duration_since(SystemTime::UNIX_EPOCH) - .unwrap() - .as_secs() - ) -} - #[derive(Parser)] #[command(rename_all = "kebab-case")] struct Cli { @@ -153,7 +139,7 @@ struct Cli { #[arg(short = 'S', long, group = "spec-path")] pub spec_path: Option, - #[arg(short = 'i', long, group = "compute-id", default_value = generate_compute_id())] + #[arg(short = 'i', long, group = "compute-id")] pub compute_id: String, #[arg(short = 'p', long, conflicts_with_all = ["spec", "spec-path"], value_name = "CONTROL_PLANE_API_BASE_URL")] diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index c22ff20c70..407578abb8 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -46,6 +46,8 @@ use std::process::Command; use std::str::FromStr; use std::sync::Arc; use std::time::Duration; +use std::time::SystemTime; +use std::time::UNIX_EPOCH; use anyhow::{anyhow, bail, Context, Result}; use compute_api::requests::ConfigurationRequest; @@ -737,20 +739,16 @@ impl Endpoint { ]) // TODO: It would be nice if we generated compute IDs with the same // algorithm as the real control plane. - // - // TODO: Add this back when - // https://github.com/neondatabase/neon/pull/10747 is merged. - // - //.args([ - // "--compute-id", - // &format!( - // "compute-{}", - // SystemTime::now() - // .duration_since(UNIX_EPOCH) - // .unwrap() - // .as_secs() - // ), - //]) + .args([ + "--compute-id", + &format!( + "compute-{}", + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_secs() + ), + ]) .stdin(std::process::Stdio::null()) .stderr(logfile.try_clone()?) .stdout(logfile); diff --git a/docker-compose/compute_wrapper/shell/compute.sh b/docker-compose/compute_wrapper/shell/compute.sh index b4f8d3d66a..9dbdcce69f 100755 --- a/docker-compose/compute_wrapper/shell/compute.sh +++ b/docker-compose/compute_wrapper/shell/compute.sh @@ -77,4 +77,5 @@ echo "Start compute node" /usr/local/bin/compute_ctl --pgdata /var/db/postgres/compute \ -C "postgresql://cloud_admin@localhost:55433/postgres" \ -b /usr/local/bin/postgres \ + --compute-id "compute-$RANDOM" \ -S ${SPEC_FILE} From 34996416d65eed2377f3cbf8bd4559792c26a045 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 20 Feb 2025 17:49:05 +0000 Subject: [PATCH 541/583] pageserver: guard against WAL gaps in the interpreted protocol (#10858) ## Problem The interpreted SK <-> PS protocol does not guard against gaps (neither does the Vanilla one, but that's beside the point). ## Summary of changes Extend the protocol to include the start LSN of the PG WAL section from which the records were interpreted. Validation is enabled via a config flag on the pageserver and works as follows: **Case 1**: `raw_wal_start_lsn` is smaller than the requested LSN There can't be gaps here, but we check that the shard received records which it hasn't seen before. **Case 2**: `raw_wal_start_lsn` is equal to the requested LSN This is the happy case. No gap and nothing to check **Case 3**: `raw_wal_start_lsn` is greater than the requested LSN This is a gap. To make Case 3 work I had to bend the protocol a bit. We read record chunks of WAL which aren't record aligned and feed them to the decoder. The picture below shows a shard which subscribes at a position somewhere within Record 2. We already have a wal reader which is below that position so we wait to catch up. We read some wal in Read 1 (all of Record 1 and some of Record 2). 
The new shard doesn't need Record 1 (it has already processed it according to the starting position), but we read past it's starting position. When we do Read 2, we decode Record 2 and ship it off to the shard, but the starting position of Read 2 is greater than the starting position the shard requested. This looks like a gap. ![image](https://github.com/user-attachments/assets/8aed292e-5d62-46a3-9b01-fbf9dc25efe0) To make it work, we extend the protocol to send an empty `InterpretedWalRecords` to shards if the WAL the records originated from ends the requested start position. On the pageserver, that just updates the tracking LSNs in memory (no-op really). This gives us a workaround for the fake gap. As a drive by, make `InterpretedWalRecords::next_record_lsn` mandatory in the application level definition. It's always included. Related: https://github.com/neondatabase/cloud/issues/23935 --- libs/pageserver_api/src/config.rs | 3 + libs/wal_decoder/proto/interpreted_wal.proto | 1 + libs/wal_decoder/src/models.rs | 6 +- libs/wal_decoder/src/wire_format.rs | 9 ++- pageserver/src/bin/pageserver.rs | 1 + pageserver/src/config.rs | 6 ++ pageserver/src/tenant/timeline.rs | 1 + pageserver/src/tenant/timeline/walreceiver.rs | 1 + .../walreceiver/connection_manager.rs | 3 + .../walreceiver/walreceiver_connection.rs | 56 +++++++++++++++---- safekeeper/src/send_interpreted_wal.rs | 51 +++++++++++++---- test_runner/fixtures/neon_fixtures.py | 12 ++-- 12 files changed, 120 insertions(+), 30 deletions(-) diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 0f33bcf45b..1aff5a7012 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -122,6 +122,8 @@ pub struct ConfigToml { pub page_service_pipelining: PageServicePipeliningConfig, pub get_vectored_concurrent_io: GetVectoredConcurrentIo, pub enable_read_path_debugging: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub validate_wal_contiguity: Option, } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -521,6 +523,7 @@ impl Default for ConfigToml { } else { None }, + validate_wal_contiguity: None, } } } diff --git a/libs/wal_decoder/proto/interpreted_wal.proto b/libs/wal_decoder/proto/interpreted_wal.proto index d68484d30f..7b40201a75 100644 --- a/libs/wal_decoder/proto/interpreted_wal.proto +++ b/libs/wal_decoder/proto/interpreted_wal.proto @@ -5,6 +5,7 @@ package interpreted_wal; message InterpretedWalRecords { repeated InterpretedWalRecord records = 1; optional uint64 next_record_lsn = 2; + optional uint64 raw_wal_start_lsn = 3; } message InterpretedWalRecord { diff --git a/libs/wal_decoder/src/models.rs b/libs/wal_decoder/src/models.rs index 51bf7e44ab..7e1934c6c3 100644 --- a/libs/wal_decoder/src/models.rs +++ b/libs/wal_decoder/src/models.rs @@ -60,7 +60,11 @@ pub struct InterpretedWalRecords { pub records: Vec, // Start LSN of the next record after the batch. // Note that said record may not belong to the current shard. - pub next_record_lsn: Option, + pub next_record_lsn: Lsn, + // Inclusive start LSN of the PG WAL from which the interpreted + // WAL records were extracted. Note that this is not necessarily the + // start LSN of the first interpreted record in the batch. 
+ pub raw_wal_start_lsn: Option, } /// An interpreted Postgres WAL record, ready to be handled by the pageserver diff --git a/libs/wal_decoder/src/wire_format.rs b/libs/wal_decoder/src/wire_format.rs index 944ee5c919..52ed5c70b5 100644 --- a/libs/wal_decoder/src/wire_format.rs +++ b/libs/wal_decoder/src/wire_format.rs @@ -167,7 +167,8 @@ impl TryFrom for proto::InterpretedWalRecords { .collect::, _>>()?; Ok(proto::InterpretedWalRecords { records, - next_record_lsn: value.next_record_lsn.map(|l| l.0), + next_record_lsn: Some(value.next_record_lsn.0), + raw_wal_start_lsn: value.raw_wal_start_lsn.map(|l| l.0), }) } } @@ -254,7 +255,11 @@ impl TryFrom for InterpretedWalRecords { Ok(InterpretedWalRecords { records, - next_record_lsn: value.next_record_lsn.map(Lsn::from), + next_record_lsn: value + .next_record_lsn + .map(Lsn::from) + .expect("Always provided"), + raw_wal_start_lsn: value.raw_wal_start_lsn.map(Lsn::from), }) } } diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index fa098e9364..e2b9a7f073 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -134,6 +134,7 @@ fn main() -> anyhow::Result<()> { info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine"); info!(?conf.virtual_file_io_mode, "starting with virtual_file IO mode"); info!(?conf.wal_receiver_protocol, "starting with WAL receiver protocol"); + info!(?conf.validate_wal_contiguity, "starting with WAL contiguity validation"); info!(?conf.page_service_pipelining, "starting with page service pipelining config"); info!(?conf.get_vectored_concurrent_io, "starting with get_vectored IO concurrency config"); diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index c5368f6806..09d9444dd5 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -197,6 +197,10 @@ pub struct PageServerConf { /// Enable read path debugging. If enabled, read key errors will print a backtrace of the layer /// files read. pub enable_read_path_debugging: bool, + + /// Interpreted protocol feature: if enabled, validate that the logical WAL received from + /// safekeepers does not have gaps. 
+ pub validate_wal_contiguity: bool, } /// Token for authentication to safekeepers @@ -360,6 +364,7 @@ impl PageServerConf { page_service_pipelining, get_vectored_concurrent_io, enable_read_path_debugging, + validate_wal_contiguity, } = config_toml; let mut conf = PageServerConf { @@ -446,6 +451,7 @@ impl PageServerConf { virtual_file_io_mode: virtual_file_io_mode.unwrap_or(virtual_file::IoMode::preferred()), no_sync: no_sync.unwrap_or(false), enable_read_path_debugging: enable_read_path_debugging.unwrap_or(false), + validate_wal_contiguity: validate_wal_contiguity.unwrap_or(false), }; // ------------------------------------------------------------ diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index b9425d2777..30de4d90dc 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2874,6 +2874,7 @@ impl Timeline { auth_token: crate::config::SAFEKEEPER_AUTH_TOKEN.get().cloned(), availability_zone: self.conf.availability_zone.clone(), ingest_batch_size: self.conf.ingest_batch_size, + validate_wal_contiguity: self.conf.validate_wal_contiguity, }, broker_client, ctx, diff --git a/pageserver/src/tenant/timeline/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs index f831f5e48a..67429bff98 100644 --- a/pageserver/src/tenant/timeline/walreceiver.rs +++ b/pageserver/src/tenant/timeline/walreceiver.rs @@ -56,6 +56,7 @@ pub struct WalReceiverConf { pub auth_token: Option>, pub availability_zone: Option, pub ingest_batch_size: u64, + pub validate_wal_contiguity: bool, } pub struct WalReceiver { diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index 65f9d39078..1955345315 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -537,6 +537,7 @@ impl ConnectionManagerState { let connect_timeout = self.conf.wal_connect_timeout; let ingest_batch_size = self.conf.ingest_batch_size; let protocol = self.conf.protocol; + let validate_wal_contiguity = self.conf.validate_wal_contiguity; let timeline = Arc::clone(&self.timeline); let ctx = ctx.detached_child( TaskKind::WalReceiverConnectionHandler, @@ -558,6 +559,7 @@ impl ConnectionManagerState { ctx, node_id, ingest_batch_size, + validate_wal_contiguity, ) .await; @@ -1563,6 +1565,7 @@ mod tests { auth_token: None, availability_zone: None, ingest_batch_size: 1, + validate_wal_contiguity: false, }, wal_connection: None, wal_stream_candidates: HashMap::new(), diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 23db4f88d2..ff05a8f902 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -120,6 +120,7 @@ pub(super) async fn handle_walreceiver_connection( ctx: RequestContext, safekeeper_node: NodeId, ingest_batch_size: u64, + validate_wal_contiguity: bool, ) -> Result<(), WalReceiverError> { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -274,6 +275,7 @@ pub(super) async fn handle_walreceiver_connection( } => Some((format, compression)), }; + let mut expected_wal_start = startpoint; while let Some(replication_message) = { select! { _ = cancellation.cancelled() => { @@ -340,13 +342,49 @@ pub(super) async fn handle_walreceiver_connection( ) })?; + // Guard against WAL gaps. 
If the start LSN of the PG WAL section + // from which the interpreted records were extracted, doesn't match + // the end of the previous batch (or the starting point for the first batch), + // then kill this WAL receiver connection and start a new one. + if validate_wal_contiguity { + if let Some(raw_wal_start_lsn) = batch.raw_wal_start_lsn { + match raw_wal_start_lsn.cmp(&expected_wal_start) { + std::cmp::Ordering::Greater => { + let msg = format!( + "Gap in streamed WAL: [{}, {})", + expected_wal_start, raw_wal_start_lsn + ); + critical!("{msg}"); + return Err(WalReceiverError::Other(anyhow!(msg))); + } + std::cmp::Ordering::Less => { + // Other shards are reading WAL behind us. + // This is valid, but check that we received records + // that we haven't seen before. + if let Some(first_rec) = batch.records.first() { + if first_rec.next_record_lsn < last_rec_lsn { + let msg = format!( + "Received record with next_record_lsn multiple times ({} < {})", + first_rec.next_record_lsn, expected_wal_start + ); + critical!("{msg}"); + return Err(WalReceiverError::Other(anyhow!(msg))); + } + } + } + std::cmp::Ordering::Equal => {} + } + } + } + let InterpretedWalRecords { records, next_record_lsn, + raw_wal_start_lsn: _, } = batch; tracing::debug!( - "Received WAL up to {} with next_record_lsn={:?}", + "Received WAL up to {} with next_record_lsn={}", streaming_lsn, next_record_lsn ); @@ -423,12 +461,11 @@ pub(super) async fn handle_walreceiver_connection( // need to advance last record LSN on all shards. If we've not ingested the latest // record, then set the LSN of the modification past it. This way all shards // advance their last record LSN at the same time. - let needs_last_record_lsn_advance = match next_record_lsn { - Some(lsn) if lsn > modification.get_lsn() => { - modification.set_lsn(lsn).unwrap(); - true - } - _ => false, + let needs_last_record_lsn_advance = if next_record_lsn > modification.get_lsn() { + modification.set_lsn(next_record_lsn).unwrap(); + true + } else { + false }; if uncommitted_records > 0 || needs_last_record_lsn_advance { @@ -446,9 +483,8 @@ pub(super) async fn handle_walreceiver_connection( timeline.get_last_record_lsn() ); - if let Some(lsn) = next_record_lsn { - last_rec_lsn = lsn; - } + last_rec_lsn = next_record_lsn; + expected_wal_start = streaming_lsn; Some(streaming_lsn) } diff --git a/safekeeper/src/send_interpreted_wal.rs b/safekeeper/src/send_interpreted_wal.rs index 5916675c3f..fb06339604 100644 --- a/safekeeper/src/send_interpreted_wal.rs +++ b/safekeeper/src/send_interpreted_wal.rs @@ -295,6 +295,10 @@ impl InterpretedWalReader { let mut wal_decoder = WalStreamDecoder::new(start_pos, self.pg_version); + // Tracks the start of the PG WAL LSN from which the current batch of + // interpreted records originated. + let mut current_batch_wal_start_lsn: Option = None; + loop { tokio::select! { // Main branch for reading WAL and forwarding it @@ -302,7 +306,7 @@ impl InterpretedWalReader { let wal = wal_or_reset.map(|wor| wor.get_wal().expect("reset handled in select branch below")); let WalBytes { wal, - wal_start_lsn: _, + wal_start_lsn, wal_end_lsn, available_wal_end_lsn, } = match wal { @@ -315,6 +319,12 @@ impl InterpretedWalReader { } }; + // We will already have a value if the previous chunks of WAL + // did not decode into anything useful. + if current_batch_wal_start_lsn.is_none() { + current_batch_wal_start_lsn = Some(wal_start_lsn); + } + wal_decoder.feed_bytes(&wal); // Deserialize and interpret WAL records from this batch of WAL. 
@@ -363,7 +373,9 @@ impl InterpretedWalReader { let max_next_record_lsn = match max_next_record_lsn { Some(lsn) => lsn, - None => { continue; } + None => { + continue; + } }; // Update the current position such that new receivers can decide @@ -377,21 +389,38 @@ impl InterpretedWalReader { } } + let batch_wal_start_lsn = current_batch_wal_start_lsn.take().unwrap(); + // Send interpreted records downstream. Anything that has already been seen // by a shard is filtered out. let mut shard_senders_to_remove = Vec::new(); for (shard, states) in &mut self.shard_senders { for state in states { - if max_next_record_lsn <= state.next_record_lsn { - continue; - } - let shard_sender_id = ShardSenderId::new(*shard, state.sender_id); - let records = records_by_sender.remove(&shard_sender_id).unwrap_or_default(); - let batch = InterpretedWalRecords { - records, - next_record_lsn: Some(max_next_record_lsn), + let batch = if max_next_record_lsn > state.next_record_lsn { + // This batch contains at least one record that this shard has not + // seen yet. + let records = records_by_sender.remove(&shard_sender_id).unwrap_or_default(); + + InterpretedWalRecords { + records, + next_record_lsn: max_next_record_lsn, + raw_wal_start_lsn: Some(batch_wal_start_lsn), + } + } else if wal_end_lsn > state.next_record_lsn { + // All the records in this batch were seen by the shard + // However, the batch maps to a chunk of WAL that the + // shard has not yet seen. Notify it of the start LSN + // of the PG WAL chunk such that it doesn't look like a gap. + InterpretedWalRecords { + records: Vec::default(), + next_record_lsn: state.next_record_lsn, + raw_wal_start_lsn: Some(batch_wal_start_lsn), + } + } else { + // The shard has seen this chunk of WAL before. Skip it. + continue; }; let res = state.tx.send(Batch { @@ -403,7 +432,7 @@ impl InterpretedWalReader { if res.is_err() { shard_senders_to_remove.push(shard_sender_id); } else { - state.next_record_lsn = max_next_record_lsn; + state.next_record_lsn = std::cmp::max(state.next_record_lsn, max_next_record_lsn); } } } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 36af522535..1d282971b1 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1167,15 +1167,15 @@ class NeonEnv: "max_batch_size": 32, } - # Concurrent IO (https://github.com/neondatabase/neon/issues/9378): - # enable concurrent IO by default in tests and benchmarks. - # Compat tests are exempt because old versions fail to parse the new config. 
- get_vectored_concurrent_io = self.pageserver_get_vectored_concurrent_io if config.test_may_use_compatibility_snapshot_binaries: log.info( - "Forcing use of binary-built-in default to avoid forward-compatibility related test failures" + "Skipping WAL contiguity validation to avoid forward-compatibility related test failures" ) - get_vectored_concurrent_io = None + else: + # Look for gaps in WAL received from safekeepeers + ps_cfg["validate_wal_contiguity"] = True + + get_vectored_concurrent_io = self.pageserver_get_vectored_concurrent_io if get_vectored_concurrent_io is not None: ps_cfg["get_vectored_concurrent_io"] = { "mode": self.pageserver_get_vectored_concurrent_io, From bd335fa751162f7b0c534b306f3eceb6edd89c4b Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 20 Feb 2025 20:29:14 +0200 Subject: [PATCH 542/583] Fix prototype of CheckPointReplicationState (#10907) ## Problem Occasionally removed (void) from definition of `CheckPointReplicationState` function ## Summary of changes Restore function prototype. https://github.com/neondatabase/postgres/pull/585 https://github.com/neondatabase/postgres/pull/586 --------- Co-authored-by: Konstantin Knizhnik --- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/revisions.json | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 023f1020ec..6ff5044377 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 023f1020ecb07af3bb0ddbf4622e1a3c3fa276a4 +Subproject commit 6ff50443773b69749e16da6db9d4f4b19064b4b7 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 6cb8d22079..261ed10e9b 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 6cb8d22079570b50fcaff29124d40807c1e63a82 +Subproject commit 261ed10e9b8c8dda01ad7aefb18e944e30aa161d diff --git a/vendor/revisions.json b/vendor/revisions.json index 3379cf1ba8..f85cec3a0b 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -5,11 +5,11 @@ ], "v16": [ "16.8", - "6cb8d22079570b50fcaff29124d40807c1e63a82" + "261ed10e9b8c8dda01ad7aefb18e944e30aa161d" ], "v15": [ "15.12", - "023f1020ecb07af3bb0ddbf4622e1a3c3fa276a4" + "6ff50443773b69749e16da6db9d4f4b19064b4b7" ], "v14": [ "14.17", From 5b81a774fc698ea66f972af4463bfe0c5e9c8545 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 20 Feb 2025 20:16:22 +0100 Subject: [PATCH 543/583] Update rust to 1.85.0 (#10914) We keep the practice of keeping the compiler up to date, pointing to the latest release. This is done by many other projects in the Rust ecosystem as well. [Announcement blog post](https://blog.rust-lang.org/2025/02/20/Rust-1.85.0.html). Prior update was in #10618. 
--- build-tools.Dockerfile | 2 +- rust-toolchain.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile index 317eded26e..c103ceaea5 100644 --- a/build-tools.Dockerfile +++ b/build-tools.Dockerfile @@ -292,7 +292,7 @@ WORKDIR /home/nonroot # Rust # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`) -ENV RUSTC_VERSION=1.84.1 +ENV RUSTC_VERSION=1.85.0 ENV RUSTUP_HOME="/home/nonroot/.rustup" ENV PATH="/home/nonroot/.cargo/bin:${PATH}" ARG RUSTFILT_VERSION=0.2.1 diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 38a7f202ba..591d60ea79 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] -channel = "1.84.1" +channel = "1.85.0" profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. # https://rust-lang.github.io/rustup/concepts/profiles.html From 3f376e44babb12e9a1c7c2f57c51628daccfed15 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 20 Feb 2025 21:48:06 +0200 Subject: [PATCH 544/583] Temporarily disable pg_duckdb (#10909) It clashed with pg_mooncake This is the same as the hotfix #10908 , but for the main branch, to keep the release and main branches in sync. In particular, we don't want to accidentally revert this temporary fix, if we cut a new release from main. --- compute/compute-node.Dockerfile | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 19633064a6..7169cbc41d 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1669,7 +1669,11 @@ COPY --from=pg_anon-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_ivm-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_partman-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_mooncake-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg_duckdb-build /usr/local/pgsql/ /usr/local/pgsql/ + +# Disabled temporarily, because it clashed with pg_mooncake. pg_mooncake +# also depends on libduckdb, but a different version. +#COPY --from=pg_duckdb-build /usr/local/pgsql/ /usr/local/pgsql/ + COPY --from=pg_repack-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pgaudit-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pgauditlogtofile-build /usr/local/pgsql/ /usr/local/pgsql/ From 0b9b391ea0366b896069705b3cdfdf82a9a8e901 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 20 Feb 2025 22:21:44 +0200 Subject: [PATCH 545/583] Fix caclulation of prefetch ring position to fit in-flight request in resized ring buffer (#10899) ## Problem Refer https://github.com/neondatabase/neon/issues/10885 Wait position in ring buffer to restrict number of in-flight requests is not correctly calculated. 
## Summary of changes Update condition and remove redundant assertion Co-authored-by: Konstantin Knizhnik --- pgxn/neon/pagestore_smgr.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index f1087a8ccb..6c812f347f 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -474,8 +474,7 @@ readahead_buffer_resize(int newsize, void *extra) */ if (MyPState->n_requests_inflight > newsize) { - Assert(MyPState->ring_unused >= MyPState->n_requests_inflight - newsize); - prefetch_wait_for(MyPState->ring_unused - (MyPState->n_requests_inflight - newsize)); + prefetch_wait_for(MyPState->ring_unused - newsize - 1); Assert(MyPState->n_requests_inflight <= newsize); } From 9b42d1ce1a6e0c8bba0fca397d6d17d200da3d9c Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 20 Feb 2025 22:38:42 +0100 Subject: [PATCH 546/583] pageserver: periodically log slow ongoing getpage requests (#10906) ## Problem We don't have good observability for "stuck" getpage requests. Resolves https://github.com/neondatabase/cloud/issues/23808. ## Summary of changes Log a periodic warning (every 30 seconds) if GetPage request execution is slow to complete, to aid in debugging stuck GetPage requests. This does not cover response flushing (we have separate logging for that), nor reading the request from the socket and batching it (expected to be insignificant and not straightforward to handle with the current protocol). This costs 95 nanoseconds on the happy path when awaiting a `tokio::task::yield_now()`: ``` warn_slow/enabled=false time: [45.716 ns 46.116 ns 46.687 ns] warn_slow/enabled=true time: [141.53 ns 141.83 ns 142.18 ns] ``` --- Cargo.lock | 1 + libs/utils/Cargo.toml | 3 ++- libs/utils/benches/README.md | 26 ++++++++++++++++++ libs/utils/benches/benchmarks.rs | 45 +++++++++++++++++++++++++++++--- libs/utils/src/logging.rs | 39 +++++++++++++++++++++++++++ pageserver/src/page_service.rs | 41 ++++++++++++++++++++--------- 6 files changed, 139 insertions(+), 16 deletions(-) create mode 100644 libs/utils/benches/README.md diff --git a/Cargo.lock b/Cargo.lock index 12232eaece..f62026696e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7616,6 +7616,7 @@ dependencies = [ "once_cell", "pin-project-lite", "postgres_connection", + "pprof", "pq_proto", "rand 0.8.5", "regex", diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 62e0f4cfba..5020d82adf 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -27,7 +27,7 @@ humantime.workspace = true fail.workspace = true futures = { workspace = true } jsonwebtoken.workspace = true -nix = {workspace = true, features = [ "ioctl" ] } +nix = { workspace = true, features = ["ioctl"] } once_cell.workspace = true pin-project-lite.workspace = true regex.workspace = true @@ -61,6 +61,7 @@ bytes.workspace = true criterion.workspace = true hex-literal.workspace = true camino-tempfile.workspace = true +pprof.workspace = true serde_assert.workspace = true tokio = { workspace = true, features = ["test-util"] } diff --git a/libs/utils/benches/README.md b/libs/utils/benches/README.md new file mode 100644 index 0000000000..e23ec268c2 --- /dev/null +++ b/libs/utils/benches/README.md @@ -0,0 +1,26 @@ +## Utils Benchmarks + +To run benchmarks: + +```sh +# All benchmarks. +cargo bench --package utils + +# Specific file. +cargo bench --package utils --bench benchmarks + +# Specific benchmark. +cargo bench --package utils --bench benchmarks warn_slow/enabled=true + +# List available benchmarks. 
+cargo bench --package utils --benches -- --list + +# Generate flamegraph profiles using pprof-rs, profiling for 10 seconds. +# Output in target/criterion/*/profile/flamegraph.svg. +cargo bench --package utils --bench benchmarks warn_slow/enabled=true --profile-time 10 +``` + +Additional charts and statistics are available in `target/criterion/report/index.html`. + +Benchmarks are automatically compared against the previous run. To compare against other runs, see +`--baseline` and `--save-baseline`. \ No newline at end of file diff --git a/libs/utils/benches/benchmarks.rs b/libs/utils/benches/benchmarks.rs index 44eb36387c..cff3792f3a 100644 --- a/libs/utils/benches/benchmarks.rs +++ b/libs/utils/benches/benchmarks.rs @@ -1,5 +1,18 @@ -use criterion::{criterion_group, criterion_main, Criterion}; +use std::time::Duration; + +use criterion::{criterion_group, criterion_main, Bencher, Criterion}; +use pprof::criterion::{Output, PProfProfiler}; use utils::id; +use utils::logging::warn_slow; + +// Register benchmarks with Criterion. +criterion_group!( + name = benches; + config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None))); + targets = bench_id_stringify, + bench_warn_slow, +); +criterion_main!(benches); pub fn bench_id_stringify(c: &mut Criterion) { // Can only use public methods. @@ -16,5 +29,31 @@ pub fn bench_id_stringify(c: &mut Criterion) { }); } -criterion_group!(benches, bench_id_stringify); -criterion_main!(benches); +pub fn bench_warn_slow(c: &mut Criterion) { + for enabled in [false, true] { + c.bench_function(&format!("warn_slow/enabled={enabled}"), |b| { + run_bench(b, enabled).unwrap() + }); + } + + // The actual benchmark. + fn run_bench(b: &mut Bencher, enabled: bool) -> anyhow::Result<()> { + const THRESHOLD: Duration = Duration::from_secs(1); + + // Use a multi-threaded runtime to avoid thread parking overhead when yielding. + let runtime = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build()?; + + // Test both with and without warn_slow, since we're essentially measuring Tokio scheduling + // performance too. Use a simple noop future that yields once, to avoid any scheduler fast + // paths for a ready future. + if enabled { + b.iter(|| runtime.block_on(warn_slow("ready", THRESHOLD, tokio::task::yield_now()))); + } else { + b.iter(|| runtime.block_on(tokio::task::yield_now())); + } + + Ok(()) + } +} diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs index 4a6069294d..95c69ac8ba 100644 --- a/libs/utils/src/logging.rs +++ b/libs/utils/src/logging.rs @@ -1,9 +1,13 @@ +use std::future::Future; use std::str::FromStr; +use std::time::Duration; use anyhow::Context; use metrics::{IntCounter, IntCounterVec}; use once_cell::sync::Lazy; use strum_macros::{EnumString, VariantNames}; +use tokio::time::Instant; +use tracing::warn; /// Logs a critical error, similarly to `tracing::error!`. This will: /// @@ -318,6 +322,41 @@ impl std::fmt::Debug for SecretString { } } +/// Logs a periodic warning if a future is slow to complete. +/// +/// This is performance-sensitive as it's used on the GetPage read path. +#[inline] +pub async fn warn_slow(name: &str, threshold: Duration, f: impl Future) -> O { + // TODO: we unfortunately have to pin the future on the heap, since GetPage futures are huge and + // won't fit on the stack. 
+ let mut f = Box::pin(f); + + let started = Instant::now(); + let mut attempt = 1; + + loop { + // NB: use timeout_at() instead of timeout() to avoid an extra clock reading in the common + // case where the timeout doesn't fire. + let deadline = started + attempt * threshold; + if let Ok(output) = tokio::time::timeout_at(deadline, &mut f).await { + // NB: we check if we exceeded the threshold even if the timeout never fired, because + // scheduling or execution delays may cause the future to succeed even if it exceeds the + // timeout. This costs an extra unconditional clock reading, but seems worth it to avoid + // false negatives. + let elapsed = started.elapsed(); + if elapsed >= threshold { + warn!("slow {name} completed after {:.3}s", elapsed.as_secs_f64()); + } + return output; + } + + let elapsed = started.elapsed().as_secs_f64(); + warn!("slow {name} still running after {elapsed:.3}s",); + + attempt += 1; + } +} + #[cfg(test)] mod tests { use metrics::{core::Opts, IntCounterVec}; diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 0c8da6f2a8..7285697040 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -34,11 +34,13 @@ use std::str::FromStr; use std::sync::Arc; use std::time::SystemTime; use std::time::{Duration, Instant}; +use strum_macros::IntoStaticStr; use tokio::io::{AsyncRead, AsyncWrite}; use tokio::io::{AsyncWriteExt, BufWriter}; use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use tracing::*; +use utils::logging::warn_slow; use utils::sync::gate::{Gate, GateGuard}; use utils::sync::spsc_fold; use utils::{ @@ -81,6 +83,9 @@ use std::os::fd::AsRawFd; /// NB: this is a different value than [`crate::http::routes::ACTIVE_TENANT_TIMEOUT`]. const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000); +/// Threshold at which to log a warning about slow GetPage requests. +const WARN_SLOW_GETPAGE_THRESHOLD: Duration = Duration::from_secs(30); + /////////////////////////////////////////////////////////////////////////////// pub struct Listener { @@ -594,6 +599,7 @@ struct BatchedTestRequest { /// NB: we only hold [`timeline::handle::WeakHandle`] inside this enum, /// so that we don't keep the [`Timeline::gate`] open while the batch /// is being built up inside the [`spsc_fold`] (pagestream pipelining). +#[derive(IntoStaticStr)] enum BatchedFeMessage { Exists { span: Span, @@ -638,6 +644,10 @@ enum BatchedFeMessage { } impl BatchedFeMessage { + fn as_static_str(&self) -> &'static str { + self.into() + } + fn observe_execution_start(&mut self, at: Instant) { match self { BatchedFeMessage::Exists { timer, .. 
} @@ -1463,17 +1473,20 @@ impl PageServerHandler { } }; - let err = self - .pagesteam_handle_batched_message( + let result = warn_slow( + msg.as_static_str(), + WARN_SLOW_GETPAGE_THRESHOLD, + self.pagesteam_handle_batched_message( pgb_writer, msg, io_concurrency.clone(), &cancel, protocol_version, ctx, - ) - .await; - match err { + ), + ) + .await; + match result { Ok(()) => {} Err(e) => break e, } @@ -1636,13 +1649,17 @@ impl PageServerHandler { return Err(e); } }; - self.pagesteam_handle_batched_message( - pgb_writer, - batch, - io_concurrency.clone(), - &cancel, - protocol_version, - &ctx, + warn_slow( + batch.as_static_str(), + WARN_SLOW_GETPAGE_THRESHOLD, + self.pagesteam_handle_batched_message( + pgb_writer, + batch, + io_concurrency.clone(), + &cancel, + protocol_version, + &ctx, + ), ) .await?; } From c214c32d3f7f29485226d7baf97ea6f7643769bd Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Thu, 20 Feb 2025 20:56:30 -0500 Subject: [PATCH 547/583] fix(pageserver): avoid creating empty job for gc-compaction (#10917) ## Problem This should be one last fix for https://github.com/neondatabase/neon/issues/10517. ## Summary of changes If a keyspace is empty, we might produce a gc-compaction job which covers no layer files. We should avoid generating such jobs so that the gc-compaction image layer can cover the full key range. Signed-off-by: Alex Chi Z --- pageserver/src/tenant/timeline/compaction.rs | 31 +++++++++++++------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 58a87dbd5f..0361ce8cd1 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -2212,7 +2212,7 @@ impl Timeline { let sub_compaction_max_job_size_mb = sub_compaction_max_job_size_mb.unwrap_or(GC_COMPACT_MAX_SIZE_MB); - let mut compact_jobs = Vec::new(); + let mut compact_jobs = Vec::::new(); // For now, we simply use the key partitioning information; we should do a more fine-grained partitioning // by estimating the amount of files read for a compaction job. We should also partition on LSN. let ((dense_ks, sparse_ks), _) = self.partitioning.read().as_ref().clone(); @@ -2299,16 +2299,25 @@ impl Timeline { } else { end }; - info!( - "splitting compaction job: {}..{}, estimated_size={}", - start, end, total_size - ); - compact_jobs.push(GcCompactJob { - dry_run: job.dry_run, - compact_key_range: start..end, - compact_lsn_range: job.compact_lsn_range.start..compact_below_lsn, - }); - current_start = Some(end); + if total_size == 0 && !compact_jobs.is_empty() { + info!( + "splitting compaction job: {}..{}, estimated_size={}, extending the previous job", + start, end, total_size + ); + compact_jobs.last_mut().unwrap().compact_key_range.end = end; + current_start = Some(end); + } else { + info!( + "splitting compaction job: {}..{}, estimated_size={}", + start, end, total_size + ); + compact_jobs.push(GcCompactJob { + dry_run: job.dry_run, + compact_key_range: start..end, + compact_lsn_range: job.compact_lsn_range.start..compact_below_lsn, + }); + current_start = Some(end); + } } } Ok(compact_jobs) From 61d385caeae6988c4ee8bf251066105037d21191 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 21 Feb 2025 11:03:54 +0200 Subject: [PATCH 548/583] Split plv8 build into two parts (#10920) Plv8 consists of two parts: 1. the V8 engine, which is built from vendored sources, and 2. the PostgreSQL extension. 
Split those into two separate steps in the Dockerfile. The first step doesn't need any PostgreSQL sources or any other files from the neon repository, just the build tools and the upstream plv8 sources. Use the build-deps image as the base for that step, so that the layer can be cached and doesn't need to be rebuilt every time. This is worthwhile because the V8 build takes a very long time. --- compute/compute-node.Dockerfile | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 7169cbc41d..ef4c22612d 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -395,15 +395,22 @@ RUN case "${PG_VERSION:?}" in \ cd plv8-src && \ if [[ "${PG_VERSION:?}" < "v17" ]]; then patch -p1 < /ext-src/plv8-3.1.10.patch; fi -FROM pg-build AS plv8-build +# Step 1: Build the vendored V8 engine. It doesn't depend on PostgreSQL, so use +# 'build-deps' as the base. This enables caching and avoids unnecessary rebuilds. +# (The V8 engine takes a very long time to build) +FROM build-deps AS plv8-build ARG PG_VERSION +WORKDIR /ext-src/plv8-src RUN apt update && \ apt install --no-install-recommends --no-install-suggests -y \ ninja-build python3-dev libncurses5 binutils clang \ && apt clean && rm -rf /var/lib/apt/lists/* - COPY --from=plv8-src /ext-src/ /ext-src/ -WORKDIR /ext-src/plv8-src +RUN make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) v8 + +# Step 2: Build the PostgreSQL-dependent parts +COPY --from=pg-build /usr/local/pgsql /usr/local/pgsql +ENV PATH="/usr/local/pgsql/bin:$PATH" RUN \ # generate and copy upgrade scripts make generate_upgrades && \ From f927ae6e15431e543c4d31af29da2d72172cd506 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 21 Feb 2025 12:01:57 +0100 Subject: [PATCH 549/583] Return a json response in scheduling_policy handler (#10904) Return an empty json response in the `scheduling_policy` handler. This prevents errors of the form: ``` Error: receive body: error decoding response body: EOF while parsing a value at line 1 column 0 ``` when setting the scheduling policy via the `storcon_cli`. part of #9011. --- storage_controller/src/http.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 1cc61a12e8..fb7b4356d1 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -1354,10 +1354,7 @@ async fn handle_safekeeper_scheduling_policy( .set_safekeeper_scheduling_policy(id, body.scheduling_policy) .await?; - Ok(Response::builder() - .status(StatusCode::NO_CONTENT) - .body(Body::empty()) - .unwrap()) + json_response(StatusCode::OK, ()) } /// Common wrapper for request handlers that call into Service and will operate on tenants: they must only From ff3819efc784bf5919e9de37535466e90c6e83dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 21 Feb 2025 12:02:02 +0100 Subject: [PATCH 550/583] storcon: infrastructure for safekeeper specific JWT tokens (#10905) Safekeepers only respond to requests with the per-token scope, or the `safekeeperdata` JWT scope. Therefore, add infrastructure in the storage controller for safekeeper JWTs. Also, rename the ambiguous `jwt_token` to `pageserver_jwt_token`. 
Part of #9011 Related: https://github.com/neondatabase/cloud/issues/24727 --- storage_controller/src/http.rs | 5 ++- storage_controller/src/main.rs | 28 ++++++++++++-- storage_controller/src/reconciler.rs | 12 +++--- storage_controller/src/service.rs | 55 +++++++++++++++------------- 4 files changed, 64 insertions(+), 36 deletions(-) diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index fb7b4356d1..33b3d88c25 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -598,7 +598,10 @@ async fn handle_tenant_timeline_passthrough( let _timer = latency.start_timer(labels.clone()); - let client = mgmt_api::Client::new(node.base_url(), service.get_config().jwt_token.as_deref()); + let client = mgmt_api::Client::new( + node.base_url(), + service.get_config().pageserver_jwt_token.as_deref(), + ); let resp = client.get_raw(path).await.map_err(|e| // We return 503 here because if we can't successfully send a request to the pageserver, // either we aren't available or the pageserver is unavailable. diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index be074d269d..18922b9e05 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -53,6 +53,10 @@ struct Cli { #[arg(long)] jwt_token: Option, + /// Token for authenticating this service with the safekeepers it controls + #[arg(long)] + safekeeper_jwt_token: Option, + /// Token for authenticating this service with the control plane, when calling /// the compute notification endpoint #[arg(long)] @@ -153,7 +157,8 @@ impl Default for StrictMode { struct Secrets { database_url: String, public_key: Option, - jwt_token: Option, + pageserver_jwt_token: Option, + safekeeper_jwt_token: Option, control_plane_jwt_token: Option, peer_jwt_token: Option, } @@ -161,6 +166,7 @@ struct Secrets { impl Secrets { const DATABASE_URL_ENV: &'static str = "DATABASE_URL"; const PAGESERVER_JWT_TOKEN_ENV: &'static str = "PAGESERVER_JWT_TOKEN"; + const SAFEKEEPER_JWT_TOKEN_ENV: &'static str = "SAFEKEEPER_JWT_TOKEN"; const CONTROL_PLANE_JWT_TOKEN_ENV: &'static str = "CONTROL_PLANE_JWT_TOKEN"; const PEER_JWT_TOKEN_ENV: &'static str = "PEER_JWT_TOKEN"; const PUBLIC_KEY_ENV: &'static str = "PUBLIC_KEY"; @@ -184,7 +190,14 @@ impl Secrets { let this = Self { database_url, public_key, - jwt_token: Self::load_secret(&args.jwt_token, Self::PAGESERVER_JWT_TOKEN_ENV), + pageserver_jwt_token: Self::load_secret( + &args.jwt_token, + Self::PAGESERVER_JWT_TOKEN_ENV, + ), + safekeeper_jwt_token: Self::load_secret( + &args.safekeeper_jwt_token, + Self::SAFEKEEPER_JWT_TOKEN_ENV, + ), control_plane_jwt_token: Self::load_secret( &args.control_plane_jwt_token, Self::CONTROL_PLANE_JWT_TOKEN_ENV, @@ -264,11 +277,17 @@ async fn async_main() -> anyhow::Result<()> { let secrets = Secrets::load(&args).await?; + // TODO: once we've rolled out the safekeeper JWT token everywhere, put it into the validation code below + tracing::info!( + "safekeeper_jwt_token set: {:?}", + secrets.safekeeper_jwt_token.is_some() + ); + // Validate required secrets and arguments are provided in strict mode match strict_mode { StrictMode::Strict if (secrets.public_key.is_none() - || secrets.jwt_token.is_none() + || secrets.pageserver_jwt_token.is_none() || secrets.control_plane_jwt_token.is_none()) => { // Production systems should always have secrets configured: if public_key was not set @@ -293,7 +312,8 @@ async fn async_main() -> anyhow::Result<()> { } let config = Config { - jwt_token: secrets.jwt_token, + 
pageserver_jwt_token: secrets.pageserver_jwt_token, + safekeeper_jwt_token: secrets.safekeeper_jwt_token, control_plane_jwt_token: secrets.control_plane_jwt_token, peer_jwt_token: secrets.peer_jwt_token, compute_hook_url: args.compute_hook_url, diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 48f0804926..4fda7338e5 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -296,7 +296,7 @@ impl Reconciler { .location_config(tenant_shard_id, config.clone(), flush_ms, lazy) .await }, - &self.service_config.jwt_token, + &self.service_config.pageserver_jwt_token, 1, 3, timeout, @@ -417,7 +417,7 @@ impl Reconciler { let client = PageserverClient::new( node.get_id(), node.base_url(), - self.service_config.jwt_token.as_deref(), + self.service_config.pageserver_jwt_token.as_deref(), ); client @@ -440,7 +440,7 @@ impl Reconciler { let client = PageserverClient::new( node.get_id(), node.base_url(), - self.service_config.jwt_token.as_deref(), + self.service_config.pageserver_jwt_token.as_deref(), ); let timelines = client.timeline_list(&tenant_shard_id).await?; @@ -478,7 +478,7 @@ impl Reconciler { ) .await }, - &self.service_config.jwt_token, + &self.service_config.pageserver_jwt_token, 1, 3, request_download_timeout * 2, @@ -771,7 +771,7 @@ impl Reconciler { let observed_conf = match attached_node .with_client_retries( |client| async move { client.get_location_config(tenant_shard_id).await }, - &self.service_config.jwt_token, + &self.service_config.pageserver_jwt_token, 1, 1, Duration::from_secs(5), @@ -1099,7 +1099,7 @@ impl Reconciler { match origin .with_client_retries( |client| async move { client.get_location_config(tenant_shard_id).await }, - &self.service_config.jwt_token, + &self.service_config.pageserver_jwt_token, 1, 3, Duration::from_secs(5), diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 25a1cb4252..1bff5a37db 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -348,7 +348,12 @@ pub struct Config { // All pageservers managed by one instance of this service must have // the same public key. This JWT token will be used to authenticate // this service to the pageservers it manages. - pub jwt_token: Option, + pub pageserver_jwt_token: Option, + + // All safekeepers managed by one instance of this service must have + // the same public key. This JWT token will be used to authenticate + // this service to the safekeepers it manages. + pub safekeeper_jwt_token: Option, // This JWT token will be used to authenticate this service to the control plane. 
pub control_plane_jwt_token: Option, @@ -882,7 +887,7 @@ impl Service { let response = node .with_client_retries( |client| async move { client.list_location_config().await }, - &self.config.jwt_token, + &self.config.pageserver_jwt_token, 1, 5, timeout, @@ -983,7 +988,7 @@ impl Service { let client = PageserverClient::new( node.get_id(), node.base_url(), - self.config.jwt_token.as_deref(), + self.config.pageserver_jwt_token.as_deref(), ); match client .location_config( @@ -1553,14 +1558,14 @@ impl Service { let reconcilers_cancel = cancel.child_token(); let heartbeater_ps = Heartbeater::new( - config.jwt_token.clone(), + config.pageserver_jwt_token.clone(), config.max_offline_interval, config.max_warming_up_interval, cancel.clone(), ); let heartbeater_sk = Heartbeater::new( - config.jwt_token.clone(), + config.safekeeper_jwt_token.clone(), config.max_offline_interval, config.max_warming_up_interval, cancel.clone(), @@ -1907,7 +1912,7 @@ impl Service { let configs = match node .with_client_retries( |client| async move { client.list_location_config().await }, - &self.config.jwt_token, + &self.config.pageserver_jwt_token, 1, 5, SHORT_RECONCILE_TIMEOUT, @@ -1965,7 +1970,7 @@ impl Service { .location_config(tenant_shard_id, config, None, false) .await }, - &self.config.jwt_token, + &self.config.pageserver_jwt_token, 1, 5, SHORT_RECONCILE_TIMEOUT, @@ -3100,7 +3105,7 @@ impl Service { let client = PageserverClient::new( node.get_id(), node.base_url(), - self.config.jwt_token.as_deref(), + self.config.pageserver_jwt_token.as_deref(), ); tracing::info!("Doing time travel recovery for shard {tenant_shard_id}",); @@ -3161,7 +3166,7 @@ impl Service { let client = PageserverClient::new( node.get_id(), node.base_url(), - self.config.jwt_token.as_deref(), + self.config.pageserver_jwt_token.as_deref(), ); futs.push(async move { let result = client @@ -3284,7 +3289,7 @@ impl Service { .tenant_delete(TenantShardId::unsharded(tenant_id)) .await }, - &self.config.jwt_token, + &self.config.pageserver_jwt_token, 1, 3, RECONCILE_TIMEOUT, @@ -3503,7 +3508,7 @@ impl Service { let timeline_info = create_one( shard_zero_tid, shard_zero_locations, - self.config.jwt_token.clone(), + self.config.pageserver_jwt_token.clone(), create_req.clone(), ) .await?; @@ -3519,7 +3524,7 @@ impl Service { // Create timeline on remaining shards with number >0 if !targets.0.is_empty() { // If we had multiple shards, issue requests for the remainder now. 
- let jwt = &self.config.jwt_token; + let jwt = &self.config.pageserver_jwt_token; self.tenant_for_shards( targets .0 @@ -3602,7 +3607,7 @@ impl Service { tenant_shard_id, timeline_id, node, - self.config.jwt_token.clone(), + self.config.pageserver_jwt_token.clone(), req.clone(), )) }) @@ -3683,7 +3688,7 @@ impl Service { tenant_shard_id, timeline_id, node, - self.config.jwt_token.clone(), + self.config.pageserver_jwt_token.clone(), )) }) .await?; @@ -3757,7 +3762,7 @@ impl Service { tenant_shard_id, timeline_id, node, - self.config.jwt_token.clone(), + self.config.pageserver_jwt_token.clone(), dir, )) }) @@ -3872,7 +3877,7 @@ impl Service { futs.push(async move { node.with_client_retries( |client| op(tenant_shard_id, client), - &self.config.jwt_token, + &self.config.pageserver_jwt_token, warn_threshold, max_retries, timeout, @@ -4121,7 +4126,7 @@ impl Service { tenant_shard_id, timeline_id, node, - self.config.jwt_token.clone(), + self.config.pageserver_jwt_token.clone(), )) }) .await?; @@ -4143,7 +4148,7 @@ impl Service { shard_zero_tid, timeline_id, shard_zero_locations.latest.node, - self.config.jwt_token.clone(), + self.config.pageserver_jwt_token.clone(), ) .await?; Ok(shard_zero_status) @@ -4542,7 +4547,7 @@ impl Service { client.location_config(child_id, config, None, false).await }, - &self.config.jwt_token, + &self.config.pageserver_jwt_token, 1, 10, Duration::from_secs(5), @@ -5142,7 +5147,7 @@ impl Service { let client = PageserverClient::new( node.get_id(), node.base_url(), - self.config.jwt_token.as_deref(), + self.config.pageserver_jwt_token.as_deref(), ); let response = client .tenant_shard_split( @@ -5468,7 +5473,7 @@ impl Service { let client = PageserverClient::new( node.get_id(), node.base_url(), - self.config.jwt_token.as_deref(), + self.config.pageserver_jwt_token.as_deref(), ); let scan_result = client @@ -7094,7 +7099,7 @@ impl Service { match attached_node .with_client_retries( |client| async move { client.tenant_heatmap_upload(tenant_shard_id).await }, - &self.config.jwt_token, + &self.config.pageserver_jwt_token, 3, 10, SHORT_RECONCILE_TIMEOUT, @@ -7130,7 +7135,7 @@ impl Service { ) .await }, - &self.config.jwt_token, + &self.config.pageserver_jwt_token, 3, 10, SHORT_RECONCILE_TIMEOUT, @@ -7185,7 +7190,7 @@ impl Service { let request = request_ref.clone(); client.top_tenant_shards(request.clone()).await }, - &self.config.jwt_token, + &self.config.pageserver_jwt_token, 3, 3, Duration::from_secs(5), @@ -7358,7 +7363,7 @@ impl Service { match node .with_client_retries( |client| async move { client.tenant_secondary_status(tenant_shard_id).await }, - &self.config.jwt_token, + &self.config.pageserver_jwt_token, 1, 3, Duration::from_millis(250), From 5e3c234edc9971740b09447ce7950398dd12295c Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 21 Feb 2025 14:58:49 +0000 Subject: [PATCH 551/583] storcon: do more concurrent optimisations (#10929) ## Problem Now that we rely on the optimisation logic to handle fixing things up after some tenants are in the wrong AZ (e.g. after node failure), it's no longer appropriate to treat optimisations as an ultra-low-priority task. We used to reflect that low priority with a very low limit on concurrent execution, such that we would only migrate 2 things every 20 seconds. ## Summary of changes - Increase MAX_OPTIMIZATIONS_EXEC_PER_PASS from 2 to 16 - Increase MAX_OPTIMIZATIONS_PLAN_PER_PASS from 8 to 64. Since we recently gave user-initiated actions their own semaphore, this should not risk starving out API requests. 
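
For illustration only, a minimal sketch (not taken from the storage controller code; every name below is invented) of why these per-pass constants act as an implicit rate limit: the background loop plans a bounded number of candidate optimisations, executes at most a fixed number of them, and then waits for the next pass, so throughput is capped at roughly `EXEC_PER_PASS` migrations per pass interval. Planning more candidates than we execute keeps a pass productive even when the first few candidates are not yet ready.

```
use std::time::Duration;

// Invented constants mirroring the idea of the real MAX_OPTIMIZATIONS_* limits:
// with a ~20s pass interval, EXEC_PER_PASS bounds how many migrations can start
// per pass, i.e. the old value of 2 allowed only ~2 migrations every 20 seconds.
const PASS_INTERVAL: Duration = Duration::from_secs(20);
const PLAN_PER_PASS: usize = 64;
const EXEC_PER_PASS: usize = 16;

/// Sketch of a background optimisation pass: `plan` returns candidate
/// optimisation ids, `execute` kicks off one migration.
fn background_optimize_loop(
    mut plan: impl FnMut(usize) -> Vec<u64>,
    mut execute: impl FnMut(u64),
) {
    loop {
        // Plan more candidates than we will execute, so that a few
        // not-yet-ready candidates don't leave the whole pass idle.
        let candidates = plan(PLAN_PER_PASS);
        for optimization in candidates.into_iter().take(EXEC_PER_PASS) {
            execute(optimization);
        }
        // The wait between passes is what turns the per-pass cap into an
        // overall rate limit on background migrations.
        std::thread::sleep(PASS_INTERVAL);
    }
}
```
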
--- storage_controller/src/service.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 1bff5a37db..14eacfd422 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -6821,7 +6821,7 @@ impl Service { // with the frequency of background calls, this acts as an implicit rate limit that runs a small // trickle of optimizations in the background, rather than executing a large number in parallel // when a change occurs. - const MAX_OPTIMIZATIONS_EXEC_PER_PASS: usize = 2; + const MAX_OPTIMIZATIONS_EXEC_PER_PASS: usize = 16; // Synchronous prepare: scan shards for possible scheduling optimizations let candidate_work = self.optimize_all_plan(); @@ -6872,7 +6872,7 @@ impl Service { // How many candidate optimizations we will generate, before evaluating them for readniess: setting // this higher than the execution limit gives us a chance to execute some work even if the first // few optimizations we find are not ready. - const MAX_OPTIMIZATIONS_PLAN_PER_PASS: usize = 8; + const MAX_OPTIMIZATIONS_PLAN_PER_PASS: usize = 64; let mut work = Vec::new(); let mut locked = self.inner.write().unwrap(); From 3e82addd64788f8fc963633a9d9fc78efe06cd85 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Fri, 21 Feb 2025 15:45:00 +0000 Subject: [PATCH 552/583] storcon: use `Duration` for duration's in the storage controller tenant config (#10928) ## Problem The storage controller treats durations in the tenant config as strings. These are loaded from the db. The pageserver maps these durations to a seconds only format and we always get a mismatch compared to what's in the db. ## Summary of changes Treat durations as durations inside the storage controller and not as strings. Nothing changes in the cross service API's themselves or the way things are stored in the db. I also added some logging which I would have made the investigation a 10min job: 1. Reason for why the reconciliation was spawned 2. 
Location config diff between the observed and wanted states --- Cargo.lock | 18 ++++ Cargo.toml | 1 + control_plane/src/pageserver.rs | 49 +++++++--- control_plane/storcon_cli/src/main.rs | 2 +- libs/pageserver_api/src/models.rs | 89 ++++++++++++++----- pageserver/src/tenant/config.rs | 47 +++------- storage_controller/Cargo.toml | 1 + storage_controller/src/reconciler.rs | 27 +++++- storage_controller/src/service.rs | 10 ++- storage_controller/src/tenant_shard.rs | 30 +++++-- .../regress/test_storage_controller.py | 40 +++++++++ 11 files changed, 234 insertions(+), 80 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f62026696e..f0dbdff3ec 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1874,6 +1874,12 @@ dependencies = [ "syn 2.0.90", ] +[[package]] +name = "difflib" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6184e33543162437515c2e2b48714794e37845ec9851711914eec9d308f6ebe8" + [[package]] name = "digest" version = "0.10.7" @@ -3331,6 +3337,17 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "json-structural-diff" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e878e36a8a44c158505c2c818abdc1350413ad83dcb774a0459f6a7ef2b65cbf" +dependencies = [ + "difflib", + "regex", + "serde_json", +] + [[package]] name = "jsonwebtoken" version = "9.2.0" @@ -6443,6 +6460,7 @@ dependencies = [ "humantime", "hyper 0.14.30", "itertools 0.10.5", + "json-structural-diff", "lasso", "measured", "metrics", diff --git a/Cargo.toml b/Cargo.toml index 7228623c6b..21310ce6ec 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -210,6 +210,7 @@ rustls-native-certs = "0.8" x509-parser = "0.16" whoami = "1.5.1" zerocopy = { version = "0.7", features = ["derive"] } +json-structural-diff = { version = "0.2.0" } ## TODO replace this with tracing env_logger = "0.10" diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 28d130d9e0..2bf89b7bfa 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -335,13 +335,21 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'checkpoint_distance' as an integer")?, - checkpoint_timeout: settings.remove("checkpoint_timeout").map(|x| x.to_string()), + checkpoint_timeout: settings + .remove("checkpoint_timeout") + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'checkpoint_timeout' as duration")?, compaction_target_size: settings .remove("compaction_target_size") .map(|x| x.parse::()) .transpose() .context("Failed to parse 'compaction_target_size' as an integer")?, - compaction_period: settings.remove("compaction_period").map(|x| x.to_string()), + compaction_period: settings + .remove("compaction_period") + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'compaction_period' as duration")?, compaction_threshold: settings .remove("compaction_threshold") .map(|x| x.parse::()) @@ -387,7 +395,10 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'gc_horizon' as an integer")?, - gc_period: settings.remove("gc_period").map(|x| x.to_string()), + gc_period: settings.remove("gc_period") + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'gc_period' as duration")?, image_creation_threshold: settings .remove("image_creation_threshold") .map(|x| x.parse::()) @@ -403,13 +414,20 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 
'image_creation_preempt_threshold' as integer")?, - pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()), + pitr_interval: settings.remove("pitr_interval") + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'pitr_interval' as duration")?, walreceiver_connect_timeout: settings .remove("walreceiver_connect_timeout") - .map(|x| x.to_string()), + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'walreceiver_connect_timeout' as duration")?, lagging_wal_timeout: settings .remove("lagging_wal_timeout") - .map(|x| x.to_string()), + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'lagging_wal_timeout' as duration")?, max_lsn_wal_lag: settings .remove("max_lsn_wal_lag") .map(|x| x.parse::()) @@ -427,8 +445,14 @@ impl PageServerNode { .context("Failed to parse 'min_resident_size_override' as integer")?, evictions_low_residence_duration_metric_threshold: settings .remove("evictions_low_residence_duration_metric_threshold") - .map(|x| x.to_string()), - heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()), + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'evictions_low_residence_duration_metric_threshold' as duration")?, + heatmap_period: settings + .remove("heatmap_period") + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'heatmap_period' as duration")?, lazy_slru_download: settings .remove("lazy_slru_download") .map(|x| x.parse::()) @@ -439,10 +463,15 @@ impl PageServerNode { .map(serde_json::from_str) .transpose() .context("parse `timeline_get_throttle` from json")?, - lsn_lease_length: settings.remove("lsn_lease_length").map(|x| x.to_string()), + lsn_lease_length: settings.remove("lsn_lease_length") + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'lsn_lease_length' as duration")?, lsn_lease_length_for_ts: settings .remove("lsn_lease_length_for_ts") - .map(|x| x.to_string()), + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'lsn_lease_length_for_ts' as duration")?, timeline_offloading: settings .remove("timeline_offloading") .map(|x| x.parse::()) diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 953ade83ad..40b86e4110 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -959,7 +959,7 @@ async fn main() -> anyhow::Result<()> { threshold: threshold.into(), }, )), - heatmap_period: Some("300s".to_string()), + heatmap_period: Some(Duration::from_secs(300)), ..Default::default() }, }) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index dd7bea2916..1164048229 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -526,9 +526,13 @@ pub struct TenantConfigPatch { #[derive(Serialize, Deserialize, Debug, Default, Clone, Eq, PartialEq)] pub struct TenantConfig { pub checkpoint_distance: Option, - pub checkpoint_timeout: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub checkpoint_timeout: Option, pub compaction_target_size: Option, - pub compaction_period: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub compaction_period: Option, pub compaction_threshold: Option, pub compaction_upper_limit: Option, // defer parsing compaction_algorithm, like eviction_policy @@ -539,22 +543,38 @@ pub struct TenantConfig { pub l0_flush_stall_threshold: Option, pub l0_flush_wait_upload: Option, pub 
gc_horizon: Option, - pub gc_period: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub gc_period: Option, pub image_creation_threshold: Option, - pub pitr_interval: Option, - pub walreceiver_connect_timeout: Option, - pub lagging_wal_timeout: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub pitr_interval: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub walreceiver_connect_timeout: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub lagging_wal_timeout: Option, pub max_lsn_wal_lag: Option, pub eviction_policy: Option, pub min_resident_size_override: Option, - pub evictions_low_residence_duration_metric_threshold: Option, - pub heatmap_period: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub evictions_low_residence_duration_metric_threshold: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub heatmap_period: Option, pub lazy_slru_download: Option, pub timeline_get_throttle: Option, pub image_layer_creation_check_threshold: Option, pub image_creation_preempt_threshold: Option, - pub lsn_lease_length: Option, - pub lsn_lease_length_for_ts: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub lsn_lease_length: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub lsn_lease_length_for_ts: Option, pub timeline_offloading: Option, pub wal_receiver_protocol_override: Option, pub rel_size_v2_enabled: Option, @@ -564,7 +584,10 @@ pub struct TenantConfig { } impl TenantConfig { - pub fn apply_patch(self, patch: TenantConfigPatch) -> TenantConfig { + pub fn apply_patch( + self, + patch: TenantConfigPatch, + ) -> Result { let Self { mut checkpoint_distance, mut checkpoint_timeout, @@ -604,11 +627,17 @@ impl TenantConfig { } = self; patch.checkpoint_distance.apply(&mut checkpoint_distance); - patch.checkpoint_timeout.apply(&mut checkpoint_timeout); + patch + .checkpoint_timeout + .map(|v| humantime::parse_duration(&v))? + .apply(&mut checkpoint_timeout); patch .compaction_target_size .apply(&mut compaction_target_size); - patch.compaction_period.apply(&mut compaction_period); + patch + .compaction_period + .map(|v| humantime::parse_duration(&v))? + .apply(&mut compaction_period); patch.compaction_threshold.apply(&mut compaction_threshold); patch .compaction_upper_limit @@ -626,15 +655,25 @@ impl TenantConfig { .apply(&mut l0_flush_stall_threshold); patch.l0_flush_wait_upload.apply(&mut l0_flush_wait_upload); patch.gc_horizon.apply(&mut gc_horizon); - patch.gc_period.apply(&mut gc_period); + patch + .gc_period + .map(|v| humantime::parse_duration(&v))? + .apply(&mut gc_period); patch .image_creation_threshold .apply(&mut image_creation_threshold); - patch.pitr_interval.apply(&mut pitr_interval); + patch + .pitr_interval + .map(|v| humantime::parse_duration(&v))? + .apply(&mut pitr_interval); patch .walreceiver_connect_timeout + .map(|v| humantime::parse_duration(&v))? .apply(&mut walreceiver_connect_timeout); - patch.lagging_wal_timeout.apply(&mut lagging_wal_timeout); + patch + .lagging_wal_timeout + .map(|v| humantime::parse_duration(&v))? + .apply(&mut lagging_wal_timeout); patch.max_lsn_wal_lag.apply(&mut max_lsn_wal_lag); patch.eviction_policy.apply(&mut eviction_policy); patch @@ -642,8 +681,12 @@ impl TenantConfig { .apply(&mut min_resident_size_override); patch .evictions_low_residence_duration_metric_threshold + .map(|v| humantime::parse_duration(&v))? 
.apply(&mut evictions_low_residence_duration_metric_threshold); - patch.heatmap_period.apply(&mut heatmap_period); + patch + .heatmap_period + .map(|v| humantime::parse_duration(&v))? + .apply(&mut heatmap_period); patch.lazy_slru_download.apply(&mut lazy_slru_download); patch .timeline_get_throttle @@ -654,9 +697,13 @@ impl TenantConfig { patch .image_creation_preempt_threshold .apply(&mut image_creation_preempt_threshold); - patch.lsn_lease_length.apply(&mut lsn_lease_length); + patch + .lsn_lease_length + .map(|v| humantime::parse_duration(&v))? + .apply(&mut lsn_lease_length); patch .lsn_lease_length_for_ts + .map(|v| humantime::parse_duration(&v))? .apply(&mut lsn_lease_length_for_ts); patch.timeline_offloading.apply(&mut timeline_offloading); patch @@ -673,7 +720,7 @@ impl TenantConfig { .gc_compaction_ratio_percent .apply(&mut gc_compaction_ratio_percent); - Self { + Ok(Self { checkpoint_distance, checkpoint_timeout, compaction_target_size, @@ -709,7 +756,7 @@ impl TenantConfig { gc_compaction_enabled, gc_compaction_initial_threshold_kb, gc_compaction_ratio_percent, - } + }) } } @@ -2503,7 +2550,7 @@ mod tests { ..base.clone() }; - let patched = base.apply_patch(decoded.config); + let patched = base.apply_patch(decoded.config).unwrap(); assert_eq!(patched, expected); } diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index c6bcfdf2fb..ab4c4c935d 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -693,16 +693,15 @@ impl TryFrom<&'_ models::TenantConfig> for TenantConfOpt { /// This is a conversion from our internal tenant config object to the one used /// in external APIs. impl From for models::TenantConfig { + // TODO(vlad): These are now the same, but they have different serialization logic. + // Can we merge them? 
fn from(value: TenantConfOpt) -> Self { - fn humantime(d: Duration) -> String { - format!("{}s", d.as_secs()) - } Self { checkpoint_distance: value.checkpoint_distance, - checkpoint_timeout: value.checkpoint_timeout.map(humantime), + checkpoint_timeout: value.checkpoint_timeout, compaction_algorithm: value.compaction_algorithm, compaction_target_size: value.compaction_target_size, - compaction_period: value.compaction_period.map(humantime), + compaction_period: value.compaction_period, compaction_threshold: value.compaction_threshold, compaction_upper_limit: value.compaction_upper_limit, compaction_l0_first: value.compaction_l0_first, @@ -711,24 +710,23 @@ impl From for models::TenantConfig { l0_flush_stall_threshold: value.l0_flush_stall_threshold, l0_flush_wait_upload: value.l0_flush_wait_upload, gc_horizon: value.gc_horizon, - gc_period: value.gc_period.map(humantime), + gc_period: value.gc_period, image_creation_threshold: value.image_creation_threshold, - pitr_interval: value.pitr_interval.map(humantime), - walreceiver_connect_timeout: value.walreceiver_connect_timeout.map(humantime), - lagging_wal_timeout: value.lagging_wal_timeout.map(humantime), + pitr_interval: value.pitr_interval, + walreceiver_connect_timeout: value.walreceiver_connect_timeout, + lagging_wal_timeout: value.lagging_wal_timeout, max_lsn_wal_lag: value.max_lsn_wal_lag, eviction_policy: value.eviction_policy, min_resident_size_override: value.min_resident_size_override, evictions_low_residence_duration_metric_threshold: value - .evictions_low_residence_duration_metric_threshold - .map(humantime), - heatmap_period: value.heatmap_period.map(humantime), + .evictions_low_residence_duration_metric_threshold, + heatmap_period: value.heatmap_period, lazy_slru_download: value.lazy_slru_download, timeline_get_throttle: value.timeline_get_throttle, image_layer_creation_check_threshold: value.image_layer_creation_check_threshold, image_creation_preempt_threshold: value.image_creation_preempt_threshold, - lsn_lease_length: value.lsn_lease_length.map(humantime), - lsn_lease_length_for_ts: value.lsn_lease_length_for_ts.map(humantime), + lsn_lease_length: value.lsn_lease_length, + lsn_lease_length_for_ts: value.lsn_lease_length_for_ts, timeline_offloading: value.timeline_offloading, wal_receiver_protocol_override: value.wal_receiver_protocol_override, rel_size_v2_enabled: value.rel_size_v2_enabled, @@ -760,29 +758,10 @@ mod tests { assert_eq!(small_conf, serde_json::from_str(&json_form).unwrap()); } - #[test] - fn test_try_from_models_tenant_config_err() { - let tenant_config = models::TenantConfig { - lagging_wal_timeout: Some("5a".to_string()), - ..TenantConfig::default() - }; - - let tenant_conf_opt = TenantConfOpt::try_from(&tenant_config); - - assert!( - tenant_conf_opt.is_err(), - "Suceeded to convert TenantConfig to TenantConfOpt" - ); - - let expected_error_str = - "lagging_wal_timeout: invalid value: string \"5a\", expected a duration"; - assert_eq!(tenant_conf_opt.unwrap_err().to_string(), expected_error_str); - } - #[test] fn test_try_from_models_tenant_config_success() { let tenant_config = models::TenantConfig { - lagging_wal_timeout: Some("5s".to_string()), + lagging_wal_timeout: Some(Duration::from_secs(5)), ..TenantConfig::default() }; diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index 73dc1a5c10..08c80bc141 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -24,6 +24,7 @@ hex.workspace = true hyper0.workspace = true humantime.workspace = true 
itertools.workspace = true +json-structural-diff.workspace = true lasso.workspace = true once_cell.workspace = true pageserver_api.workspace = true diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 4fda7338e5..4f0f170284 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -1,6 +1,7 @@ use crate::pageserver_client::PageserverClient; use crate::persistence::Persistence; use crate::{compute_hook, service}; +use json_structural_diff::JsonDiff; use pageserver_api::controller_api::{AvailabilityZone, MigrationConfig, PlacementPolicy}; use pageserver_api::models::{ LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig, TenantWaitLsnRequest, @@ -24,7 +25,7 @@ use crate::compute_hook::{ComputeHook, NotifyError}; use crate::node::Node; use crate::tenant_shard::{IntentState, ObservedState, ObservedStateDelta, ObservedStateLocation}; -const DEFAULT_HEATMAP_PERIOD: &str = "60s"; +const DEFAULT_HEATMAP_PERIOD: Duration = Duration::from_secs(60); /// Object with the lifetime of the background reconcile task that is created /// for tenants which have a difference between their intent and observed states. @@ -880,7 +881,27 @@ impl Reconciler { self.generation = Some(generation); wanted_conf.generation = generation.into(); } - tracing::info!(node_id=%node.get_id(), "Observed configuration requires update."); + + let diff = match observed { + Some(ObservedStateLocation { + conf: Some(observed), + }) => { + let diff = JsonDiff::diff( + &serde_json::to_value(observed.clone()).unwrap(), + &serde_json::to_value(wanted_conf.clone()).unwrap(), + false, + ); + + if let Some(json_diff) = diff.diff { + serde_json::to_string(&json_diff).unwrap_or("diff err".to_string()) + } else { + "unknown".to_string() + } + } + _ => "full".to_string(), + }; + + tracing::info!(node_id=%node.get_id(), "Observed configuration requires update: {diff}"); // Because `node` comes from a ref to &self, clone it before calling into a &mut self // function: this could be avoided by refactoring the state mutated by location_config into @@ -1180,7 +1201,7 @@ fn ha_aware_config(config: &TenantConfig, has_secondaries: bool) -> TenantConfig let mut config = config.clone(); if has_secondaries { if config.heatmap_period.is_none() { - config.heatmap_period = Some(DEFAULT_HEATMAP_PERIOD.to_string()); + config.heatmap_period = Some(DEFAULT_HEATMAP_PERIOD); } } else { config.heatmap_period = None; diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 14eacfd422..b9c2711192 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -2926,7 +2926,9 @@ impl Service { first }; - let updated_config = base.apply_patch(patch); + let updated_config = base + .apply_patch(patch) + .map_err(|err| ApiError::BadRequest(anyhow::anyhow!(err)))?; self.set_tenant_config_and_reconcile(tenant_id, updated_config) .await } @@ -6654,11 +6656,12 @@ impl Service { ) -> Option { let reconcile_needed = shard.get_reconcile_needed(nodes); - match reconcile_needed { + let reconcile_reason = match reconcile_needed { ReconcileNeeded::No => return None, ReconcileNeeded::WaitExisting(waiter) => return Some(waiter), - ReconcileNeeded::Yes => { + ReconcileNeeded::Yes(reason) => { // Fall through to try and acquire units for spawning reconciler + reason } }; @@ -6697,6 +6700,7 @@ impl Service { }; shard.spawn_reconciler( + reconcile_reason, &self.result_tx, nodes, &self.compute_hook, diff --git 
a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index 219c0dffe7..56a36dc2df 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -481,7 +481,14 @@ pub(crate) enum ReconcileNeeded { /// spawned: wait for the existing reconciler rather than spawning a new one. WaitExisting(ReconcilerWaiter), /// shard needs reconciliation: call into [`TenantShard::spawn_reconciler`] - Yes, + Yes(ReconcileReason), +} + +#[derive(Debug)] +pub(crate) enum ReconcileReason { + ActiveNodesDirty, + UnknownLocation, + PendingComputeNotification, } /// Pending modification to the observed state of a tenant shard. @@ -1341,12 +1348,18 @@ impl TenantShard { let active_nodes_dirty = self.dirty(pageservers); - // Even if there is no pageserver work to be done, if we have a pending notification to computes, - // wake up a reconciler to send it. - let do_reconcile = - active_nodes_dirty || dirty_observed || self.pending_compute_notification; + let reconcile_needed = match ( + active_nodes_dirty, + dirty_observed, + self.pending_compute_notification, + ) { + (true, _, _) => ReconcileNeeded::Yes(ReconcileReason::ActiveNodesDirty), + (_, true, _) => ReconcileNeeded::Yes(ReconcileReason::UnknownLocation), + (_, _, true) => ReconcileNeeded::Yes(ReconcileReason::PendingComputeNotification), + _ => ReconcileNeeded::No, + }; - if !do_reconcile { + if matches!(reconcile_needed, ReconcileNeeded::No) { tracing::debug!("Not dirty, no reconciliation needed."); return ReconcileNeeded::No; } @@ -1389,7 +1402,7 @@ impl TenantShard { } } - ReconcileNeeded::Yes + reconcile_needed } /// Ensure the sequence number is set to a value where waiting for this value will make us wait @@ -1479,6 +1492,7 @@ impl TenantShard { #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] pub(crate) fn spawn_reconciler( &mut self, + reason: ReconcileReason, result_tx: &tokio::sync::mpsc::UnboundedSender, pageservers: &Arc>, compute_hook: &Arc, @@ -1538,7 +1552,7 @@ impl TenantShard { let reconcile_seq = self.sequence; let long_reconcile_threshold = service_config.long_reconcile_threshold; - tracing::info!(seq=%reconcile_seq, "Spawning Reconciler for sequence {}", self.sequence); + tracing::info!(seq=%reconcile_seq, "Spawning Reconciler ({reason:?})"); let must_notify = self.pending_compute_notification; let reconciler_span = tracing::info_span!(parent: None, "reconciler", seq=%reconcile_seq, tenant_id=%reconciler.tenant_shard_id.tenant_id, diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 7e895422d2..d18cbb3393 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -3817,3 +3817,43 @@ def test_update_node_on_registration(neon_env_builder: NeonEnvBuilder): nodes = env.storage_controller.node_list() assert len(nodes) == 1 assert nodes[0]["listen_https_port"] is None + + +def test_storage_controller_location_conf_equivalence(neon_env_builder: NeonEnvBuilder): + """ + Validate that a storage controller restart with no shards in a transient state + performs zero reconciliations at start-up. Implicitly, this means that the location + configs returned by the pageserver are identical to the persisted state in the + storage controller database. 
+ """ + neon_env_builder.num_pageservers = 1 + neon_env_builder.storage_controller_config = { + "start_as_candidate": False, + } + + env = neon_env_builder.init_configs() + env.start() + + tenant_id = TenantId.generate() + env.storage_controller.tenant_create( + tenant_id, shard_count=2, tenant_config={"pitr_interval": "1h2m3s"} + ) + + env.storage_controller.reconcile_until_idle() + + reconciles_before_restart = env.storage_controller.get_metric_value( + "storage_controller_reconcile_complete_total", filter={"status": "ok"} + ) + + assert reconciles_before_restart != 0 + + env.storage_controller.stop() + env.storage_controller.start() + + env.storage_controller.reconcile_until_idle() + + reconciles_after_restart = env.storage_controller.get_metric_value( + "storage_controller_reconcile_complete_total", filter={"status": "ok"} + ) + + assert reconciles_after_restart == 0 From b1d8771d5f1f21f29e86a5e6c10213fded6a0a75 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 21 Feb 2025 18:56:16 +0200 Subject: [PATCH 553/583] Store prefetch results in LFC cache once as soon as they are received (#10442) ## Problem Prefetch is performed locally, so different backends can request the same pages from the PS. Such duplicated requests increase page server load and network traffic. Making prefetch global seems to be very difficult and undesirable, because different queries can access chunks at different speeds. Storing prefetched chunks in the LFC will not completely eliminate duplicates, but it can minimise such requests. The problem with storing prefetch results in the LFC is that in this case the page is not protected by a shared buffer lock, so we have to perform extra synchronisation on the LFC side. See: https://neondb.slack.com/archives/C0875PUD0LC/p1736772890602029?thread_ts=1736762541.116949&cid=C0875PUD0LC @MMeent's implementation of prewarm: see https://github.com/neondatabase/neon/pull/10312/ ## Summary of changes Use condition variables to synchronize access to LFC entries. --------- Co-authored-by: Konstantin Knizhnik --- pgxn/neon/file_cache.c | 764 +++++++++++++++-------- pgxn/neon/neon.c | 2 + pgxn/neon/neon.h | 2 + pgxn/neon/pagestore_client.h | 7 +- pgxn/neon/pagestore_smgr.c | 185 +++++- test_runner/regress/test_lfc_prefetch.py | 101 +++ 6 files changed, 766 insertions(+), 295 deletions(-) create mode 100644 test_runner/regress/test_lfc_prefetch.py diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index a61dc9f4c6..f6a577abfc 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -22,6 +22,7 @@ #include "neon_pgversioncompat.h" #include "access/parallel.h" +#include "access/xlog.h" #include "funcapi.h" #include "miscadmin.h" #include "pagestore_client.h" @@ -40,12 +41,16 @@ #include "utils/dynahash.h" #include "utils/guc.h" +#if PG_VERSION_NUM >= 150000 +#include "access/xlogrecovery.h" +#endif + #include "hll.h" #include "bitmap.h" #include "neon.h" #include "neon_perf_counters.h" -#define CriticalAssert(cond) do if (!(cond)) elog(PANIC, "Assertion %s failed at %s:%d: ", #cond, __FILE__, __LINE__); while (0) +#define CriticalAssert(cond) do if (!(cond)) elog(PANIC, "LFC: assertion %s failed at %s:%d: ", #cond, __FILE__, __LINE__); while (0) /* * Local file cache is used to temporary store relations pages in local file system.
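To make the synchronisation scheme introduced by this patch easier to follow, here is a minimal, self-contained sketch of the per-block handshake between a writer that stores a page in the LFC and a concurrent reader. It deliberately uses plain POSIX threads instead of PostgreSQL's LWLock/ConditionVariable primitives, and the names (lfc_mutex, lfc_cv, block_state) are illustrative only; they do not appear in the patch.

/*
 * Minimal sketch of the PENDING/REQUESTED handshake for a single block,
 * using POSIX threads rather than the Postgres synchronization primitives.
 */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

typedef enum { UNAVAILABLE, AVAILABLE, PENDING, REQUESTED } BlockState;

static pthread_mutex_t lfc_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  lfc_cv    = PTHREAD_COND_INITIALIZER;
static BlockState      block_state = UNAVAILABLE;

/* Writer: mark the block PENDING, do the I/O outside the lock, then publish. */
static void *writer(void *arg)
{
    (void) arg;
    pthread_mutex_lock(&lfc_mutex);
    block_state = PENDING;
    pthread_mutex_unlock(&lfc_mutex);

    usleep(1000);                            /* the pwrite() of the page happens here */

    pthread_mutex_lock(&lfc_mutex);
    if (block_state == REQUESTED)
        pthread_cond_broadcast(&lfc_cv);     /* wake every backend parked on this block */
    block_state = AVAILABLE;
    pthread_mutex_unlock(&lfc_mutex);
    return NULL;
}

/* Reader: if the block is in flight, flag interest and sleep until it is published. */
static void *reader(void *arg)
{
    (void) arg;
    pthread_mutex_lock(&lfc_mutex);
    while (block_state == PENDING || block_state == REQUESTED)
    {
        block_state = REQUESTED;             /* tell the writer somebody is waiting */
        pthread_cond_wait(&lfc_cv, &lfc_mutex);
    }
    printf("block state: %s\n", block_state == AVAILABLE ? "AVAILABLE" : "UNAVAILABLE");
    pthread_mutex_unlock(&lfc_mutex);
    return NULL;
}

int main(void)
{
    pthread_t w, r;
    pthread_create(&w, NULL, writer, NULL);
    pthread_create(&r, NULL, reader, NULL);
    pthread_join(w, NULL);
    pthread_join(r, NULL);
    return 0;
}

In the patch itself the wait loop additionally re-checks lfc_ctl->generation and uses ConditionVariableTimedSleep() with CV_WAIT_TIMEOUT, so a backend does not sleep indefinitely on a cache that has been resized or disabled in the meantime.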
@@ -93,7 +98,23 @@ #define MB ((uint64)1024*1024) #define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ / BLOCKS_PER_CHUNK)) -#define CHUNK_BITMAP_SIZE ((BLOCKS_PER_CHUNK + 31) / 32) + +/* + * Blocks are read from or written to the LFC file outside the LFC critical section. + * To synchronize access to such a block, the writer sets the state of the block to PENDING. + * If some other backend (reader or writer) sees the PENDING status, it changes it to REQUESTED and starts + * waiting on a condition variable until the status is changed. + * When the writer completes its operation, it checks whether the status is REQUESTED and, if so, broadcasts on the condition variable, + * waking up all backends waiting for access to this block. + */ +typedef enum FileCacheBlockState +{ + UNAVAILABLE, /* block is not present in cache */ + AVAILABLE, /* block can be used */ + PENDING, /* block is being loaded */ + REQUESTED /* some other backend is waiting for block to be loaded */ +} FileCacheBlockState; + typedef struct FileCacheEntry { @@ -101,10 +122,16 @@ typedef struct FileCacheEntry uint32 hash; uint32 offset; uint32 access_count; - uint32 bitmap[CHUNK_BITMAP_SIZE]; + uint32 state[(BLOCKS_PER_CHUNK + 31) / 32 * 2]; /* two bits per block */ dlist_node list_node; /* LRU/holes list node */ } FileCacheEntry; +#define GET_STATE(entry, i) (((entry)->state[(i) / 16] >> ((i) % 16 * 2)) & 3) +#define SET_STATE(entry, i, new_state) (entry)->state[(i) / 16] = ((entry)->state[(i) / 16] & ~(3 << ((i) % 16 * 2))) | ((new_state) << ((i) % 16 * 2)) + +#define N_COND_VARS 64 +#define CV_WAIT_TIMEOUT 10 + typedef struct FileCacheControl { uint64 generation; /* generation is needed to handle correct hash @@ -118,18 +145,24 @@ typedef struct FileCacheControl uint64 writes; /* number of writes issued */ uint64 time_read; /* time spent reading (us) */ uint64 time_write; /* time spent writing (us) */ + uint64 resizes; /* number of LFC resizes */ + uint64 evicted_pages; /* number of evicted pages */ dlist_head lru; /* double linked list for LRU replacement * algorithm */ dlist_head holes; /* double linked list of punched holes */ HyperLogLogState wss_estimation; /* estimation of working set size */ + ConditionVariable cv[N_COND_VARS]; /* turnstile of condition variables */ } FileCacheControl; +bool lfc_store_prefetch_result; + static HTAB *lfc_hash; -static int lfc_desc = 0; +static int lfc_desc = -1; static LWLockId lfc_lock; static int lfc_max_size; static int lfc_size_limit; static char *lfc_path; +static uint64 lfc_generation; static FileCacheControl *lfc_ctl; static shmem_startup_hook_type prev_shmem_startup_hook; #if PG_VERSION_NUM>=150000 @@ -138,6 +171,20 @@ static shmem_request_hook_type prev_shmem_request_hook; #define LFC_ENABLED() (lfc_ctl->limit != 0) +/* + * Close LFC file if opened. + * All backends should close their LFC files once LFC is disabled. + */ +static void +lfc_close_file(void) +{ + if (lfc_desc >= 0) + { + close(lfc_desc); + lfc_desc = -1; + } +} + /* * Local file cache is optional and Neon can work without it. * In case of any any errors with this cache, we should disable it but to not throw error.
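The bitmap-to-state change above packs sixteen two-bit FileCacheBlockState values into each uint32 of the per-chunk state[] array, which is why the array is exactly twice the size of the old one-bit bitmap. The following stand-alone sketch mirrors the GET_STATE/SET_STATE arithmetic; BLOCKS_PER_CHUNK is assumed to be 128 here purely for illustration, and the Chunk struct and main() driver are not part of the patch.

/*
 * Standalone illustration of the two-bit-per-block state packing used by
 * GET_STATE/SET_STATE. Sixteen states fit in each uint32 word.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define BLOCKS_PER_CHUNK 128     /* assumed value, for the example only */

enum { UNAVAILABLE, AVAILABLE, PENDING, REQUESTED };

typedef struct
{
    /* twice the size of the old one-bit bitmap: two bits per block */
    uint32_t state[(BLOCKS_PER_CHUNK + 31) / 32 * 2];
} Chunk;

#define GET_STATE(c, i) (((c)->state[(i) / 16] >> ((i) % 16 * 2)) & 3)
#define SET_STATE(c, i, s) \
    ((c)->state[(i) / 16] = ((c)->state[(i) / 16] & ~(3u << ((i) % 16 * 2))) | \
                            ((uint32_t)(s) << ((i) % 16 * 2)))

int main(void)
{
    Chunk chunk = {{0}};                 /* all blocks start UNAVAILABLE (0) */

    SET_STATE(&chunk, 0, PENDING);       /* block 0 is being written */
    SET_STATE(&chunk, 17, AVAILABLE);    /* block 17 lands in the second word */

    assert(GET_STATE(&chunk, 0) == PENDING);
    assert(GET_STATE(&chunk, 17) == AVAILABLE);
    assert(GET_STATE(&chunk, 1) == UNAVAILABLE);

    SET_STATE(&chunk, 0, AVAILABLE);     /* update one state without touching neighbours */
    assert(GET_STATE(&chunk, 0) == AVAILABLE);
    assert(GET_STATE(&chunk, 1) == UNAVAILABLE);

    printf("state words: %08x %08x\n",
           (unsigned) chunk.state[0], (unsigned) chunk.state[1]);
    return 0;
}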
@@ -145,20 +192,16 @@ static shmem_request_hook_type prev_shmem_request_hook; * All cache content should be invalidated to avoid reading of stale or corrupted data */ static void -lfc_disable(char const *op) +lfc_switch_off(void) { int fd; - elog(WARNING, "Failed to %s local file cache at %s: %m, disabling local file cache", op, lfc_path); - - /* Invalidate hash */ - LWLockAcquire(lfc_lock, LW_EXCLUSIVE); - if (LFC_ENABLED()) { HASH_SEQ_STATUS status; FileCacheEntry *entry; + /* Invalidate hash */ hash_seq_init(&status, lfc_hash); while ((entry = hash_seq_search(&status)) != NULL) { @@ -171,41 +214,33 @@ lfc_disable(char const *op) dlist_init(&lfc_ctl->lru); dlist_init(&lfc_ctl->holes); - if (lfc_desc > 0) - { - int rc; + /* + * We need to use unlink to to avoid races in LFC write, because it is not + * protected by lock + */ + unlink(lfc_path); - /* - * If the reason of error is ENOSPC, then truncation of file may - * help to reclaim some space - */ - pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_TRUNCATE); - rc = ftruncate(lfc_desc, 0); - pgstat_report_wait_end(); + fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC); + if (fd < 0) + elog(WARNING, "LFC: failed to recreate local file cache %s: %m", lfc_path); + else + close(fd); - if (rc < 0) - elog(WARNING, "Failed to truncate local file cache %s: %m", lfc_path); - } + /* Wakeup waiting backends */ + for (int i = 0; i < N_COND_VARS; i++) + ConditionVariableBroadcast(&lfc_ctl->cv[i]); } + lfc_close_file(); +} - /* - * We need to use unlink to to avoid races in LFC write, because it is not - * protectedby - */ - unlink(lfc_path); - - fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC); - if (fd < 0) - elog(WARNING, "Failed to recreate local file cache %s: %m", lfc_path); - else - close(fd); +static void +lfc_disable(char const *op) +{ + elog(WARNING, "LFC: failed to %s local file cache at %s: %m, disabling local file cache", op, lfc_path); + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + lfc_switch_off(); LWLockRelease(lfc_lock); - - if (lfc_desc > 0) - close(lfc_desc); - - lfc_desc = -1; } /* @@ -217,13 +252,20 @@ lfc_maybe_disabled(void) return !lfc_ctl || !LFC_ENABLED(); } +/* + * Open LFC file if not opened yet or generation is changed. + * Should be called under LFC lock. 
+ */ static bool lfc_ensure_opened(void) { - bool enabled = !lfc_maybe_disabled(); - + if (lfc_generation != lfc_ctl->generation) + { + lfc_close_file(); + lfc_generation = lfc_ctl->generation; + } /* Open cache file if not done yet */ - if (lfc_desc <= 0 && enabled) + if (lfc_desc < 0) { lfc_desc = BasicOpenFile(lfc_path, O_RDWR); @@ -233,7 +275,7 @@ lfc_ensure_opened(void) return false; } } - return enabled; + return true; } static void @@ -267,14 +309,7 @@ lfc_shmem_startup(void) n_chunks + 1, n_chunks + 1, &info, HASH_ELEM | HASH_BLOBS); - lfc_ctl->generation = 0; - lfc_ctl->size = 0; - lfc_ctl->used = 0; - lfc_ctl->hits = 0; - lfc_ctl->misses = 0; - lfc_ctl->writes = 0; - lfc_ctl->time_read = 0; - lfc_ctl->time_write = 0; + memset(lfc_ctl, 0, sizeof(FileCacheControl)); dlist_init(&lfc_ctl->lru); dlist_init(&lfc_ctl->holes); @@ -285,7 +320,7 @@ lfc_shmem_startup(void) fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC); if (fd < 0) { - elog(WARNING, "Failed to create local file cache %s: %m", lfc_path); + elog(WARNING, "LFC: failed to create local file cache %s: %m", lfc_path); lfc_ctl->limit = 0; } else @@ -293,6 +328,11 @@ lfc_shmem_startup(void) close(fd); lfc_ctl->limit = SIZE_MB_TO_CHUNKS(lfc_size_limit); } + + /* Initialize turnstile of condition variables */ + for (int i = 0; i < N_COND_VARS; i++) + ConditionVariableInit(&lfc_ctl->cv[i]); + } LWLockRelease(AddinShmemInitLock); } @@ -327,7 +367,7 @@ lfc_check_limit_hook(int *newval, void **extra, GucSource source) { if (*newval > lfc_max_size) { - elog(ERROR, "neon.file_cache_size_limit can not be larger than neon.max_file_cache_size"); + elog(ERROR, "LFC: neon.file_cache_size_limit can not be larger than neon.max_file_cache_size"); return false; } return true; @@ -338,14 +378,31 @@ lfc_change_limit_hook(int newval, void *extra) { uint32 new_size = SIZE_MB_TO_CHUNKS(newval); - if (!is_normal_backend()) - return; - - if (!lfc_ensure_opened()) + if (!lfc_ctl || !is_normal_backend()) return; LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + /* Open LFC file only if LFC was enabled or we are going to reenable it */ + if (newval == 0 && !LFC_ENABLED()) + { + LWLockRelease(lfc_lock); + /* File should be reopened if LFC is reenabled */ + lfc_close_file(); + return; + } + + if (!lfc_ensure_opened()) + { + LWLockRelease(lfc_lock); + return; + } + + if (lfc_ctl->limit != new_size) + { + lfc_ctl->resizes += 1; + } + while (new_size < lfc_ctl->used && !dlist_is_empty(&lfc_ctl->lru)) { /* @@ -367,7 +424,9 @@ lfc_change_limit_hook(int newval, void *extra) /* We remove the old entry, and re-enter a hole to the hash table */ for (int i = 0; i < BLOCKS_PER_CHUNK; i++) { - lfc_ctl->used_pages -= (victim->bitmap[i >> 5] >> (i & 31)) & 1; + bool is_page_cached = GET_STATE(victim, i) == AVAILABLE; + lfc_ctl->used_pages -= is_page_cached; + lfc_ctl->evicted_pages += is_page_cached; } hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL); @@ -383,10 +442,11 @@ lfc_change_limit_hook(int newval, void *extra) lfc_ctl->used -= 1; } - lfc_ctl->limit = new_size; - if (new_size == 0) { - lfc_ctl->generation += 1; - } + if (new_size == 0) + lfc_switch_off(); + else + lfc_ctl->limit = new_size; + neon_log(DEBUG1, "set local file cache limit to %d", new_size); LWLockRelease(lfc_lock); @@ -403,6 +463,17 @@ lfc_init(void) neon_log(ERROR, "Neon module should be loaded via shared_preload_libraries"); + DefineCustomBoolVariable("neon.store_prefetch_result_in_lfc", + "Immediately store received prefetch result in LFC", + NULL, + 
&lfc_store_prefetch_result, + false, + PGC_SUSET, + 0, + NULL, + NULL, + NULL); + DefineCustomIntVariable("neon.max_file_cache_size", "Maximal size of Neon local file cache", NULL, @@ -480,7 +551,7 @@ lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno) if (LFC_ENABLED()) { entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL); - found = entry != NULL && (entry->bitmap[chunk_offs >> 5] & ((uint32)1 << (chunk_offs & 31))) != 0; + found = entry != NULL && GET_STATE(entry, chunk_offs) != UNAVAILABLE; } LWLockRelease(lfc_lock); return found; @@ -529,8 +600,7 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, { for (; chunk_offs < BLOCKS_PER_CHUNK && i < nblocks; chunk_offs++, i++) { - if ((entry->bitmap[chunk_offs >> 5] & - ((uint32)1 << (chunk_offs & 31))) != 0) + if (GET_STATE(entry, chunk_offs) != UNAVAILABLE) { BITMAP_SET(bitmap, i); found++; @@ -541,7 +611,6 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, { i += this_chunk; } - /* * Break out of the iteration before doing expensive stuff for * a next iteration @@ -577,87 +646,6 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, return found; } -/* - * Evict a page (if present) from the local file cache - */ -void -lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno) -{ - BufferTag tag; - FileCacheEntry *entry; - bool found; - int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1); - uint32 hash; - - if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */ - return; - - CopyNRelFileInfoToBufTag(tag, rinfo); - tag.forkNum = forkNum; - tag.blockNum = (blkno & ~(BLOCKS_PER_CHUNK - 1)); - - CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); - hash = get_hash_value(lfc_hash, &tag); - - LWLockAcquire(lfc_lock, LW_EXCLUSIVE); - - if (!LFC_ENABLED()) - { - LWLockRelease(lfc_lock); - return; - } - - entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, &found); - - if (!found) - { - /* nothing to do */ - LWLockRelease(lfc_lock); - return; - } - - /* remove the page from the cache */ - entry->bitmap[chunk_offs >> 5] &= ~((uint32)1 << (chunk_offs & (32 - 1))); - - if (entry->access_count == 0) - { - /* - * If the chunk has no live entries, we can position the chunk to be - * recycled first. - */ - if (entry->bitmap[chunk_offs >> 5] == 0) - { - bool has_remaining_pages = false; - - for (int i = 0; i < CHUNK_BITMAP_SIZE; i++) - { - if (entry->bitmap[i] != 0) - { - has_remaining_pages = true; - break; - } - } - - /* - * Put the entry at the position that is first to be reclaimed when we - * have no cached pages remaining in the chunk - */ - if (!has_remaining_pages) - { - dlist_delete(&entry->list_node); - dlist_push_head(&lfc_ctl->lru, &entry->list_node); - } - } - } - - /* - * Done: apart from empty chunks, we don't move chunks in the LRU when - * they're empty because eviction isn't usage. - */ - - LWLockRelease(lfc_lock); -} - /* * Try to read pages from local cache. 
* Returns the number of pages read from the local cache, and sets bits in @@ -685,17 +673,14 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, int buf_offset = 0; if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */ - return 0; - - if (!lfc_ensure_opened()) - return 0; + return -1; CopyNRelFileInfoToBufTag(tag, rinfo); tag.forkNum = forkNum; CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); - /* + /* * For every chunk that has blocks we're interested in, we * 1. get the chunk header * 2. Check if the chunk actually has the blocks we're interested in @@ -712,22 +697,35 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, int iteration_hits = 0; int iteration_misses = 0; uint64 io_time_us = 0; + int n_blocks_to_read = 0; + ConditionVariable* cv; + Assert(blocks_in_chunk > 0); for (int i = 0; i < blocks_in_chunk; i++) { + n_blocks_to_read += (BITMAP_ISSET(mask, buf_offset + i) != 0); iov[i].iov_base = buffers[buf_offset + i]; iov[i].iov_len = BLCKSZ; + BITMAP_CLR(mask, buf_offset + i); + } + if (n_blocks_to_read == 0) + { + buf_offset += blocks_in_chunk; + nblocks -= blocks_in_chunk; + blkno += blocks_in_chunk; + continue; } tag.blockNum = blkno - chunk_offs; hash = get_hash_value(lfc_hash, &tag); + cv = &lfc_ctl->cv[hash % N_COND_VARS]; LWLockAcquire(lfc_lock, LW_EXCLUSIVE); /* We can return the blocks we've read before LFC got disabled; * assuming we read any. */ - if (!LFC_ENABLED()) + if (!LFC_ENABLED() || !lfc_ensure_opened()) { LWLockRelease(lfc_lock); return blocks_read; @@ -763,15 +761,32 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, generation = lfc_ctl->generation; entry_offset = entry->offset; - LWLockRelease(lfc_lock); - for (int i = 0; i < blocks_in_chunk; i++) { - /* - * If the page is valid, we consider it "read". - * All other pages will be fetched separately by the next cache - */ - if (entry->bitmap[(chunk_offs + i) / 32] & ((uint32)1 << ((chunk_offs + i) % 32))) + FileCacheBlockState state = UNAVAILABLE; + bool sleeping = false; + while (lfc_ctl->generation == generation) + { + state = GET_STATE(entry, chunk_offs + i); + if (state == PENDING) { + SET_STATE(entry, chunk_offs + i, REQUESTED); + } else if (state != REQUESTED) { + break; + } + if (!sleeping) + { + ConditionVariablePrepareToSleep(cv); + sleeping = true; + } + LWLockRelease(lfc_lock); + ConditionVariableTimedSleep(cv, CV_WAIT_TIMEOUT, WAIT_EVENT_NEON_LFC_CV_WAIT); + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + } + if (sleeping) + { + ConditionVariableCancelSleep(); + } + if (state == AVAILABLE) { BITMAP_SET(mask, buf_offset + i); iteration_hits++; @@ -779,6 +794,7 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, else iteration_misses++; } + LWLockRelease(lfc_lock); Assert(iteration_hits + iteration_misses > 0); @@ -820,6 +836,7 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, else { /* generation mismatch, assume error condition */ + lfc_close_file(); LWLockRelease(lfc_lock); return -1; } @@ -835,6 +852,249 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, return blocks_read; } +/* + * Initialize new LFC hash entry, perform eviction if needed. + * Returns false if there are no unpinned entries and chunk can not be added. 
+ */ +static bool +lfc_init_new_entry(FileCacheEntry* entry, uint32 hash) +{ + /*----------- + * If the chunk wasn't already in the LFC then we have these + * options, in order of preference: + * + * Unless there is no space available, we can: + * 1. Use an entry from the `holes` list, and + * 2. Create a new entry. + * We can always, regardless of space in the LFC: + * 3. evict an entry from LRU, and + * 4. ignore the write operation (the least favorite option) + */ + if (lfc_ctl->used < lfc_ctl->limit) + { + if (!dlist_is_empty(&lfc_ctl->holes)) + { + /* We can reuse a hole that was left behind when the LFC was shrunk previously */ + FileCacheEntry *hole = dlist_container(FileCacheEntry, list_node, + dlist_pop_head_node(&lfc_ctl->holes)); + uint32 offset = hole->offset; + bool hole_found; + + hash_search_with_hash_value(lfc_hash, &hole->key, + hole->hash, HASH_REMOVE, &hole_found); + CriticalAssert(hole_found); + + lfc_ctl->used += 1; + entry->offset = offset; /* reuse the hole */ + } + else + { + lfc_ctl->used += 1; + entry->offset = lfc_ctl->size++;/* allocate new chunk at end + * of file */ + } + } + /* + * We've already used up all allocated LFC entries. + * + * If we can clear an entry from the LRU, do that. + * If we can't (e.g. because all other slots are being accessed) + * then we will remove this entry from the hash and continue + * on to the next chunk, as we may not exceed the limit. + */ + else if (!dlist_is_empty(&lfc_ctl->lru)) + { + /* Cache overflow: evict least recently used chunk */ + FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, + dlist_pop_head_node(&lfc_ctl->lru)); + + for (int i = 0; i < BLOCKS_PER_CHUNK; i++) + { + bool is_page_cached = GET_STATE(victim, i) == AVAILABLE; + lfc_ctl->used_pages -= is_page_cached; + lfc_ctl->evicted_pages += is_page_cached; + } + + CriticalAssert(victim->access_count == 0); + entry->offset = victim->offset; /* grab victim's chunk */ + hash_search_with_hash_value(lfc_hash, &victim->key, + victim->hash, HASH_REMOVE, NULL); + neon_log(DEBUG2, "Swap file cache page"); + } + else + { + /* Can't add this chunk - we don't have the space for it */ + hash_search_with_hash_value(lfc_hash, &entry->key, hash, + HASH_REMOVE, NULL); + + return false; + } + + entry->access_count = 1; + entry->hash = hash; + + for (int i = 0; i < BLOCKS_PER_CHUNK; i++) + SET_STATE(entry, i, UNAVAILABLE); + + return true; +} + +/* + * Store received prefetch result in LFC cache. + * Unlike lfc_read/lfc_write this call is is not protected by shared buffer lock. + * So we should be ready that other backends will try to concurrently read or write this block. + * We do not store prefetched block if it already exists in LFC or it's not_modified_since LSN is smaller + * than current last written LSN (LwLSN). + * + * We can enforce correctness of storing page in LFC by the following steps: + * 1. Check under LFC lock that page in not present in LFC. + * 2. Check under LFC lock that LwLSN is not changed since prefetch request time (not_modified_since). + * 3. Change page state to "Pending" under LFC lock to prevent all other backends to read or write this + * pages until this write is completed. + * 4. Assume that some other backend creates new image of the page without reading it + * (because reads will be blocked because of 2). This version of the page is stored in shared buffer. + * Any attempt to throw away this page from shared buffer will be blocked, because Postgres first + * needs to save dirty page and write will be blocked because of 2. 
+ * So any backend trying to access this page, will take it from shared buffer without accessing + * SMGR and LFC. + * 5. After write completion we once again obtain LFC lock and wake-up all waiting backends. + * If there is some backend waiting to write new image of the page (4) then now it will be able to + * do it,overwriting old (prefetched) page image. As far as this write will be completed before + * shared buffer can be reassigned, not other backend can see old page image. +*/ +bool +lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, + const void* buffer, XLogRecPtr lsn) +{ + BufferTag tag; + FileCacheEntry *entry; + ssize_t rc; + bool found; + uint32 hash; + uint64 generation; + uint32 entry_offset; + instr_time io_start, io_end; + ConditionVariable* cv; + FileCacheBlockState state; + XLogRecPtr lwlsn; + + int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1); + + if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */ + return false; + + CopyNRelFileInfoToBufTag(tag, rinfo); + tag.forkNum = forknum; + + CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); + + tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1); + hash = get_hash_value(lfc_hash, &tag); + cv = &lfc_ctl->cv[hash % N_COND_VARS]; + + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + + if (!LFC_ENABLED() || !lfc_ensure_opened()) + { + LWLockRelease(lfc_lock); + return false; + } + lwlsn = GetLastWrittenLSN(rinfo, forknum, blkno); + if (lwlsn > lsn) + { + elog(DEBUG1, "Skip LFC write for %d because LwLSN=%X/%X is greater than not_nodified_since LSN %X/%X", + blkno, LSN_FORMAT_ARGS(lwlsn), LSN_FORMAT_ARGS(lsn)); + LWLockRelease(lfc_lock); + return false; + } + + entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_ENTER, &found); + + if (found) + { + state = GET_STATE(entry, chunk_offs); + if (state != UNAVAILABLE) { + /* Do not rewrite existed LFC entry */ + LWLockRelease(lfc_lock); + return false; + } + /* + * Unlink entry from LRU list to pin it for the duration of IO + * operation + */ + if (entry->access_count++ == 0) + dlist_delete(&entry->list_node); + } + else + { + if (!lfc_init_new_entry(entry, hash)) + { + /* + * We can't process this chunk due to lack of space in LFC, + * so skip to the next one + */ + LWLockRelease(lfc_lock); + return false; + } + } + + generation = lfc_ctl->generation; + entry_offset = entry->offset; + + SET_STATE(entry, chunk_offs, PENDING); + + LWLockRelease(lfc_lock); + + pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_WRITE); + INSTR_TIME_SET_CURRENT(io_start); + rc = pwrite(lfc_desc, buffer, BLCKSZ, + ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ); + INSTR_TIME_SET_CURRENT(io_end); + pgstat_report_wait_end(); + + if (rc != BLCKSZ) + { + lfc_disable("write"); + } + else + { + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + + if (lfc_ctl->generation == generation) + { + uint64 time_spent_us; + CriticalAssert(LFC_ENABLED()); + /* Place entry to the head of LRU list */ + CriticalAssert(entry->access_count > 0); + + lfc_ctl->writes += 1; + INSTR_TIME_SUBTRACT(io_start, io_end); + time_spent_us = INSTR_TIME_GET_MICROSEC(io_start); + lfc_ctl->time_write += time_spent_us; + inc_page_cache_write_wait(time_spent_us); + + if (--entry->access_count == 0) + dlist_push_tail(&lfc_ctl->lru, &entry->list_node); + + state = GET_STATE(entry, chunk_offs); + if (state == REQUESTED) { + ConditionVariableBroadcast(cv); + } + if (state != AVAILABLE) + { + lfc_ctl->used_pages += 1; + SET_STATE(entry, chunk_offs, AVAILABLE); + } + } + else + { + lfc_close_file(); + } + 
LWLockRelease(lfc_lock); + } + return true; +} + /* * Put page in local file cache. * If cache is full then evict some other page. @@ -855,15 +1115,21 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */ return; - if (!lfc_ensure_opened()) - return; - CopyNRelFileInfoToBufTag(tag, rinfo); tag.forkNum = forkNum; CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); - /* + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + + if (!LFC_ENABLED() || !lfc_ensure_opened()) + { + LWLockRelease(lfc_lock); + return; + } + generation = lfc_ctl->generation; + + /* * For every chunk that has blocks we're interested in, we * 1. get the chunk header * 2. Check if the chunk actually has the blocks we're interested in @@ -878,6 +1144,8 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1); int blocks_in_chunk = Min(nblocks, BLOCKS_PER_CHUNK - (blkno % BLOCKS_PER_CHUNK)); instr_time io_start, io_end; + ConditionVariable* cv; + Assert(blocks_in_chunk > 0); for (int i = 0; i < blocks_in_chunk; i++) @@ -888,14 +1156,7 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1); hash = get_hash_value(lfc_hash, &tag); - - LWLockAcquire(lfc_lock, LW_EXCLUSIVE); - - if (!LFC_ENABLED()) - { - LWLockRelease(lfc_lock); - return; - } + cv = &lfc_ctl->cv[hash % N_COND_VARS]; entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_ENTER, &found); @@ -908,92 +1169,50 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, if (entry->access_count++ == 0) dlist_delete(&entry->list_node); } - /*----------- - * If the chunk wasn't already in the LFC then we have these - * options, in order of preference: - * - * Unless there is no space available, we can: - * 1. Use an entry from the `holes` list, and - * 2. Create a new entry. - * We can always, regardless of space in the LFC: - * 3. evict an entry from LRU, and - * 4. ignore the write operation (the least favorite option) - */ - else if (lfc_ctl->used < lfc_ctl->limit) - { - if (!dlist_is_empty(&lfc_ctl->holes)) - { - /* We can reuse a hole that was left behind when the LFC was shrunk previously */ - FileCacheEntry *hole = dlist_container(FileCacheEntry, list_node, - dlist_pop_head_node(&lfc_ctl->holes)); - uint32 offset = hole->offset; - bool hole_found; - - hash_search_with_hash_value(lfc_hash, &hole->key, - hole->hash, HASH_REMOVE, &hole_found); - CriticalAssert(hole_found); - - lfc_ctl->used += 1; - entry->offset = offset; /* reuse the hole */ - } - else - { - lfc_ctl->used += 1; - entry->offset = lfc_ctl->size++;/* allocate new chunk at end - * of file */ - } - } - /* - * We've already used up all allocated LFC entries. - * - * If we can clear an entry from the LRU, do that. - * If we can't (e.g. because all other slots are being accessed) - * then we will remove this entry from the hash and continue - * on to the next chunk, as we may not exceed the limit. 
- */ - else if (!dlist_is_empty(&lfc_ctl->lru)) - { - /* Cache overflow: evict least recently used chunk */ - FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, - dlist_pop_head_node(&lfc_ctl->lru)); - - for (int i = 0; i < BLOCKS_PER_CHUNK; i++) - { - lfc_ctl->used_pages -= (victim->bitmap[i >> 5] >> (i & 31)) & 1; - } - - CriticalAssert(victim->access_count == 0); - entry->offset = victim->offset; /* grab victim's chunk */ - hash_search_with_hash_value(lfc_hash, &victim->key, - victim->hash, HASH_REMOVE, NULL); - neon_log(DEBUG2, "Swap file cache page"); - } else { - /* Can't add this chunk - we don't have the space for it */ - hash_search_with_hash_value(lfc_hash, &entry->key, hash, - HASH_REMOVE, NULL); - - /* - * We can't process this chunk due to lack of space in LFC, - * so skip to the next one - */ - LWLockRelease(lfc_lock); - blkno += blocks_in_chunk; - buf_offset += blocks_in_chunk; - nblocks -= blocks_in_chunk; - continue; + if (!lfc_init_new_entry(entry, hash)) + { + /* + * We can't process this chunk due to lack of space in LFC, + * so skip to the next one + */ + blkno += blocks_in_chunk; + buf_offset += blocks_in_chunk; + nblocks -= blocks_in_chunk; + continue; + } } - if (!found) - { - entry->access_count = 1; - entry->hash = hash; - memset(entry->bitmap, 0, sizeof entry->bitmap); - } - - generation = lfc_ctl->generation; entry_offset = entry->offset; + + for (int i = 0; i < blocks_in_chunk; i++) + { + FileCacheBlockState state = UNAVAILABLE; + bool sleeping = false; + while (lfc_ctl->generation == generation) + { + state = GET_STATE(entry, chunk_offs + i); + if (state == PENDING) { + SET_STATE(entry, chunk_offs + i, REQUESTED); + } else if (state != REQUESTED) { + SET_STATE(entry, chunk_offs + i, PENDING); + break; + } + if (!sleeping) + { + ConditionVariablePrepareToSleep(cv); + sleeping = true; + } + LWLockRelease(lfc_lock); + ConditionVariableTimedSleep(cv, CV_WAIT_TIMEOUT, WAIT_EVENT_NEON_LFC_CV_WAIT); + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + } + if (sleeping) + { + ConditionVariableCancelSleep(); + } + } LWLockRelease(lfc_lock); pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_WRITE); @@ -1006,6 +1225,7 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, if (rc != BLCKSZ * blocks_in_chunk) { lfc_disable("write"); + return; } else { @@ -1029,18 +1249,30 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, for (int i = 0; i < blocks_in_chunk; i++) { - lfc_ctl->used_pages += 1 - ((entry->bitmap[(chunk_offs + i) >> 5] >> ((chunk_offs + i) & 31)) & 1); - entry->bitmap[(chunk_offs + i) >> 5] |= - ((uint32)1 << ((chunk_offs + i) & 31)); + FileCacheBlockState state = GET_STATE(entry, chunk_offs + i); + if (state == REQUESTED) + { + ConditionVariableBroadcast(cv); + } + if (state != AVAILABLE) + { + lfc_ctl->used_pages += 1; + SET_STATE(entry, chunk_offs + i, AVAILABLE); + } } } - - LWLockRelease(lfc_lock); + else + { + /* stop iteration if LFC was disabled */ + lfc_close_file(); + break; + } } blkno += blocks_in_chunk; buf_offset += blocks_in_chunk; nblocks -= blocks_in_chunk; } + LWLockRelease(lfc_lock); } typedef struct @@ -1127,6 +1359,16 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS) if (lfc_ctl) value = lfc_ctl->used_pages; break; + case 6: + key = "file_cache_evicted_pages"; + if (lfc_ctl) + value = lfc_ctl->evicted_pages; + break; + case 7: + key = "file_cache_limit"; + if (lfc_ctl) + value = lfc_ctl->limit; + break; default: SRF_RETURN_DONE(funcctx); } @@ -1250,8 +1492,8 @@ local_cache_pages(PG_FUNCTION_ARGS) 
hash_seq_init(&status, lfc_hash); while ((entry = hash_seq_search(&status)) != NULL) { - for (int i = 0; i < CHUNK_BITMAP_SIZE; i++) - n_pages += pg_popcount32(entry->bitmap[i]); + for (int i = 0; i < BLOCKS_PER_CHUNK; i++) + n_pages += GET_STATE(entry, i) == AVAILABLE; } } } @@ -1279,7 +1521,7 @@ local_cache_pages(PG_FUNCTION_ARGS) { for (int i = 0; i < BLOCKS_PER_CHUNK; i++) { - if (entry->bitmap[i >> 5] & ((uint32)1 << (i & 31))) + if (GET_STATE(entry, i) == AVAILABLE) { fctx->record[n].pageoffs = entry->offset * BLOCKS_PER_CHUNK + i; fctx->record[n].relfilenode = NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key)); diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index ce2938cfd5..700a942284 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -56,6 +56,7 @@ uint32 WAIT_EVENT_NEON_LFC_MAINTENANCE; uint32 WAIT_EVENT_NEON_LFC_READ; uint32 WAIT_EVENT_NEON_LFC_TRUNCATE; uint32 WAIT_EVENT_NEON_LFC_WRITE; +uint32 WAIT_EVENT_NEON_LFC_CV_WAIT; uint32 WAIT_EVENT_NEON_PS_STARTING; uint32 WAIT_EVENT_NEON_PS_CONFIGURING; uint32 WAIT_EVENT_NEON_PS_SEND; @@ -538,6 +539,7 @@ neon_shmem_startup_hook(void) WAIT_EVENT_NEON_LFC_READ = WaitEventExtensionNew("Neon/FileCache_Read"); WAIT_EVENT_NEON_LFC_TRUNCATE = WaitEventExtensionNew("Neon/FileCache_Truncate"); WAIT_EVENT_NEON_LFC_WRITE = WaitEventExtensionNew("Neon/FileCache_Write"); + WAIT_EVENT_NEON_LFC_CV_WAIT = WaitEventExtensionNew("Neon/FileCache_CvWait"); WAIT_EVENT_NEON_PS_STARTING = WaitEventExtensionNew("Neon/PS_Starting"); WAIT_EVENT_NEON_PS_CONFIGURING = WaitEventExtensionNew("Neon/PS_Configuring"); WAIT_EVENT_NEON_PS_SEND = WaitEventExtensionNew("Neon/PS_SendIO"); diff --git a/pgxn/neon/neon.h b/pgxn/neon/neon.h index 79aa88b8d3..912e09c3d3 100644 --- a/pgxn/neon/neon.h +++ b/pgxn/neon/neon.h @@ -28,6 +28,7 @@ extern uint32 WAIT_EVENT_NEON_LFC_MAINTENANCE; extern uint32 WAIT_EVENT_NEON_LFC_READ; extern uint32 WAIT_EVENT_NEON_LFC_TRUNCATE; extern uint32 WAIT_EVENT_NEON_LFC_WRITE; +extern uint32 WAIT_EVENT_NEON_LFC_CV_WAIT; extern uint32 WAIT_EVENT_NEON_PS_STARTING; extern uint32 WAIT_EVENT_NEON_PS_CONFIGURING; extern uint32 WAIT_EVENT_NEON_PS_SEND; @@ -38,6 +39,7 @@ extern uint32 WAIT_EVENT_NEON_WAL_DL; #define WAIT_EVENT_NEON_LFC_READ WAIT_EVENT_BUFFILE_READ #define WAIT_EVENT_NEON_LFC_TRUNCATE WAIT_EVENT_BUFFILE_TRUNCATE #define WAIT_EVENT_NEON_LFC_WRITE WAIT_EVENT_BUFFILE_WRITE +#define WAIT_EVENT_NEON_LFC_CV_WAIT WAIT_EVENT_BUFFILE_READ #define WAIT_EVENT_NEON_PS_STARTING PG_WAIT_EXTENSION #define WAIT_EVENT_NEON_PS_CONFIGURING PG_WAIT_EXTENSION #define WAIT_EVENT_NEON_PS_SEND PG_WAIT_EXTENSION diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 7b748d7252..9faab1e4f0 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -233,6 +233,7 @@ extern char *neon_timeline; extern char *neon_tenant; extern int32 max_cluster_size; extern int neon_protocol_version; +extern bool lfc_store_prefetch_result; extern shardno_t get_shard_number(BufferTag* tag); @@ -301,14 +302,16 @@ extern bool lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno); extern int lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, int nblocks, bits8 *bitmap); -extern void lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno); extern void lfc_init(void); +extern bool lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, + const void* buffer, XLogRecPtr lsn); + static inline bool lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber 
blkno, void *buffer) { - bits8 rv = 0; + bits8 rv = 1; return lfc_readv_select(rinfo, forkNum, blkno, &buffer, 1, &rv) == 1; } diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 6c812f347f..4a79acd777 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -162,7 +162,7 @@ static uint32 local_request_counter; * UNUSED ------> REQUESTED --> RECEIVED * ^ : | | * | : v | - * | : TAG_UNUSED | + * | : TAG_REMAINS | * | : | | * +----------------+------------+ * : @@ -181,7 +181,7 @@ typedef enum PrefetchStatus /* must fit in uint8; bits 0x1 are used */ typedef enum { PRFSF_NONE = 0x0, - PRFSF_SEQ = 0x1, + PRFSF_LFC = 0x1 /* received prefetch result is stored in LFC */ } PrefetchRequestFlags; typedef struct PrefetchRequest @@ -305,7 +305,7 @@ GetLastWrittenLSNv(NRelFileInfo relfilenode, ForkNumber forknum, static void neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, neon_request_lsns *output, - BlockNumber nblocks, const bits8 *mask); + BlockNumber nblocks); static bool neon_prefetch_response_usable(neon_request_lsns *request_lsns, PrefetchRequest *slot); @@ -363,6 +363,7 @@ compact_prefetch_buffers(void) target_slot->buftag = source_slot->buftag; target_slot->shard_no = source_slot->shard_no; target_slot->status = source_slot->status; + target_slot->flags = source_slot->flags; target_slot->response = source_slot->response; target_slot->reqid = source_slot->reqid; target_slot->request_lsns = source_slot->request_lsns; @@ -452,6 +453,18 @@ prefetch_pump_state(void) /* update slot state */ slot->status = PRFS_RECEIVED; slot->response = response; + + if (response->tag == T_NeonGetPageResponse && !(slot->flags & PRFSF_LFC) && lfc_store_prefetch_result) + { + /* + * Store prefetched result in LFC (please read comments to lfc_prefetch + * explaining why it can be done without holding shared buffer lock + */ + if (lfc_prefetch(BufTagGetNRelFileInfo(slot->buftag), slot->buftag.forkNum, slot->buftag.blockNum, ((NeonGetPageResponse*)response)->page, slot->request_lsns.not_modified_since)) + { + slot->flags |= PRFSF_LFC; + } + } } } @@ -713,6 +726,18 @@ prefetch_read(PrefetchRequest *slot) /* update slot state */ slot->status = PRFS_RECEIVED; slot->response = response; + + if (response->tag == T_NeonGetPageResponse && !(slot->flags & PRFSF_LFC) && lfc_store_prefetch_result) + { + /* + * Store prefetched result in LFC (please read comments to lfc_prefetch + * explaining why it can be done without holding shared buffer lock + */ + if (lfc_prefetch(BufTagGetNRelFileInfo(buftag), buftag.forkNum, buftag.blockNum, ((NeonGetPageResponse*)response)->page, slot->request_lsns.not_modified_since)) + { + slot->flags |= PRFSF_LFC; + } + } return true; } else @@ -864,7 +889,7 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns else neon_get_request_lsns(BufTagGetNRelFileInfo(slot->buftag), slot->buftag.forkNum, slot->buftag.blockNum, - &slot->request_lsns, 1, NULL); + &slot->request_lsns, 1); request.hdr.lsn = slot->request_lsns.request_lsn; request.hdr.not_modified_since = slot->request_lsns.not_modified_since; @@ -890,6 +915,73 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns Assert(!found); } +/* + * Lookup of already received prefetch requests. Only already received responses matching required LSNs are accepted. + * Present pages are marked in "mask" bitmap and total number of such pages is returned. 
+ */ +static int +prefetch_lookupv(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blocknum, neon_request_lsns *lsns, + BlockNumber nblocks, void **buffers, bits8 *mask) +{ + int hits = 0; + PrefetchRequest hashkey; + + /* + * Use an intermediate PrefetchRequest struct as the hash key to ensure + * correct alignment and that the padding bytes are cleared. + */ + memset(&hashkey.buftag, 0, sizeof(BufferTag)); + CopyNRelFileInfoToBufTag(hashkey.buftag, rinfo); + hashkey.buftag.forkNum = forknum; + + for (int i = 0; i < nblocks; i++) + { + PrfHashEntry *entry; + + hashkey.buftag.blockNum = blocknum + i; + entry = prfh_lookup(MyPState->prf_hash, &hashkey); + + if (entry != NULL) + { + PrefetchRequest *slot = entry->slot; + uint64 ring_index = slot->my_ring_index; + Assert(slot == GetPrfSlot(ring_index)); + + Assert(slot->status != PRFS_UNUSED); + Assert(MyPState->ring_last <= ring_index && + ring_index < MyPState->ring_unused); + Assert(BufferTagsEqual(&slot->buftag, &hashkey.buftag)); + + if (slot->status != PRFS_RECEIVED) + continue; + + /* + * If the caller specified a request LSN to use, only accept + * prefetch responses that satisfy that request. + */ + if (!neon_prefetch_response_usable(&lsns[i], slot)) + continue; + + memcpy(buffers[i], ((NeonGetPageResponse*)slot->response)->page, BLCKSZ); + prefetch_set_unused(ring_index); + BITMAP_SET(mask, i); + + hits += 1; + } + } + pgBufferUsage.prefetch.hits += hits; + return hits; +} + +#if PG_MAJORVERSION_NUM < 17 +static bool +prefetch_lookup(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkn, neon_request_lsns *lsns, void *buffer) +{ + bits8 present = 0; + return prefetch_lookupv(rinfo, forkNum, blkn, lsns, 1, &buffer, &present) != 0; +} +#endif + /* * prefetch_register_bufferv() - register and prefetch buffers * @@ -1013,8 +1105,6 @@ Retry: /* The buffered request is good enough, return that index */ if (is_prefetch) pgBufferUsage.prefetch.duplicates++; - else - pgBufferUsage.prefetch.hits++; continue; } } @@ -1116,6 +1206,7 @@ Retry: slot->buftag = hashkey.buftag; slot->shard_no = get_shard_number(&tag); slot->my_ring_index = ring_index; + slot->flags = 0; min_ring_index = Min(min_ring_index, ring_index); @@ -2056,8 +2147,7 @@ GetLastWrittenLSNv(NRelFileInfo relfilenode, ForkNumber forknum, */ static void neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, - neon_request_lsns *output, BlockNumber nblocks, - const bits8 *mask) + neon_request_lsns *output, BlockNumber nblocks) { XLogRecPtr last_written_lsns[PG_IOV_MAX]; @@ -2145,9 +2235,6 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, neon_request_lsns *result = &output[i]; XLogRecPtr last_written_lsn = last_written_lsns[i]; - if (PointerIsValid(mask) && !BITMAP_ISSET(mask, i)) - continue; - if (last_written_lsn > replay_lsn) { /* GetCurrentReplayRecPtr was introduced in v15 */ @@ -2190,8 +2277,6 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, neon_request_lsns *result = &output[i]; XLogRecPtr last_written_lsn = last_written_lsns[i]; - if (PointerIsValid(mask) && !BITMAP_ISSET(mask, i)) - continue; /* * Use the latest LSN that was evicted from the buffer cache as the * 'not_modified_since' hint. 
Any pages modified by later WAL records @@ -2413,7 +2498,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum) } neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, - REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1, NULL); + REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1); { NeonExistsRequest request = { .hdr.tag = T_NeonExistsRequest, @@ -2832,8 +2917,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, while (nblocks > 0) { int iterblocks = Min(nblocks, PG_IOV_MAX); - bits8 lfc_present[PG_IOV_MAX / 8]; - memset(lfc_present, 0, sizeof(lfc_present)); + bits8 lfc_present[PG_IOV_MAX / 8] = {0}; if (lfc_cache_containsv(InfoFromSMgrRel(reln), forknum, blocknum, iterblocks, lfc_present) == iterblocks) @@ -2844,12 +2928,13 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, } tag.blockNum = blocknum; - + for (int i = 0; i < PG_IOV_MAX / 8; i++) lfc_present[i] = ~(lfc_present[i]); ring_index = prefetch_register_bufferv(tag, NULL, iterblocks, lfc_present, true); + nblocks -= iterblocks; blocknum += iterblocks; @@ -3105,7 +3190,8 @@ Retry: } } memcpy(buffer, getpage_resp->page, BLCKSZ); - lfc_write(rinfo, forkNum, blockno, buffer); + if (!lfc_store_prefetch_result) + lfc_write(rinfo, forkNum, blockno, buffer); break; } case T_NeonErrorResponse: @@ -3190,6 +3276,17 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } + /* Try to read PS results if they are available */ + prefetch_pump_state(); + + neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1); + + if (prefetch_lookup(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, buffer)) + { + /* Prefetch hit */ + return; + } + /* Try to read from local file cache */ if (lfc_read(InfoFromSMgrRel(reln), forkNum, blkno, buffer)) { @@ -3197,9 +3294,11 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer return; } - neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1, NULL); neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer); + /* + * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes. 
+ */ prefetch_pump_state(); #ifdef DEBUG_COMPARE_LOCAL @@ -3280,11 +3379,14 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer #if PG_MAJORVERSION_NUM >= 17 static void neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - void **buffers, BlockNumber nblocks) + void **buffers, BlockNumber nblocks) { + bits8 prefetch_hits[PG_IOV_MAX / 8] = {0}; + bits8 lfc_hits[PG_IOV_MAX / 8]; bits8 read[PG_IOV_MAX / 8]; neon_request_lsns request_lsns[PG_IOV_MAX]; int lfc_result; + int prefetch_result; switch (reln->smgr_relpersistence) { @@ -3307,38 +3409,52 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, neon_log(ERROR, "Read request too large: %d is larger than max %d", nblocks, PG_IOV_MAX); - memset(read, 0, sizeof(read)); + /* Try to read PS results if they are available */ + prefetch_pump_state(); + + neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, blocknum, + request_lsns, nblocks); + + + prefetch_result = prefetch_lookupv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns, nblocks, buffers, prefetch_hits); + + if (prefetch_result == nblocks) + return; + + /* invert the result: exclude prefetched blocks */ + for (int i = 0; i < PG_IOV_MAX / 8; i++) + lfc_hits[i] = ~prefetch_hits[i]; /* Try to read from local file cache */ lfc_result = lfc_readv_select(InfoFromSMgrRel(reln), forknum, blocknum, buffers, - nblocks, read); + nblocks, lfc_hits); if (lfc_result > 0) MyNeonCounters->file_cache_hits_total += lfc_result; /* Read all blocks from LFC, so we're done */ - if (lfc_result == nblocks) + if (prefetch_result + lfc_result == nblocks) return; - if (lfc_result == -1) + if (lfc_result <= 0) { /* can't use the LFC result, so read all blocks from PS */ for (int i = 0; i < PG_IOV_MAX / 8; i++) - read[i] = 0xFF; + read[i] = ~prefetch_hits[i]; } else { /* invert the result: exclude blocks read from lfc */ for (int i = 0; i < PG_IOV_MAX / 8; i++) - read[i] = ~(read[i]); + read[i] = ~(prefetch_hits[i] | lfc_hits[i]); } - neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, blocknum, - request_lsns, nblocks, read); - neon_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns, buffers, nblocks, read); + /* + * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes. + */ prefetch_pump_state(); #ifdef DEBUG_COMPARE_LOCAL @@ -3610,7 +3726,7 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) } neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, - REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1, NULL); + REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1); { NeonNblocksRequest request = { @@ -3695,7 +3811,7 @@ neon_dbsize(Oid dbNode) NRelFileInfo dummy_node = {0}; neon_get_request_lsns(dummy_node, MAIN_FORKNUM, - REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1, NULL); + REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1); { NeonDbSizeRequest request = { @@ -4430,7 +4546,12 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id) if (no_redo_needed) { SetLastWrittenLSNForBlock(end_recptr, rinfo, forknum, blkno); - lfc_evict(rinfo, forknum, blkno); + /* + * Redo changes if page exists in LFC. + * We should perform this check after assigning LwLSN to prevent + * prefetching of some older version of the page by some other backend. 
+ */ + no_redo_needed = !lfc_cache_contains(rinfo, forknum, blkno); } LWLockRelease(partitionLock); diff --git a/test_runner/regress/test_lfc_prefetch.py b/test_runner/regress/test_lfc_prefetch.py new file mode 100644 index 0000000000..dd422d996e --- /dev/null +++ b/test_runner/regress/test_lfc_prefetch.py @@ -0,0 +1,101 @@ +from __future__ import annotations + +import time + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv +from fixtures.utils import USE_LFC + + +@pytest.mark.timeout(600) +@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping") +def test_lfc_prefetch(neon_simple_env: NeonEnv): + """ + Test resizing the Local File Cache + """ + env = neon_simple_env + endpoint = env.endpoints.create_start( + "main", + config_lines=[ + "neon.max_file_cache_size=1GB", + "neon.file_cache_size_limit=1GB", + "effective_io_concurrency=100", + "shared_buffers=1MB", + "enable_bitmapscan=off", + "enable_seqscan=off", + "autovacuum=off", + ], + ) + conn = endpoint.connect() + cur = conn.cursor() + cur.execute("create extension neon") + cur.execute("create table t(pk integer, sk integer, filler text default repeat('x',200))") + cur.execute("set statement_timeout=0") + cur.execute("select setseed(0.5)") + cur.execute("insert into t values (generate_series(1,1000000),random()*1000000)") + cur.execute("create index on t(sk)") + cur.execute("vacuum t") + + # reset LFC + cur.execute("alter system set neon.file_cache_size_limit=0") + cur.execute("select pg_reload_conf()") + time.sleep(1) + cur.execute("alter system set neon.file_cache_size_limit='1GB'") + cur.execute("select pg_reload_conf()") + + cur.execute( + "explain (analyze,prefetch,format json) select sum(pk) from (select pk from t where sk between 100000 and 200000 limit 100) s1" + ) + prefetch_expired = cur.fetchall()[0][0][0]["Plan"]["Prefetch Expired Requests"] + log.info(f"Unused prefetches: {prefetch_expired}") + + cur.execute( + "explain (analyze,prefetch,format json) select sum(pk) from (select pk from t where sk between 200000 and 300000 limit 100) s2" + ) + prefetch_expired = cur.fetchall()[0][0][0]["Plan"]["Prefetch Expired Requests"] + log.info(f"Unused prefetches: {prefetch_expired}") + + cur.execute( + "explain (analyze,prefetch,format json) select sum(pk) from (select pk from t where sk between 300000 and 400000 limit 100) s3" + ) + prefetch_expired = cur.fetchall()[0][0][0]["Plan"]["Prefetch Expired Requests"] + log.info(f"Unused prefetches: {prefetch_expired}") + + cur.execute( + "explain (analyze,prefetch,format json) select sum(pk) from (select pk from t where sk between 100000 and 200000 limit 100) s4" + ) + prefetch_expired = cur.fetchall()[0][0][0]["Plan"]["Prefetch Expired Requests"] + log.info(f"Unused prefetches: {prefetch_expired}") + + # if prefetch requests are not stored in LFC, we continue to sent unused prefetch request tyo PS + assert prefetch_expired > 0 + + cur.execute("set neon.store_prefetch_result_in_lfc=on") + + cur.execute( + "explain (analyze,prefetch,format json) select sum(pk) from (select pk from t where sk between 500000 and 600000 limit 100) s5" + ) + prefetch_expired = cur.fetchall()[0][0][0]["Plan"]["Prefetch Expired Requests"] + log.info(f"Unused prefetches: {prefetch_expired}") + + cur.execute( + "explain (analyze,prefetch,format json) select sum(pk) from (select pk from t where sk between 600000 and 700000 limit 100) s6" + ) + prefetch_expired = cur.fetchall()[0][0][0]["Plan"]["Prefetch Expired Requests"] + log.info(f"Unused prefetches: 
{prefetch_expired}") + + cur.execute( + "explain (analyze,prefetch,format json) select sum(pk) from (select pk from t where sk between 700000 and 800000 limit 100) s7" + ) + prefetch_expired = cur.fetchall()[0][0][0]["Plan"]["Prefetch Expired Requests"] + log.info(f"Unused prefetches: {prefetch_expired}") + + cur.execute( + "explain (analyze,prefetch,format json) select sum(pk) from (select pk from t where sk between 500000 and 600000 limit 100) s8" + ) + prefetch_expired = cur.fetchall()[0][0][0]["Plan"]["Prefetch Expired Requests"] + log.info(f"Unused prefetches: {prefetch_expired}") + + # No redundant prefethc requrests if prefetch results are stored in LFC + assert prefetch_expired == 0 From c0c3ed94a9b5dc11a82e6df3bdbe82b9a7386075 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Fri, 21 Feb 2025 12:29:48 -0600 Subject: [PATCH 554/583] Fix flaky test_compute_installed_extensions_metric test (#10933) There was a race condition with compute_ctl and the metric being collected related to whether the neon extension had been updated or not. compute_ctl will run `ALTER EXTENSION neon UPDATE` on compute start in the postgres database. Fixes: https://github.com/neondatabase/neon/issues/10932 Signed-off-by: Tristan Partin --- test_runner/regress/test_compute_metrics.py | 22 ++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/test_runner/regress/test_compute_metrics.py b/test_runner/regress/test_compute_metrics.py index 99d41e410a..b360162dc1 100644 --- a/test_runner/regress/test_compute_metrics.py +++ b/test_runner/regress/test_compute_metrics.py @@ -501,19 +501,31 @@ def test_compute_installed_extensions_metric(neon_simple_env: NeonEnv): """ Test that the compute_installed_extensions properly reports accurate results. Important to note that currently this metric is only gathered on - compute start. + compute start. We install the neon extension into a database other than + postgres because compute_ctl will run `ALTER EXTENSION neon UPDATE` during + Postgres startup in the postgres database, creating a race condition. """ + DB_NAME = "test" + env = neon_simple_env endpoint = env.endpoints.create_start("main") + endpoint.safe_psql(f"CREATE DATABASE {DB_NAME}") + + # The metric is only gathered on compute start, so restart to check that + # plpgsql is now in 3 databases, instead of its regular 2, template1 and + # postgres. + endpoint.stop() + endpoint.start() client = endpoint.http_client() def __has_plpgsql(samples: list[Sample]) -> bool: """ - Check that plpgsql is installed in the template1 and postgres databases + Check that plpgsql is installed in the template1, postgres, and test + databases """ - return len(samples) == 1 and samples[0].value == 2 + return len(samples) == 1 and samples[0].value == 3 wait_until( collect_metric( @@ -525,8 +537,8 @@ def test_compute_installed_extensions_metric(neon_simple_env: NeonEnv): name="compute_installed_extensions", ) - # Install the neon extension, so we can check for it on the restart - endpoint.safe_psql("CREATE EXTENSION neon VERSION '1.0'") + # Install the neon extension, so we can check for it on the restart. + endpoint.safe_psql("CREATE EXTENSION neon VERSION '1.0'", dbname=DB_NAME) # The metric is only gathered on compute start, so restart to check if the # neon extension will now be there. 
From 4bbe75de8ce20b866260661b5d3244475d32e790 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 21 Feb 2025 21:29:05 +0100 Subject: [PATCH 555/583] Update vm_monitor to edition 2024 (#10916) Updates `vm_monitor` to edition 2024. We like to stay on the latest edition if possible. There is no functional changes, it's only changes due to the rustfmt edition. part of https://github.com/neondatabase/neon/issues/10918 --- libs/vm_monitor/Cargo.toml | 2 +- libs/vm_monitor/src/cgroup.rs | 10 ++++------ libs/vm_monitor/src/dispatcher.rs | 12 +++++------- libs/vm_monitor/src/filecache.rs | 8 +++++--- libs/vm_monitor/src/lib.rs | 22 ++++++++++++---------- libs/vm_monitor/src/protocol.rs | 3 ++- libs/vm_monitor/src/runner.rs | 8 +++++--- 7 files changed, 34 insertions(+), 31 deletions(-) diff --git a/libs/vm_monitor/Cargo.toml b/libs/vm_monitor/Cargo.toml index ba73902d38..a70465921c 100644 --- a/libs/vm_monitor/Cargo.toml +++ b/libs/vm_monitor/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "vm_monitor" version = "0.1.0" -edition.workspace = true +edition = "2024" license.workspace = true [[bin]] diff --git a/libs/vm_monitor/src/cgroup.rs b/libs/vm_monitor/src/cgroup.rs index 1d70cedcf9..dda9b23818 100644 --- a/libs/vm_monitor/src/cgroup.rs +++ b/libs/vm_monitor/src/cgroup.rs @@ -1,12 +1,10 @@ use std::fmt::{self, Debug, Formatter}; use std::time::{Duration, Instant}; -use anyhow::{anyhow, Context}; -use cgroups_rs::{ - hierarchies::{self, is_cgroup2_unified_mode}, - memory::MemController, - Subsystem, -}; +use anyhow::{Context, anyhow}; +use cgroups_rs::Subsystem; +use cgroups_rs::hierarchies::{self, is_cgroup2_unified_mode}; +use cgroups_rs::memory::MemController; use tokio::sync::watch; use tracing::{info, warn}; diff --git a/libs/vm_monitor/src/dispatcher.rs b/libs/vm_monitor/src/dispatcher.rs index c81848cb70..7b7201ab77 100644 --- a/libs/vm_monitor/src/dispatcher.rs +++ b/libs/vm_monitor/src/dispatcher.rs @@ -6,17 +6,15 @@ //! the cgroup (requesting upscale), and the signals that go to the cgroup //! (notifying it of upscale). -use anyhow::{bail, Context}; +use anyhow::{Context, bail}; use axum::extract::ws::{Message, Utf8Bytes, WebSocket}; -use futures::{ - stream::{SplitSink, SplitStream}, - SinkExt, StreamExt, -}; +use futures::stream::{SplitSink, SplitStream}; +use futures::{SinkExt, StreamExt}; use tracing::{debug, info}; use crate::protocol::{ - OutboundMsg, OutboundMsgKind, ProtocolRange, ProtocolResponse, ProtocolVersion, - PROTOCOL_MAX_VERSION, PROTOCOL_MIN_VERSION, + OutboundMsg, OutboundMsgKind, PROTOCOL_MAX_VERSION, PROTOCOL_MIN_VERSION, ProtocolRange, + ProtocolResponse, ProtocolVersion, }; /// The central handler for all communications in the monitor. diff --git a/libs/vm_monitor/src/filecache.rs b/libs/vm_monitor/src/filecache.rs index 4f5bf1c1e3..bc42347e5a 100644 --- a/libs/vm_monitor/src/filecache.rs +++ b/libs/vm_monitor/src/filecache.rs @@ -2,12 +2,14 @@ use std::num::NonZeroU64; -use crate::MiB; -use anyhow::{anyhow, Context}; -use tokio_postgres::{types::ToSql, Client, NoTls, Row}; +use anyhow::{Context, anyhow}; +use tokio_postgres::types::ToSql; +use tokio_postgres::{Client, NoTls, Row}; use tokio_util::sync::CancellationToken; use tracing::{error, info}; +use crate::MiB; + /// Manages Postgres' file cache by keeping a connection open. 
#[derive(Debug)] pub struct FileCacheState { diff --git a/libs/vm_monitor/src/lib.rs b/libs/vm_monitor/src/lib.rs index 0cd97d4ca1..7c77aca35d 100644 --- a/libs/vm_monitor/src/lib.rs +++ b/libs/vm_monitor/src/lib.rs @@ -2,24 +2,26 @@ #![deny(clippy::undocumented_unsafe_blocks)] #![cfg(target_os = "linux")] +use std::fmt::Debug; +use std::net::SocketAddr; +use std::time::Duration; + use anyhow::Context; -use axum::{ - extract::{ws::WebSocket, State, WebSocketUpgrade}, - response::Response, -}; -use axum::{routing::get, Router}; +use axum::Router; +use axum::extract::ws::WebSocket; +use axum::extract::{State, WebSocketUpgrade}; +use axum::response::Response; +use axum::routing::get; use clap::Parser; use futures::Future; -use std::net::SocketAddr; -use std::{fmt::Debug, time::Duration}; +use runner::Runner; use sysinfo::{RefreshKind, System, SystemExt}; use tokio::net::TcpListener; -use tokio::{sync::broadcast, task::JoinHandle}; +use tokio::sync::broadcast; +use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use tracing::{error, info}; -use runner::Runner; - // Code that interfaces with agent pub mod dispatcher; pub mod protocol; diff --git a/libs/vm_monitor/src/protocol.rs b/libs/vm_monitor/src/protocol.rs index 5f07435503..4fce3cdefc 100644 --- a/libs/vm_monitor/src/protocol.rs +++ b/libs/vm_monitor/src/protocol.rs @@ -35,7 +35,8 @@ use core::fmt; use std::cmp; -use serde::{de::Error, Deserialize, Serialize}; +use serde::de::Error; +use serde::{Deserialize, Serialize}; /// A Message we send to the agent. #[derive(Serialize, Deserialize, Debug, Clone)] diff --git a/libs/vm_monitor/src/runner.rs b/libs/vm_monitor/src/runner.rs index 8839f5803f..6f75ff0abd 100644 --- a/libs/vm_monitor/src/runner.rs +++ b/libs/vm_monitor/src/runner.rs @@ -7,7 +7,7 @@ use std::fmt::Debug; use std::time::{Duration, Instant}; -use anyhow::{bail, Context}; +use anyhow::{Context, bail}; use axum::extract::ws::{Message, WebSocket}; use futures::StreamExt; use tokio::sync::{broadcast, watch}; @@ -18,7 +18,7 @@ use crate::cgroup::{self, CgroupWatcher}; use crate::dispatcher::Dispatcher; use crate::filecache::{FileCacheConfig, FileCacheState}; use crate::protocol::{InboundMsg, InboundMsgKind, OutboundMsg, OutboundMsgKind, Resources}; -use crate::{bytes_to_mebibytes, get_total_system_memory, spawn_with_cancel, Args, MiB}; +use crate::{Args, MiB, bytes_to_mebibytes, get_total_system_memory, spawn_with_cancel}; /// Central struct that interacts with agent, dispatcher, and cgroup to handle /// signals from the agent. @@ -233,7 +233,9 @@ impl Runner { // // TODO: make the duration here configurable. if last_time.elapsed() > Duration::from_secs(5) { - bail!("haven't gotten cgroup memory stats recently enough to determine downscaling information"); + bail!( + "haven't gotten cgroup memory stats recently enough to determine downscaling information" + ); } else if last_history.samples_count <= 1 { let status = "haven't received enough cgroup memory stats yet"; info!(status, "discontinuing downscale"); From df264380b91a1d4f065b09a19bf4773a78536405 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Fri, 21 Feb 2025 22:50:50 +0100 Subject: [PATCH 556/583] fix(compute_ctl): Skip invalid DBs in PerDatabasePhase (#10910) ## Problem After refactoring the configuration code to phases, it became a bit fuzzy who filters out DBs that are not present in Postgres, are invalid, or have `datallowconn = false`. 
The first 2 are important for the DB dropping case, as we could be in operation retry, so DB could be already absent in Postgres or invalid (interrupted `DROP DATABASE`). Recent case: https://neondb.slack.com/archives/C03H1K0PGKH/p1740053359712419 ## Summary of changes Add a common code that filters out inaccessible DBs inside `ApplySpecPhase::RunInEachDatabase`. --- compute_tools/src/spec_apply.rs | 57 ++++++++++++++++++++++++--------- 1 file changed, 41 insertions(+), 16 deletions(-) diff --git a/compute_tools/src/spec_apply.rs b/compute_tools/src/spec_apply.rs index 5ee9c5fbd8..c4416480d8 100644 --- a/compute_tools/src/spec_apply.rs +++ b/compute_tools/src/spec_apply.rs @@ -7,12 +7,12 @@ use std::sync::Arc; use crate::compute::construct_superuser_query; use crate::pg_helpers::{escape_literal, DatabaseExt, Escaping, GenericOptionsSearch, RoleExt}; -use anyhow::{bail, Result}; +use anyhow::Result; use compute_api::spec::{ComputeFeature, ComputeSpec, Database, PgIdent, Role}; use futures::future::join_all; use tokio::sync::RwLock; use tokio_postgres::Client; -use tracing::{debug, info_span, Instrument}; +use tracing::{debug, info_span, warn, Instrument}; #[derive(Clone)] pub enum DB { @@ -47,6 +47,11 @@ pub enum PerDatabasePhase { DeleteDBRoleReferences, ChangeSchemaPerms, HandleAnonExtension, + /// This is a shared phase, used for both i) dropping dangling LR subscriptions + /// before dropping the DB, and ii) dropping all subscriptions after creating + /// a fresh branch. + /// N.B. we will skip all DBs that are not present in Postgres, invalid, or + /// have `datallowconn = false` (`restrict_conn`). DropLogicalSubscriptions, } @@ -168,7 +173,7 @@ where /// /// In the future we may generate a single stream of changes and then /// sort/merge/batch execution, but for now this is a nice way to improve -/// batching behaviour of the commands. +/// batching behavior of the commands. async fn get_operations<'a>( spec: &'a ComputeSpec, ctx: &'a RwLock, @@ -451,6 +456,38 @@ async fn get_operations<'a>( )), }))), ApplySpecPhase::RunInEachDatabase { db, subphase } => { + // Do some checks that user DB exists and we can access it. + // + // During the phases like DropLogicalSubscriptions, DeleteDBRoleReferences, + // which happen before dropping the DB, the current run could be a retry, + // so it's a valid case when DB is absent already. The case of + // `pg_database.datallowconn = false`/`restrict_conn` is a bit tricky, as + // in theory user can have some dangling objects there, so we will fail at + // the actual drop later. Yet, to fix that in the current code we would need + // to ALTER DATABASE, and then check back, but that even more invasive, so + // that's not what we really want to do here. + // + // For ChangeSchemaPerms, skipping DBs we cannot access is totally fine. 
+ if let DB::UserDB(db) = db { + let databases = &ctx.read().await.dbs; + + let edb = match databases.get(&db.name) { + Some(edb) => edb, + None => { + warn!("skipping RunInEachDatabase phase {:?}, database {} doesn't exist in PostgreSQL", subphase, db.name); + return Ok(Box::new(empty())); + } + }; + + if edb.restrict_conn || edb.invalid { + warn!( + "skipping RunInEachDatabase phase {:?}, database {} is (restrict_conn={}, invalid={})", + subphase, db.name, edb.restrict_conn, edb.invalid + ); + return Ok(Box::new(empty())); + } + } + match subphase { PerDatabasePhase::DropLogicalSubscriptions => { match &db { @@ -530,25 +567,12 @@ async fn get_operations<'a>( Ok(Box::new(operations)) } PerDatabasePhase::ChangeSchemaPerms => { - let ctx = ctx.read().await; - let databases = &ctx.dbs; - let db = match &db { // ignore schema permissions on the system database DB::SystemDB => return Ok(Box::new(empty())), DB::UserDB(db) => db, }; - if databases.get(&db.name).is_none() { - bail!("database {} doesn't exist in PostgreSQL", db.name); - } - - let edb = databases.get(&db.name).unwrap(); - - if edb.restrict_conn || edb.invalid { - return Ok(Box::new(empty())); - } - let operations = vec![ Operation { query: format!( @@ -566,6 +590,7 @@ async fn get_operations<'a>( Ok(Box::new(operations)) } + // TODO: remove this completely https://github.com/neondatabase/cloud/issues/22663 PerDatabasePhase::HandleAnonExtension => { // Only install Anon into user databases let db = match &db { From a6f315c9c93abb81f2bd634344d1050b4237c261 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 24 Feb 2025 11:40:25 +0200 Subject: [PATCH 557/583] Remove unnecessary dependencies to synchronous 'postgres' crate (#10938) The synchronous 'postgres' crate is just a wrapper around the async 'tokio_postgres' crate. Some places were unnecessarily using the re-exported NoTls and Error from the synchronous 'postgres' crate, even though they were otherwise using the 'tokio_postgres' crate. Tidy up by using the tokio_postgres types directly. 
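As a rough illustration of the pattern this change standardizes on (a sketch, not code from the patch; the connection string and error handling are placeholders): connect with `tokio_postgres` directly instead of going through the re-exported types of the synchronous `postgres` wrapper crate.

```rust
async fn connect(connstring: &str) -> Result<tokio_postgres::Client, tokio_postgres::Error> {
    // `tokio_postgres::connect` yields a client plus a connection future that
    // must be driven on its own task for the client to make progress.
    let (client, connection) =
        tokio_postgres::connect(connstring, tokio_postgres::NoTls).await?;
    tokio::spawn(async move {
        if let Err(e) = connection.await {
            eprintln!("connection error: {e}");
        }
    });
    Ok(client)
}
```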
--- Cargo.lock | 4 ---- libs/postgres_connection/Cargo.toml | 1 - libs/postgres_connection/src/lib.rs | 4 ++-- pageserver/Cargo.toml | 1 - pageserver/client/Cargo.toml | 1 - pageserver/client/src/page_service.rs | 3 ++- .../tenant/timeline/walreceiver/walreceiver_connection.rs | 6 +++--- safekeeper/Cargo.toml | 1 - safekeeper/src/recovery.rs | 7 +++++-- 9 files changed, 12 insertions(+), 16 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f0dbdff3ec..038727f1a8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4172,7 +4172,6 @@ dependencies = [ "pageserver_client", "pageserver_compaction", "pin-project-lite", - "postgres", "postgres-protocol", "postgres-types", "postgres_backend", @@ -4259,7 +4258,6 @@ dependencies = [ "futures", "http-utils", "pageserver_api", - "postgres", "reqwest", "serde", "thiserror 1.0.69", @@ -4674,7 +4672,6 @@ dependencies = [ "anyhow", "itertools 0.10.5", "once_cell", - "postgres", "tokio-postgres", "url", ] @@ -5816,7 +5813,6 @@ dependencies = [ "once_cell", "pageserver_api", "parking_lot 0.12.1", - "postgres", "postgres-protocol", "postgres_backend", "postgres_ffi", diff --git a/libs/postgres_connection/Cargo.toml b/libs/postgres_connection/Cargo.toml index 19027d13ff..462fb4a533 100644 --- a/libs/postgres_connection/Cargo.toml +++ b/libs/postgres_connection/Cargo.toml @@ -7,7 +7,6 @@ license.workspace = true [dependencies] anyhow.workspace = true itertools.workspace = true -postgres.workspace = true tokio-postgres.workspace = true url.workspace = true diff --git a/libs/postgres_connection/src/lib.rs b/libs/postgres_connection/src/lib.rs index ddf9f7b610..e3d31c6cfc 100644 --- a/libs/postgres_connection/src/lib.rs +++ b/libs/postgres_connection/src/lib.rs @@ -171,10 +171,10 @@ impl PgConnectionConfig { tokio_postgres::Client, tokio_postgres::Connection, ), - postgres::Error, + tokio_postgres::Error, > { self.to_tokio_postgres_config() - .connect(postgres::NoTls) + .connect(tokio_postgres::NoTls) .await } } diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 41ac3b69b8..9d4463d595 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -40,7 +40,6 @@ num_cpus.workspace = true num-traits.workspace = true once_cell.workspace = true pin-project-lite.workspace = true -postgres.workspace = true postgres_backend.workspace = true postgres-protocol.workspace = true postgres-types.workspace = true diff --git a/pageserver/client/Cargo.toml b/pageserver/client/Cargo.toml index db77a395e0..970a437a42 100644 --- a/pageserver/client/Cargo.toml +++ b/pageserver/client/Cargo.toml @@ -21,5 +21,4 @@ tokio.workspace = true futures.workspace = true tokio-util.workspace = true anyhow.workspace = true -postgres.workspace = true bytes.workspace = true diff --git a/pageserver/client/src/page_service.rs b/pageserver/client/src/page_service.rs index 27280912b4..47da83b0eb 100644 --- a/pageserver/client/src/page_service.rs +++ b/pageserver/client/src/page_service.rs @@ -34,7 +34,8 @@ pub struct BasebackupRequest { impl Client { pub async fn new(connstring: String) -> anyhow::Result { - let (client, connection) = tokio_postgres::connect(&connstring, postgres::NoTls).await?; + let (client, connection) = + tokio_postgres::connect(&connstring, tokio_postgres::NoTls).await?; let conn_task_cancel = CancellationToken::new(); let conn_task = tokio::spawn({ diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index ff05a8f902..bb34a181da 100644 --- 
a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -13,12 +13,12 @@ use bytes::BytesMut; use chrono::{NaiveDateTime, Utc}; use fail::fail_point; use futures::StreamExt; -use postgres::{error::SqlState, SimpleQueryMessage, SimpleQueryRow}; use postgres_ffi::WAL_SEGMENT_SIZE; use postgres_ffi::{v14::xlog_utils::normalize_lsn, waldecoder::WalDecodeError}; use postgres_protocol::message::backend::ReplicationMessage; use postgres_types::PgLsn; use tokio::{select, sync::watch, time}; +use tokio_postgres::{error::SqlState, SimpleQueryMessage, SimpleQueryRow}; use tokio_postgres::{replication::ReplicationStream, Client}; use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, trace, warn, Instrument}; @@ -64,7 +64,7 @@ pub(super) struct WalConnectionStatus { pub(super) enum WalReceiverError { /// An error of a type that does not indicate an issue, e.g. a connection closing - ExpectedSafekeeperError(postgres::Error), + ExpectedSafekeeperError(tokio_postgres::Error), /// An "error" message that carries a SUCCESSFUL_COMPLETION status code. Carries /// the message part of the original postgres error SuccessfulCompletion(String), @@ -143,7 +143,7 @@ pub(super) async fn handle_walreceiver_connection( let mut config = wal_source_connconf.to_tokio_postgres_config(); config.application_name(format!("pageserver-{}", timeline.conf.id.0).as_str()); config.replication_mode(tokio_postgres::config::ReplicationMode::Physical); - match time::timeout(connect_timeout, config.connect(postgres::NoTls)).await { + match time::timeout(connect_timeout, config.connect(tokio_postgres::NoTls)).await { Ok(client_and_conn) => client_and_conn?, Err(_elapsed) => { // Timing out to connect to a safekeeper node could happen long time, due to diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index d12ebc1030..c86ac576ad 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -31,7 +31,6 @@ futures.workspace = true once_cell.workspace = true parking_lot.workspace = true pageserver_api.workspace = true -postgres.workspace = true postgres-protocol.workspace = true pprof.workspace = true rand.workspace = true diff --git a/safekeeper/src/recovery.rs b/safekeeper/src/recovery.rs index 61647c16b0..35394eb6ed 100644 --- a/safekeeper/src/recovery.rs +++ b/safekeeper/src/recovery.rs @@ -343,8 +343,11 @@ async fn recovery_stream( cfg.replication_mode(tokio_postgres::config::ReplicationMode::Physical); let connect_timeout = Duration::from_millis(10000); - let (client, connection) = match time::timeout(connect_timeout, cfg.connect(postgres::NoTls)) - .await + let (client, connection) = match time::timeout( + connect_timeout, + cfg.connect(tokio_postgres::NoTls), + ) + .await { Ok(client_and_conn) => client_and_conn?, Err(_elapsed) => { From fb77f28326492b1dff44d2623a81ce822a45ef9e Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 24 Feb 2025 11:49:11 +0000 Subject: [PATCH 558/583] feat(proxy): add direction and private link id to billing export (#10925) ref: https://github.com/neondatabase/cloud/issues/23385 Adds a direction flag as well as private-link ID to the traffic reporting pipeline. We do not yet actually count ingress, but we include the flag anyway. I have additionally moved vpce_id string parsing earlier, since we expect it to be utf8 (ascii). 
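For context on the "parse vpce_id earlier" point, here is a standalone sketch (not the patch's code, which lives in protocol2.rs and logs through `tracing`): validate the TLV payload as UTF-8 once, up front, and hand the rest of the proxy a `SmolStr`.

```rust
use smol_str::SmolStr;

// Sketch: validate the AWS VPCE id TLV as UTF-8 (expected to be ASCII) as soon
// as it is read, instead of deferring the conversion to every consumer.
fn parse_vpce_id(raw: &[u8]) -> Option<SmolStr> {
    match std::str::from_utf8(raw) {
        Ok(s) => Some(SmolStr::from(s)),
        Err(e) => {
            // The real code emits a warning here and drops the TLV.
            eprintln!("invalid aws vpce id: {e}");
            None
        }
    }
}
```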
--- proxy/src/auth/backend/mod.rs | 7 ++--- proxy/src/cancellation.rs | 5 +--- proxy/src/console_redirect_proxy.rs | 1 + proxy/src/protocol2.rs | 16 ++++++---- proxy/src/proxy/mod.rs | 41 +++++++++++++++++++++----- proxy/src/proxy/passthrough.rs | 20 ++++++++++--- proxy/src/serverless/backend.rs | 5 +--- proxy/src/serverless/conn_pool_lib.rs | 27 ++++++++++++++--- proxy/src/serverless/http_conn_pool.rs | 19 ++++++++++-- proxy/src/serverless/sql_over_http.rs | 14 ++++----- proxy/src/usage_metrics.rs | 34 +++++++++++++++++++++ 11 files changed, 146 insertions(+), 43 deletions(-) diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs index dc595844c5..8f1625278f 100644 --- a/proxy/src/auth/backend/mod.rs +++ b/proxy/src/auth/backend/mod.rs @@ -308,10 +308,7 @@ async fn auth_quirks( let incoming_vpc_endpoint_id = match ctx.extra() { None => return Err(AuthError::MissingEndpointName), - Some(ConnectionInfoExtra::Aws { vpce_id }) => { - // Convert the vcpe_id to a string - String::from_utf8(vpce_id.to_vec()).unwrap_or_default() - } + Some(ConnectionInfoExtra::Aws { vpce_id }) => vpce_id.to_string(), Some(ConnectionInfoExtra::Azure { link_id }) => link_id.to_string(), }; let allowed_vpc_endpoint_ids = api.get_allowed_vpc_endpoint_ids(ctx, &info).await?; @@ -451,7 +448,7 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint> { Ok((Backend::ControlPlane(api, credentials), ip_allowlist)) } Self::Local(_) => { - return Err(auth::AuthError::bad_auth_method("invalid for local proxy")) + return Err(auth::AuthError::bad_auth_method("invalid for local proxy")); } }; diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index 1f9c8a48b7..422e6f741d 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -358,10 +358,7 @@ impl CancellationHandler { let incoming_vpc_endpoint_id = match ctx.extra() { None => return Err(CancelError::AuthError(AuthError::MissingVPCEndpointId)), - Some(ConnectionInfoExtra::Aws { vpce_id }) => { - // Convert the vcpe_id to a string - String::from_utf8(vpce_id.to_vec()).unwrap_or_default() - } + Some(ConnectionInfoExtra::Aws { vpce_id }) => vpce_id.to_string(), Some(ConnectionInfoExtra::Azure { link_id }) => link_id.to_string(), }; diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index 1044f5f8e2..a2e7299d39 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -241,6 +241,7 @@ pub(crate) async fn handle_client( Ok(Some(ProxyPassthrough { client: stream, aux: node.aux.clone(), + private_link_id: None, compute: node, session_id: ctx.session_id(), cancel: session, diff --git a/proxy/src/protocol2.rs b/proxy/src/protocol2.rs index 74a15d9bf4..99d645878f 100644 --- a/proxy/src/protocol2.rs +++ b/proxy/src/protocol2.rs @@ -9,6 +9,7 @@ use std::task::{Context, Poll}; use bytes::{Buf, Bytes, BytesMut}; use pin_project_lite::pin_project; +use smol_str::SmolStr; use strum_macros::FromRepr; use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, ReadBuf}; use zerocopy::{FromBytes, FromZeroes}; @@ -99,7 +100,7 @@ impl fmt::Display for ConnectionInfo { #[derive(PartialEq, Eq, Clone, Debug)] pub enum ConnectionInfoExtra { - Aws { vpce_id: Bytes }, + Aws { vpce_id: SmolStr }, Azure { link_id: u32 }, } @@ -193,7 +194,7 @@ fn process_proxy_payload( return Err(io::Error::new( io::ErrorKind::Other, "invalid proxy protocol address family/transport protocol.", - )) + )); } }; @@ -207,9 +208,14 @@ fn process_proxy_payload( } let subtype = tlv.value.get_u8(); match 
Pp2AwsType::from_repr(subtype) { - Some(Pp2AwsType::VpceId) => { - extra = Some(ConnectionInfoExtra::Aws { vpce_id: tlv.value }); - } + Some(Pp2AwsType::VpceId) => match std::str::from_utf8(&tlv.value) { + Ok(s) => { + extra = Some(ConnectionInfoExtra::Aws { vpce_id: s.into() }); + } + Err(e) => { + tracing::warn!("invalid aws vpce id: {e}"); + } + }, None => { tracing::warn!("unknown aws tlv: subtype={subtype}"); } diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index 2a406fcb34..49566e5172 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -16,7 +16,7 @@ use once_cell::sync::OnceCell; use pq_proto::{BeMessage as Be, CancelKeyData, StartupMessageParams}; use regex::Regex; use serde::{Deserialize, Serialize}; -use smol_str::{format_smolstr, SmolStr}; +use smol_str::{format_smolstr, SmolStr, ToSmolStr}; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; use tokio_util::sync::CancellationToken; @@ -29,7 +29,7 @@ use crate::config::{ProxyConfig, ProxyProtocolV2, TlsConfig}; use crate::context::RequestContext; use crate::error::ReportableError; use crate::metrics::{Metrics, NumClientConnectionsGuard}; -use crate::protocol2::{read_proxy_protocol, ConnectHeader, ConnectionInfo}; +use crate::protocol2::{read_proxy_protocol, ConnectHeader, ConnectionInfo, ConnectionInfoExtra}; use crate::proxy::handshake::{handshake, HandshakeData}; use crate::rate_limiter::EndpointRateLimiter; use crate::stream::{PqStream, Stream}; @@ -100,22 +100,34 @@ pub async fn task_main( debug!("healthcheck received"); return; } - Ok((_socket, ConnectHeader::Missing)) if config.proxy_protocol_v2 == ProxyProtocolV2::Required => { + Ok((_socket, ConnectHeader::Missing)) + if config.proxy_protocol_v2 == ProxyProtocolV2::Required => + { warn!("missing required proxy protocol header"); return; } - Ok((_socket, ConnectHeader::Proxy(_))) if config.proxy_protocol_v2 == ProxyProtocolV2::Rejected => { + Ok((_socket, ConnectHeader::Proxy(_))) + if config.proxy_protocol_v2 == ProxyProtocolV2::Rejected => + { warn!("proxy protocol header not supported"); return; } Ok((socket, ConnectHeader::Proxy(info))) => (socket, info), - Ok((socket, ConnectHeader::Missing)) => (socket, ConnectionInfo { addr: peer_addr, extra: None }), + Ok((socket, ConnectHeader::Missing)) => ( + socket, + ConnectionInfo { + addr: peer_addr, + extra: None, + }, + ), }; match socket.inner.set_nodelay(true) { Ok(()) => {} Err(e) => { - error!("per-client task finished with an error: failed to set socket option: {e:#}"); + error!( + "per-client task finished with an error: failed to set socket option: {e:#}" + ); return; } } @@ -156,10 +168,16 @@ pub async fn task_main( match p.proxy_pass(&config.connect_to_compute).await { Ok(()) => {} Err(ErrorSource::Client(e)) => { - warn!(?session_id, "per-client task finished with an IO error from the client: {e:#}"); + warn!( + ?session_id, + "per-client task finished with an IO error from the client: {e:#}" + ); } Err(ErrorSource::Compute(e)) => { - error!(?session_id, "per-client task finished with an IO error from the compute: {e:#}"); + error!( + ?session_id, + "per-client task finished with an IO error from the compute: {e:#}" + ); } } } @@ -374,9 +392,16 @@ pub(crate) async fn handle_client( let (stream, read_buf) = stream.into_inner(); node.stream.write_all(&read_buf).await?; + let private_link_id = match ctx.extra() { + Some(ConnectionInfoExtra::Aws { vpce_id }) => Some(vpce_id.clone()), + Some(ConnectionInfoExtra::Azure { link_id }) => Some(link_id.to_smolstr()), + 
None => None, + }; + Ok(Some(ProxyPassthrough { client: stream, aux: node.aux.clone(), + private_link_id, compute: node, session_id: ctx.session_id(), cancel: session, diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs index 08871380d6..23b9897155 100644 --- a/proxy/src/proxy/passthrough.rs +++ b/proxy/src/proxy/passthrough.rs @@ -1,3 +1,4 @@ +use smol_str::SmolStr; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::debug; use utils::measured_stream::MeasuredStream; @@ -9,7 +10,7 @@ use crate::config::ComputeConfig; use crate::control_plane::messages::MetricsAuxInfo; use crate::metrics::{Direction, Metrics, NumClientConnectionsGuard, NumConnectionRequestsGuard}; use crate::stream::Stream; -use crate::usage_metrics::{Ids, MetricCounterRecorder, USAGE_METRICS}; +use crate::usage_metrics::{Ids, MetricCounterRecorder, TrafficDirection, USAGE_METRICS}; /// Forward bytes in both directions (client <-> compute). #[tracing::instrument(skip_all)] @@ -17,10 +18,14 @@ pub(crate) async fn proxy_pass( client: impl AsyncRead + AsyncWrite + Unpin, compute: impl AsyncRead + AsyncWrite + Unpin, aux: MetricsAuxInfo, + private_link_id: Option, ) -> Result<(), ErrorSource> { - let usage = USAGE_METRICS.register(Ids { + // we will report ingress at a later date + let usage_tx = USAGE_METRICS.register(Ids { endpoint_id: aux.endpoint_id, branch_id: aux.branch_id, + direction: TrafficDirection::Egress, + private_link_id, }); let metrics = &Metrics::get().proxy.io_bytes; @@ -31,7 +36,7 @@ pub(crate) async fn proxy_pass( |cnt| { // Number of bytes we sent to the client (outbound). metrics.get_metric(m_sent).inc_by(cnt as u64); - usage.record_egress(cnt as u64); + usage_tx.record_egress(cnt as u64); }, ); @@ -61,6 +66,7 @@ pub(crate) struct ProxyPassthrough { pub(crate) compute: PostgresConnection, pub(crate) aux: MetricsAuxInfo, pub(crate) session_id: uuid::Uuid, + pub(crate) private_link_id: Option, pub(crate) cancel: cancellation::Session, pub(crate) _req: NumConnectionRequestsGuard<'static>, @@ -72,7 +78,13 @@ impl ProxyPassthrough { self, compute_config: &ComputeConfig, ) -> Result<(), ErrorSource> { - let res = proxy_pass(self.client, self.compute.stream, self.aux).await; + let res = proxy_pass( + self.client, + self.compute.stream, + self.aux, + self.private_link_id, + ) + .await; if let Err(err) = self .compute .cancel_closure diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index f35c375ba2..70dd7bc0e7 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -75,10 +75,7 @@ impl PoolingBackend { let extra = ctx.extra(); let incoming_endpoint_id = match extra { None => String::new(), - Some(ConnectionInfoExtra::Aws { vpce_id }) => { - // Convert the vcpe_id to a string - String::from_utf8(vpce_id.to_vec()).unwrap_or_default() - } + Some(ConnectionInfoExtra::Aws { vpce_id }) => vpce_id.to_string(), Some(ConnectionInfoExtra::Azure { link_id }) => link_id.to_string(), }; diff --git a/proxy/src/serverless/conn_pool_lib.rs b/proxy/src/serverless/conn_pool_lib.rs index a300198de4..9e21491655 100644 --- a/proxy/src/serverless/conn_pool_lib.rs +++ b/proxy/src/serverless/conn_pool_lib.rs @@ -9,6 +9,7 @@ use clashmap::ClashMap; use parking_lot::RwLock; use postgres_client::ReadyForQueryStatus; use rand::Rng; +use smol_str::ToSmolStr; use tracing::{debug, info, Span}; use super::backend::HttpConnError; @@ -19,8 +20,9 @@ use crate::auth::backend::ComputeUserInfo; use crate::context::RequestContext; use 
crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; +use crate::protocol2::ConnectionInfoExtra; use crate::types::{DbName, EndpointCacheKey, RoleName}; -use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; +use crate::usage_metrics::{Ids, MetricCounter, TrafficDirection, USAGE_METRICS}; #[derive(Debug, Clone)] pub(crate) struct ConnInfo { @@ -473,7 +475,9 @@ where .http_pool_opened_connections .get_metric() .dec_by(clients_removed as i64); - info!("pool: performed global pool gc. removed {clients_removed} clients, total number of clients in pool is {size}"); + info!( + "pool: performed global pool gc. removed {clients_removed} clients, total number of clients in pool is {size}" + ); } let removed = current_len - new_len; @@ -635,15 +639,28 @@ impl Client { (&mut inner.inner, Discard { conn_info, pool }) } - pub(crate) fn metrics(&self) -> Arc { + pub(crate) fn metrics( + &self, + direction: TrafficDirection, + ctx: &RequestContext, + ) -> Arc { let aux = &self .inner .as_ref() .expect("client inner should not be removed") .aux; + + let private_link_id = match ctx.extra() { + None => None, + Some(ConnectionInfoExtra::Aws { vpce_id }) => Some(vpce_id.clone()), + Some(ConnectionInfoExtra::Azure { link_id }) => Some(link_id.to_smolstr()), + }; + USAGE_METRICS.register(Ids { endpoint_id: aux.endpoint_id, branch_id: aux.branch_id, + direction, + private_link_id, }) } } @@ -700,7 +717,9 @@ impl Discard<'_, C> { pub(crate) fn discard(&mut self) { let conn_info = &self.conn_info; if std::mem::take(self.pool).strong_count() > 0 { - info!("pool: throwing away connection '{conn_info}' because connection is potentially in a broken state"); + info!( + "pool: throwing away connection '{conn_info}' because connection is potentially in a broken state" + ); } } } diff --git a/proxy/src/serverless/http_conn_pool.rs b/proxy/src/serverless/http_conn_pool.rs index fde38d0de3..fa21f24a1c 100644 --- a/proxy/src/serverless/http_conn_pool.rs +++ b/proxy/src/serverless/http_conn_pool.rs @@ -5,6 +5,7 @@ use std::sync::{Arc, Weak}; use hyper::client::conn::http2; use hyper_util::rt::{TokioExecutor, TokioIo}; use parking_lot::RwLock; +use smol_str::ToSmolStr; use tokio::net::TcpStream; use tracing::{debug, error, info, info_span, Instrument}; @@ -16,8 +17,9 @@ use super::conn_pool_lib::{ use crate::context::RequestContext; use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; +use crate::protocol2::ConnectionInfoExtra; use crate::types::EndpointCacheKey; -use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; +use crate::usage_metrics::{Ids, MetricCounter, TrafficDirection, USAGE_METRICS}; pub(crate) type Send = http2::SendRequest; pub(crate) type Connect = @@ -264,11 +266,24 @@ impl Client { Self { inner } } - pub(crate) fn metrics(&self) -> Arc { + pub(crate) fn metrics( + &self, + direction: TrafficDirection, + ctx: &RequestContext, + ) -> Arc { let aux = &self.inner.aux; + + let private_link_id = match ctx.extra() { + None => None, + Some(ConnectionInfoExtra::Aws { vpce_id }) => Some(vpce_id.clone()), + Some(ConnectionInfoExtra::Azure { link_id }) => Some(link_id.to_smolstr()), + }; + USAGE_METRICS.register(Ids { endpoint_id: aux.endpoint_id, branch_id: aux.branch_id, + direction, + private_link_id, }) } } diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 5982fe225d..7c21d90ed8 100644 --- 
a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -42,7 +42,7 @@ use crate::metrics::{HttpDirection, Metrics}; use crate::proxy::{run_until_cancelled, NeonOptions}; use crate::serverless::backend::HttpConnError; use crate::types::{DbName, RoleName}; -use crate::usage_metrics::{MetricCounter, MetricCounterRecorder}; +use crate::usage_metrics::{MetricCounter, MetricCounterRecorder, TrafficDirection}; #[derive(serde::Deserialize)] #[serde(rename_all = "camelCase")] @@ -209,7 +209,7 @@ fn get_conn_info( } } Some(url::Host::Ipv4(_) | url::Host::Ipv6(_)) | None => { - return Err(ConnInfoError::MissingHostname) + return Err(ConnInfoError::MissingHostname); } }; ctx.set_endpoint_id(endpoint.clone()); @@ -745,7 +745,7 @@ async fn handle_db_inner( } }; - let metrics = client.metrics(); + let metrics = client.metrics(TrafficDirection::Egress, ctx); let len = json_output.len(); let response = response @@ -818,7 +818,7 @@ async fn handle_auth_broker_inner( .expect("all headers and params received via hyper should be valid for request"); // todo: map body to count egress - let _metrics = client.metrics(); + let _metrics = client.metrics(TrafficDirection::Egress, ctx); Ok(client .inner @@ -1118,10 +1118,10 @@ enum Discard<'a> { } impl Client { - fn metrics(&self) -> Arc { + fn metrics(&self, direction: TrafficDirection, ctx: &RequestContext) -> Arc { match self { - Client::Remote(client) => client.metrics(), - Client::Local(local_client) => local_client.metrics(), + Client::Remote(client) => client.metrics(direction, ctx), + Client::Local(local_client) => local_client.metrics(direction, ctx), } } diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs index d369e3742f..6a23f0e129 100644 --- a/proxy/src/usage_metrics.rs +++ b/proxy/src/usage_metrics.rs @@ -16,6 +16,7 @@ use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_S use once_cell::sync::Lazy; use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel}; use serde::{Deserialize, Serialize}; +use smol_str::SmolStr; use tokio::io::AsyncWriteExt; use tokio_util::sync::CancellationToken; use tracing::{error, info, instrument, trace, warn}; @@ -43,6 +44,37 @@ const HTTP_REPORTING_RETRY_DURATION: Duration = Duration::from_secs(60); pub(crate) struct Ids { pub(crate) endpoint_id: EndpointIdInt, pub(crate) branch_id: BranchIdInt, + pub(crate) direction: TrafficDirection, + #[serde(with = "none_as_empty_string")] + pub(crate) private_link_id: Option, +} + +mod none_as_empty_string { + use serde::Deserialize; + use smol_str::SmolStr; + + #[allow(clippy::ref_option)] + pub fn serialize(t: &Option, s: S) -> Result { + s.serialize_str(t.as_deref().unwrap_or("")) + } + + pub fn deserialize<'de, D: serde::Deserializer<'de>>( + d: D, + ) -> Result, D::Error> { + let s = SmolStr::deserialize(d)?; + if s.is_empty() { + Ok(None) + } else { + Ok(Some(s)) + } + } +} + +#[derive(Eq, Hash, PartialEq, Serialize, Deserialize, Debug, Clone)] +#[serde(rename_all = "lowercase")] +pub(crate) enum TrafficDirection { + Ingress, + Egress, } pub(crate) trait MetricCounterRecorder { @@ -505,6 +537,8 @@ mod tests { let counter = metrics.register(Ids { endpoint_id: (&EndpointId::from("e1")).into(), branch_id: (&BranchId::from("b1")).into(), + direction: TrafficDirection::Egress, + private_link_id: None, }); // the counter should be observed despite 0 egress From 2a5d7e5a78f7d699ee6590220609111bd93b07f6 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 24 Feb 2025 12:22:22 +0000 Subject: [PATCH 
559/583] tests: improve compat test coverage of controller-pageserver interaction (#10848) ## Problem We failed to detect https://github.com/neondatabase/neon/pull/10845 before merging, because the tests we run with a matrix of component versions didn't include the ones that did live migrations. ## Summary of changes - Do a live migration during the storage controller smoke test, since this is a pretty core piece of functionality - Apply a compat version matrix to the graceful cluster restart test, since this is the functionality that we most urgently need to work across versions to make deploys work. I expect the first CI run of this to fail, because https://github.com/neondatabase/neon/pull/10845 isn't merged yet. --- test_runner/regress/test_storage_controller.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index d18cbb3393..d5acc257b2 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -182,6 +182,13 @@ def test_storage_controller_smoke(neon_env_builder: NeonEnvBuilder, combination) time.sleep(1) assert get_node_shard_counts(env, tenant_ids)[env.pageservers[0].id] == 0 + # Exercise live migration of a tenant back to the original pageserver + migrate_tenant = env.pageservers[1].http_client().tenant_list_locations()["tenant_shards"][0][0] + env.storage_controller.tenant_shard_migrate( + TenantShardId.parse(migrate_tenant), env.pageservers[0].id + ) + assert get_node_shard_counts(env, tenant_ids)[env.pageservers[0].id] == 1 + # Restarting a pageserver should not detach any tenants (i.e. /re-attach works) before_restart = env.pageservers[1].http_client().tenant_list_locations() env.pageservers[1].stop() @@ -2139,8 +2146,9 @@ def test_tenant_import(neon_env_builder: NeonEnvBuilder, shard_count, remote_sto workload.validate() +@pytest.mark.parametrize(**fixtures.utils.allpairs_versions()) @pytest.mark.parametrize("num_azs", [1, 2]) -def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder, num_azs: int): +def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder, num_azs: int, combination): """ Graceful reststart of storage controller clusters use the drain and fill hooks in order to migrate attachments away from pageservers before From 17724a19e689f8984dd29281d30a1aff63fd1f4f Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 24 Feb 2025 15:07:14 +0000 Subject: [PATCH 560/583] CI(allure-reports): update dependencies and cleanup code (#10794) ## Problem There are a bunch of minor improvements that are too small and insignificant as is, so collecting them in one PR. 
## Summary of changes - Add runner arch to artifact name to make it easier to distinguish files on S3 ([ref](https://neondb.slack.com/archives/C059ZC138NR/p1739365938371149)) - Use `github.event.pull_request.number` instead of parsing `$GITHUB_EVENT_PATH` file - Update Allure CLI and `allure-pytest` --- .../actions/allure-report-generate/action.yml | 12 ++-- .../actions/allure-report-store/action.yml | 8 +-- .../actions/run-python-test-set/action.yml | 2 +- poetry.lock | 60 +++++++++---------- pyproject.toml | 2 +- 5 files changed, 42 insertions(+), 42 deletions(-) diff --git a/.github/actions/allure-report-generate/action.yml b/.github/actions/allure-report-generate/action.yml index d07e3e32e8..b85ca7874d 100644 --- a/.github/actions/allure-report-generate/action.yml +++ b/.github/actions/allure-report-generate/action.yml @@ -38,9 +38,11 @@ runs: # - name: Set variables shell: bash -euxo pipefail {0} + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BUCKET: neon-github-public-dev run: | - PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true) - if [ "${PR_NUMBER}" != "null" ]; then + if [ -n "${PR_NUMBER}" ]; then BRANCH_OR_PR=pr-${PR_NUMBER} elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || \ [ "${GITHUB_REF_NAME}" = "release-proxy" ] || [ "${GITHUB_REF_NAME}" = "release-compute" ]; then @@ -59,8 +61,6 @@ runs: echo "LOCK_FILE=${LOCK_FILE}" >> $GITHUB_ENV echo "WORKDIR=${WORKDIR}" >> $GITHUB_ENV echo "BUCKET=${BUCKET}" >> $GITHUB_ENV - env: - BUCKET: neon-github-public-dev # TODO: We can replace with a special docker image with Java and Allure pre-installed - uses: actions/setup-java@v4 @@ -80,8 +80,8 @@ runs: rm -f ${ALLURE_ZIP} fi env: - ALLURE_VERSION: 2.27.0 - ALLURE_ZIP_SHA256: b071858fb2fa542c65d8f152c5c40d26267b2dfb74df1f1608a589ecca38e777 + ALLURE_VERSION: 2.32.2 + ALLURE_ZIP_SHA256: 3f28885e2118f6317c92f667eaddcc6491400af1fb9773c1f3797a5fa5174953 - uses: aws-actions/configure-aws-credentials@v4 if: ${{ !cancelled() }} diff --git a/.github/actions/allure-report-store/action.yml b/.github/actions/allure-report-store/action.yml index 8548a886cf..687bfd49af 100644 --- a/.github/actions/allure-report-store/action.yml +++ b/.github/actions/allure-report-store/action.yml @@ -18,9 +18,11 @@ runs: steps: - name: Set variables shell: bash -euxo pipefail {0} + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + REPORT_DIR: ${{ inputs.report-dir }} run: | - PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true) - if [ "${PR_NUMBER}" != "null" ]; then + if [ -n "${PR_NUMBER}" ]; then BRANCH_OR_PR=pr-${PR_NUMBER} elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || \ [ "${GITHUB_REF_NAME}" = "release-proxy" ] || [ "${GITHUB_REF_NAME}" = "release-compute" ]; then @@ -32,8 +34,6 @@ runs: echo "BRANCH_OR_PR=${BRANCH_OR_PR}" >> $GITHUB_ENV echo "REPORT_DIR=${REPORT_DIR}" >> $GITHUB_ENV - env: - REPORT_DIR: ${{ inputs.report-dir }} - uses: aws-actions/configure-aws-credentials@v4 if: ${{ !cancelled() }} diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 0eddfe5da6..122fe48b68 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -236,5 +236,5 @@ runs: uses: ./.github/actions/allure-report-store with: report-dir: /tmp/test_output/allure/results - unique-key: ${{ inputs.build_type }}-${{ inputs.pg_version }} + unique-key: ${{ inputs.build_type 
}}-${{ inputs.pg_version }}-${{ runner.arch }} aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }} diff --git a/poetry.lock b/poetry.lock index d66c3aae7a..ba3b0535e4 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.0.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.1.1 and should not be changed by hand. [[package]] name = "aiohappyeyeballs" @@ -122,7 +122,7 @@ multidict = ">=4.5,<7.0" yarl = ">=1.12.0,<2.0" [package.extras] -speedups = ["Brotli", "aiodns (>=3.2.0)", "brotlicffi"] +speedups = ["Brotli ; platform_python_implementation == \"CPython\"", "aiodns (>=3.2.0) ; sys_platform == \"linux\" or sys_platform == \"darwin\"", "brotlicffi ; platform_python_implementation != \"CPython\""] [[package]] name = "aiopg" @@ -160,30 +160,30 @@ frozenlist = ">=1.1.0" [[package]] name = "allure-pytest" -version = "2.13.2" +version = "2.13.5" description = "Allure pytest integration" optional = false python-versions = "*" groups = ["main"] files = [ - {file = "allure-pytest-2.13.2.tar.gz", hash = "sha256:22243159e8ec81ce2b5254b4013802198821b1b42f118f69d4a289396607c7b3"}, - {file = "allure_pytest-2.13.2-py3-none-any.whl", hash = "sha256:17de9dbee7f61c8e66a5b5e818b00e419dbcea44cb55c24319401ba813220690"}, + {file = "allure-pytest-2.13.5.tar.gz", hash = "sha256:0ef8e1790c44a988db6b83c4d4f5e91451e2c4c8ea10601dfa88528d23afcf6e"}, + {file = "allure_pytest-2.13.5-py3-none-any.whl", hash = "sha256:94130bac32964b78058e62cf4b815ad97a5ac82a065e6dd2d43abac2be7640fc"}, ] [package.dependencies] -allure-python-commons = "2.13.2" +allure-python-commons = "2.13.5" pytest = ">=4.5.0" [[package]] name = "allure-python-commons" -version = "2.13.2" -description = "Common module for integrate allure with python-based frameworks" +version = "2.13.5" +description = "('Contains the API for end users as well as helper functions and classes to build Allure adapters for Python test frameworks',)" optional = false python-versions = ">=3.6" groups = ["main"] files = [ - {file = "allure-python-commons-2.13.2.tar.gz", hash = "sha256:8a03681330231b1deadd86b97ff68841c6591320114ae638570f1ed60d7a2033"}, - {file = "allure_python_commons-2.13.2-py3-none-any.whl", hash = "sha256:2bb3646ec3fbf5b36d178a5e735002bc130ae9f9ba80f080af97d368ba375051"}, + {file = "allure-python-commons-2.13.5.tar.gz", hash = "sha256:a232e7955811f988e49a4c1dd6c16cce7e9b81d0ea0422b1e5654d3254e2caf3"}, + {file = "allure_python_commons-2.13.5-py3-none-any.whl", hash = "sha256:8b0e837b6e32d810adec563f49e1d04127a5b6770e0232065b7cb09b9953980d"}, ] [package.dependencies] @@ -232,7 +232,7 @@ sniffio = ">=1.1" [package.extras] doc = ["Sphinx (>=7)", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-theme"] -test = ["anyio[trio]", "coverage[toml] (>=7)", "exceptiongroup (>=1.2.0)", "hypothesis (>=4.0)", "psutil (>=5.9)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (>=0.17)"] +test = ["anyio[trio]", "coverage[toml] (>=7)", "exceptiongroup (>=1.2.0)", "hypothesis (>=4.0)", "psutil (>=5.9)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (>=0.17) ; platform_python_implementation == \"CPython\" and platform_system != \"Windows\""] trio = ["trio (>=0.23)"] [[package]] @@ -308,8 +308,8 @@ files = [ [package.extras] docs = ["Sphinx (>=8.1.3,<8.2.0)", "sphinx-rtd-theme (>=1.2.2)"] -gssauth = ["gssapi", "sspilib"] -test = ["distro (>=1.9.0,<1.10.0)", "flake8 (>=6.1,<7.0)", "flake8-pyi (>=24.1.0,<24.2.0)", "gssapi", "k5test", "mypy 
(>=1.8.0,<1.9.0)", "sspilib", "uvloop (>=0.15.3)"] +gssauth = ["gssapi ; platform_system != \"Windows\"", "sspilib ; platform_system == \"Windows\""] +test = ["distro (>=1.9.0,<1.10.0)", "flake8 (>=6.1,<7.0)", "flake8-pyi (>=24.1.0,<24.2.0)", "gssapi ; platform_system == \"Linux\"", "k5test ; platform_system == \"Linux\"", "mypy (>=1.8.0,<1.9.0)", "sspilib ; platform_system == \"Windows\"", "uvloop (>=0.15.3) ; platform_system != \"Windows\" and python_version < \"3.14.0\""] [[package]] name = "attrs" @@ -324,10 +324,10 @@ files = [ ] [package.extras] -dev = ["cloudpickle", "coverage[toml] (>=5.0.2)", "furo", "hypothesis", "mypy", "pre-commit", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six", "sphinx", "sphinx-notfound-page", "zope.interface"] +dev = ["cloudpickle ; platform_python_implementation == \"CPython\"", "coverage[toml] (>=5.0.2)", "furo", "hypothesis", "mypy", "pre-commit", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six", "sphinx", "sphinx-notfound-page", "zope.interface"] docs = ["furo", "sphinx", "sphinx-notfound-page", "zope.interface"] -tests = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six", "zope.interface"] -tests-no-zope = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six"] +tests = ["cloudpickle ; platform_python_implementation == \"CPython\"", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six", "zope.interface"] +tests-no-zope = ["cloudpickle ; platform_python_implementation == \"CPython\"", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six"] [[package]] name = "aws-sam-translator" @@ -1074,10 +1074,10 @@ files = [ cffi = {version = ">=1.12", markers = "platform_python_implementation != \"PyPy\""} [package.extras] -docs = ["sphinx (>=5.3.0)", "sphinx-rtd-theme (>=3.0.0)"] +docs = ["sphinx (>=5.3.0)", "sphinx-rtd-theme (>=3.0.0) ; python_version >= \"3.8\""] docstest = ["pyenchant (>=3)", "readme-renderer (>=30.0)", "sphinxcontrib-spelling (>=7.3.1)"] -nox = ["nox (>=2024.4.15)", "nox[uv] (>=2024.3.2)"] -pep8test = ["check-sdist", "click (>=8.0.1)", "mypy (>=1.4)", "ruff (>=0.3.6)"] +nox = ["nox (>=2024.4.15)", "nox[uv] (>=2024.3.2) ; python_version >= \"3.8\""] +pep8test = ["check-sdist ; python_version >= \"3.8\"", "click (>=8.0.1)", "mypy (>=1.4)", "ruff (>=0.3.6)"] sdist = ["build (>=1.0.0)"] ssh = ["bcrypt (>=3.1.5)"] test = ["certifi (>=2024)", "cryptography-vectors (==44.0.1)", "pretend (>=0.7)", "pytest (>=7.4.0)", "pytest-benchmark (>=4.0)", "pytest-cov (>=2.10.1)", "pytest-xdist (>=3.5.0)"] @@ -1359,7 +1359,7 @@ idna = "*" sniffio = "*" [package.extras] -brotli = ["brotli", "brotlicffi"] +brotli = ["brotli ; platform_python_implementation == \"CPython\"", "brotlicffi ; platform_python_implementation != \"CPython\""] cli = ["click (==8.*)", "pygments (==2.*)", "rich (>=10,<14)"] http2 = ["h2 (>=3,<5)"] socks = ["socksio (==1.*)"] @@ -1545,8 +1545,8 @@ files = [ [package.extras] docs = ["jaraco.packaging (>=3.2)", "rst.linker (>=1.9)", "sphinx"] -testing = ["ecdsa", "enum34", "feedparser", "jsonlib", "numpy", "pandas", "pymongo", "pytest (>=3.5,!=3.7.3)", "pytest-black-multipy", "pytest-checkdocs (>=1.2.3)", "pytest-cov", "pytest-flake8 (<1.1.0)", "pytest-flake8 (>=1.1.1)", "scikit-learn", "sqlalchemy"] -testing-libs = ["simplejson", "ujson", "yajl"] +testing = ["ecdsa", 
"enum34 ; python_version == \"2.7\"", "feedparser", "jsonlib ; python_version == \"2.7\"", "numpy", "pandas", "pymongo", "pytest (>=3.5,!=3.7.3)", "pytest-black-multipy", "pytest-checkdocs (>=1.2.3)", "pytest-cov", "pytest-flake8 (<1.1.0) ; python_version <= \"3.6\"", "pytest-flake8 (>=1.1.1) ; python_version >= \"3.7\"", "scikit-learn", "sqlalchemy"] +testing-libs = ["simplejson", "ujson", "yajl ; python_version == \"2.7\""] [[package]] name = "jsonpointer" @@ -1867,7 +1867,7 @@ files = [ [package.extras] develop = ["codecov", "pycodestyle", "pytest (>=4.6)", "pytest-cov", "wheel"] docs = ["sphinx"] -gmpy = ["gmpy2 (>=2.1.0a4)"] +gmpy = ["gmpy2 (>=2.1.0a4) ; platform_python_implementation != \"PyPy\""] tests = ["pytest (>=4.6)"] [[package]] @@ -2330,7 +2330,7 @@ files = [ ] [package.extras] -test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"] +test = ["enum34 ; python_version <= \"3.4\"", "ipaddress ; python_version < \"3.0\"", "mock ; python_version < \"3.0\"", "pywin32 ; sys_platform == \"win32\"", "wmi ; sys_platform == \"win32\""] [[package]] name = "psycopg2-binary" @@ -2456,7 +2456,7 @@ typing-extensions = ">=4.12.2" [package.extras] email = ["email-validator (>=2.0.0)"] -timezone = ["tzdata"] +timezone = ["tzdata ; python_version >= \"3.9\" and platform_system == \"Windows\""] [[package]] name = "pydantic-core" @@ -3068,7 +3068,7 @@ requests = ">=2.30.0,<3.0" urllib3 = ">=1.25.10,<3.0" [package.extras] -tests = ["coverage (>=6.0.0)", "flake8", "mypy", "pytest (>=7.0.0)", "pytest-asyncio", "pytest-cov", "pytest-httpserver", "tomli", "tomli-w", "types-PyYAML", "types-requests"] +tests = ["coverage (>=6.0.0)", "flake8", "mypy", "pytest (>=7.0.0)", "pytest-asyncio", "pytest-cov", "pytest-httpserver", "tomli ; python_version < \"3.11\"", "tomli-w", "types-PyYAML", "types-requests"] [[package]] name = "rfc3339-validator" @@ -3161,7 +3161,7 @@ files = [ [package.extras] docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"] -testing = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "mypy (==1.9)", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.1)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy", "pytest-perf", "pytest-ruff (>=0.2.1)", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] +testing = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21) ; python_version >= \"3.9\" and sys_platform != \"cygwin\"", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "mypy (==1.9)", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.1)", "pytest-checkdocs (>=2.4)", "pytest-cov ; platform_python_implementation != \"PyPy\"", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy", "pytest-perf ; sys_platform != \"cygwin\"", "pytest-ruff (>=0.2.1) ; sys_platform != \"cygwin\"", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] [[package]] name = "six" @@ -3407,8 
+3407,8 @@ files = [ ] [package.extras] -brotli = ["brotli (==1.0.9)", "brotli (>=1.0.9)", "brotlicffi (>=0.8.0)", "brotlipy (>=0.6.0)"] -secure = ["certifi", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "ipaddress", "pyOpenSSL (>=0.14)", "urllib3-secure-extra"] +brotli = ["brotli (==1.0.9) ; os_name != \"nt\" and python_version < \"3\" and platform_python_implementation == \"CPython\"", "brotli (>=1.0.9) ; python_version >= \"3\" and platform_python_implementation == \"CPython\"", "brotlicffi (>=0.8.0) ; (os_name != \"nt\" or python_version >= \"3\") and platform_python_implementation != \"CPython\"", "brotlipy (>=0.6.0) ; os_name == \"nt\" and python_version < \"3\""] +secure = ["certifi", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "ipaddress ; python_version == \"2.7\"", "pyOpenSSL (>=0.14)", "urllib3-secure-extra"] socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] [[package]] @@ -3820,4 +3820,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.1" python-versions = "^3.11" -content-hash = "00ddc42c32e235b6171845fc066dcab078282ed832cd464d5e8a0afa959dd04a" +content-hash = "9711c5479c867fa614ce3d352f1bbc63dba1cb2376d347f96fbeda6f512ee308" diff --git a/pyproject.toml b/pyproject.toml index 92a660c233..c6e5073bcd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,7 @@ prometheus-client = "^0.14.1" pytest-timeout = "^2.3.1" Werkzeug = "^3.0.6" pytest-order = "^1.1.0" -allure-pytest = "^2.13.2" +allure-pytest = "^2.13.5" pytest-asyncio = "^0.21.0" toml = "^0.10.2" psutil = "^5.9.4" From 459446fcb8e259cef9b6df2537bc042c427acdf1 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Mon, 24 Feb 2025 15:21:17 +0000 Subject: [PATCH 561/583] pageserver: include visible layers in heatmaps after unarchival (#10880) ## Problem https://github.com/neondatabase/neon/pull/10788 introduced an API for warming up attached locations by downloading all layers in the heatmap. We intend to use it for warming up timelines after unarchival too, but it doesn't work. Any heatmap generated after the unarchival will not include our timeline, so we've lost all those layers. ## Summary of changes Generate a cheeky heatmap on unarchival. It includes all the visible layers. Use that as the `PreviousHeatmap` which inputs into actual heatmap generation. Closes: https://github.com/neondatabase/neon/issues/10541 --- pageserver/src/tenant.rs | 33 ++++++ pageserver/src/tenant/timeline.rs | 41 ++++++- pageserver/src/tenant/timeline/compaction.rs | 2 +- .../src/tenant/timeline/layer_manager.rs | 10 +- .../regress/test_pageserver_secondary.py | 101 +++++++++++++++--- 5 files changed, 168 insertions(+), 19 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index efb35625f2..56718f5294 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1189,6 +1189,39 @@ impl Tenant { format!("Failed to load layermap for timeline {tenant_id}/{timeline_id}") })?; + // When unarchiving, we've most likely lost the heatmap generated prior + // to the archival operation. To allow warming this timeline up, generate + // a previous heatmap which contains all visible layers in the layer map. + // This previous heatmap will be used whenever a fresh heatmap is generated + // for the timeline.
+ if matches!(cause, LoadTimelineCause::Unoffload) { + let mut tline_ending_at = Some((&timeline, timeline.get_last_record_lsn())); + while let Some((tline, end_lsn)) = tline_ending_at { + let unarchival_heatmap = tline.generate_unarchival_heatmap(end_lsn).await; + if !tline.is_previous_heatmap_active() { + tline + .previous_heatmap + .store(Some(Arc::new(unarchival_heatmap))); + } else { + tracing::info!("Previous heatmap still active. Dropping unarchival heatmap.") + } + + match tline.ancestor_timeline() { + Some(ancestor) => { + if ancestor.update_layer_visibility().await.is_err() { + // Ancestor timeline is shutting down. + break; + } + + tline_ending_at = Some((ancestor, tline.get_ancestor_lsn())); + } + None => { + tline_ending_at = None; + } + } + } + } + match import_pgdata { Some(import_pgdata) if !import_pgdata.is_done() => { match cause { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 30de4d90dc..319c5e3d87 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -468,7 +468,7 @@ pub struct Timeline { /// If Some, collects GetPage metadata for an ongoing PageTrace. pub(crate) page_trace: ArcSwapOption>, - previous_heatmap: ArcSwapOption, + pub(super) previous_heatmap: ArcSwapOption, /// May host a background Tokio task which downloads all the layers from the current /// heatmap on demand. @@ -3524,6 +3524,14 @@ impl Timeline { Ok(layer) } + pub(super) fn is_previous_heatmap_active(&self) -> bool { + self.previous_heatmap + .load() + .as_ref() + .map(|prev| matches!(**prev, PreviousHeatmap::Active { .. })) + .unwrap_or(false) + } + /// The timeline heatmap is a hint to secondary locations from the primary location, /// indicating which layers are currently on-disk on the primary. /// @@ -3596,6 +3604,7 @@ impl Timeline { Some(non_resident) => { let mut non_resident = non_resident.peekable(); if non_resident.peek().is_none() { + tracing::info!(timeline_id=%self.timeline_id, "Previous heatmap now obsolete"); self.previous_heatmap .store(Some(PreviousHeatmap::Obsolete.into())); } @@ -3627,6 +3636,36 @@ impl Timeline { Some(HeatMapTimeline::new(self.timeline_id, layers)) } + pub(super) async fn generate_unarchival_heatmap(&self, end_lsn: Lsn) -> PreviousHeatmap { + let guard = self.layers.read().await; + + let now = SystemTime::now(); + let mut heatmap_layers = Vec::default(); + for vl in guard.visible_layers() { + if vl.layer_desc().get_lsn_range().start >= end_lsn { + continue; + } + + let hl = HeatMapLayer { + name: vl.layer_desc().layer_name(), + metadata: vl.metadata(), + access_time: now, + }; + heatmap_layers.push(hl); + } + + tracing::info!( + "Generating unarchival heatmap with {} layers", + heatmap_layers.len() + ); + + let heatmap = HeatMapTimeline::new(self.timeline_id, heatmap_layers); + PreviousHeatmap::Active { + heatmap, + read_at: Instant::now(), + } + } + /// Returns true if the given lsn is or was an ancestor branchpoint. pub(crate) fn is_ancestor_lsn(&self, lsn: Lsn) -> bool { // upon timeline detach, we set the ancestor_lsn to Lsn::INVALID and the store the original diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 0361ce8cd1..d75591bd74 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -1020,7 +1020,7 @@ impl Timeline { /// /// The result may be used as an input to eviction and secondary downloads to de-prioritize layers /// that we know won't be needed for reads. 
- pub(super) async fn update_layer_visibility( + pub(crate) async fn update_layer_visibility( &self, ) -> Result<(), super::layer_manager::Shutdown> { let head_lsn = self.get_last_record_lsn(); diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index cb7783d779..60e36a5d4d 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -15,8 +15,8 @@ use crate::{ tenant::{ layer_map::{BatchedUpdates, LayerMap}, storage_layer::{ - AsLayerDesc, InMemoryLayer, Layer, PersistentLayerDesc, PersistentLayerKey, - ResidentLayer, + AsLayerDesc, InMemoryLayer, Layer, LayerVisibilityHint, PersistentLayerDesc, + PersistentLayerKey, ResidentLayer, }, }, }; @@ -118,6 +118,12 @@ impl LayerManager { self.layers().values().filter(|l| l.is_likely_resident()) } + pub(crate) fn visible_layers(&self) -> impl Iterator + '_ { + self.layers() + .values() + .filter(|l| l.visibility() == LayerVisibilityHint::Visible) + } + pub(crate) fn contains(&self, layer: &Layer) -> bool { self.contains_key(&layer.layer_desc().key()) } diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 602d493ae6..a9b897b741 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -8,9 +8,10 @@ from pathlib import Path from typing import TYPE_CHECKING import pytest -from fixtures.common_types import TenantId, TenantShardId, TimelineId +from fixtures.common_types import TenantId, TenantShardId, TimelineArchivalState, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( + DEFAULT_BRANCH_NAME, NeonEnvBuilder, NeonPageserver, StorageControllerMigrationConfig, @@ -927,8 +928,12 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder): workload.write_rows(128, upload=True) workload.write_rows(128, upload=True) workload.write_rows(128, upload=True) + + child_timeline_id = env.create_branch( + "foo", tenant_id, ancestor_branch_name=DEFAULT_BRANCH_NAME + ) + workload.write_rows(128, upload=True) - workload.stop() # Expect lots of layers assert len(ps_attached.list_layers(tenant_id, timeline_id)) > 10 @@ -937,9 +942,19 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder): for ps in env.pageservers: ps.http_client().configure_failpoints([("secondary-layer-download-sleep", "return(1000)")]) + def timeline_heatmap(tlid): + assert env.pageserver_remote_storage is not None + + heatmap = env.pageserver_remote_storage.heatmap_content(tenant_id) + for htl in heatmap["timelines"]: + if htl["timeline_id"] == str(tlid): + return htl + + raise RuntimeError(f"No heatmap for timeline: {tlid}") + # Upload a heatmap, so that secondaries have something to download ps_attached.http_client().tenant_heatmap_upload(tenant_id) - heatmap_before_migration = env.pageserver_remote_storage.heatmap_content(tenant_id) + heatmap_before_migration = timeline_heatmap(timeline_id) # This has no chance to succeed: we have lots of layers and each one takes at least 1000ms. # However, it pulls the heatmap, which will be important later. 
@@ -971,17 +986,12 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder): assert env.storage_controller.locate(tenant_id)[0]["node_id"] == ps_secondary.id ps_secondary.http_client().tenant_heatmap_upload(tenant_id) - heatmap_after_migration = env.pageserver_remote_storage.heatmap_content(tenant_id) + heatmap_after_migration = timeline_heatmap(timeline_id) - assert len(heatmap_before_migration["timelines"][0]["layers"]) > 0 + assert len(heatmap_before_migration["layers"]) > 0 - # The new layer map should contain all the layers in the pre-migration one - # and a new in memory layer - after_migration_heatmap_layers_count = len(heatmap_after_migration["timelines"][0]["layers"]) - assert ( - len(heatmap_before_migration["timelines"][0]["layers"]) + 1 - == after_migration_heatmap_layers_count - ) + after_migration_heatmap_layers_count = len(heatmap_after_migration["layers"]) + assert len(heatmap_before_migration["layers"]) <= after_migration_heatmap_layers_count log.info(f"Heatmap size after cold migration is {after_migration_heatmap_layers_count}") @@ -989,10 +999,71 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder): TenantShardId(tenant_id, shard_number=0, shard_count=0), timeline_id ) - def all_layers_downloaded(): + # Now simulate the case where a child timeline is archived, parent layers + # are evicted and the child is unarchived. When the child is unarchived, + # itself and the parent update their heatmaps to contain layers needed by the + # child. One can warm up the timeline hierarchy since the heatmaps are ready. + + def all_layers_downloaded(expected_layer_count: int): local_layers_count = len(ps_secondary.list_layers(tenant_id, timeline_id)) log.info(f"{local_layers_count=} {after_migration_heatmap_layers_count=}") - assert local_layers_count == after_migration_heatmap_layers_count + assert local_layers_count >= expected_layer_count - wait_until(all_layers_downloaded) + wait_until(lambda: all_layers_downloaded(after_migration_heatmap_layers_count)) + ps_secondary.http_client().tenant_heatmap_upload(tenant_id) + + before = ( + ps_secondary.http_client() + .get_metrics() + .query_one("pageserver_remote_ondemand_downloaded_layers_total") + .value + ) + workload.validate() + after = ( + ps_secondary.http_client() + .get_metrics() + .query_one("pageserver_remote_ondemand_downloaded_layers_total") + .value + ) + + workload.stop() + assert before == after + + def check_archival_state(state: TimelineArchivalState, tline): + timelines = ( + timeline["timeline_id"] + for timeline in ps_secondary.http_client().timeline_list(tenant_id=tenant_id) + ) + + if state == TimelineArchivalState.ARCHIVED: + assert str(tline) not in timelines + elif state == TimelineArchivalState.UNARCHIVED: + assert str(tline) in timelines + + ps_secondary.http_client().timeline_archival_config( + tenant_id, child_timeline_id, TimelineArchivalState.ARCHIVED + ) + ps_secondary.http_client().timeline_offload(tenant_id, child_timeline_id) + wait_until(lambda: check_archival_state(TimelineArchivalState.ARCHIVED, child_timeline_id)) + + ps_secondary.http_client().evict_all_layers(tenant_id, timeline_id) + ps_secondary.http_client().tenant_heatmap_upload(tenant_id) + assert len(timeline_heatmap(timeline_id)["layers"]) == 0 + + ps_secondary.http_client().timeline_archival_config( + tenant_id, child_timeline_id, TimelineArchivalState.UNARCHIVED + ) + wait_until(lambda: check_archival_state(TimelineArchivalState.UNARCHIVED, child_timeline_id)) + + 
ps_secondary.http_client().tenant_heatmap_upload(tenant_id) + log.info(f"Parent timeline heatmap size: {len(timeline_heatmap(timeline_id)['layers'])}") + log.info(f"Child timeline heatmap size: {len(timeline_heatmap(child_timeline_id)['layers'])}") + + expected_locally = len(timeline_heatmap(timeline_id)["layers"]) + assert expected_locally > 0 + + env.storage_controller.download_heatmap_layers( + TenantShardId(tenant_id, shard_number=0, shard_count=0), timeline_id + ) + wait_until(lambda: all_layers_downloaded(expected_locally)) From fdde58120c2e64815469f44d1abea2c413dbdb9f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 24 Feb 2025 16:26:28 +0100 Subject: [PATCH 562/583] Upgrade proxy crates to edition 2024 (#10942) This upgrades the `proxy/` crate as well as the forked libraries in `libs/proxy/` to edition 2024. Also reformats the imports of those forked libraries via: ``` cargo +nightly fmt -p proxy -p postgres-protocol2 -p postgres-types2 -p tokio-postgres2 -- -l --config imports_granularity=Module,group_imports=StdExternalCrate,reorder_imports=true ``` It can be read commit-by-commit: the first commit has no formatting changes, only changes to accomodate the new edition. Part of #10918 --- libs/proxy/postgres-protocol2/Cargo.toml | 2 +- .../src/authentication/sasl.rs | 14 +++---- libs/proxy/postgres-protocol2/src/lib.rs | 3 +- .../postgres-protocol2/src/message/backend.rs | 8 ++-- .../src/message/frontend.rs | 8 ++-- .../postgres-protocol2/src/password/mod.rs | 3 +- .../proxy/postgres-protocol2/src/types/mod.rs | 7 ++-- libs/proxy/postgres-types2/Cargo.toml | 2 +- libs/proxy/postgres-types2/src/lib.rs | 9 ++-- libs/proxy/postgres-types2/src/private.rs | 6 ++- libs/proxy/tokio-postgres2/Cargo.toml | 2 +- .../proxy/tokio-postgres2/src/cancel_query.rs | 7 ++-- .../tokio-postgres2/src/cancel_query_raw.rs | 7 ++-- .../proxy/tokio-postgres2/src/cancel_token.rs | 10 ++--- libs/proxy/tokio-postgres2/src/client.rs | 41 +++++++++---------- libs/proxy/tokio-postgres2/src/codec.rs | 3 +- libs/proxy/tokio-postgres2/src/config.rs | 20 ++++----- libs/proxy/tokio-postgres2/src/connect.rs | 7 ++-- libs/proxy/tokio-postgres2/src/connect_raw.rs | 32 ++++++++------- .../tokio-postgres2/src/connect_socket.rs | 6 ++- libs/proxy/tokio-postgres2/src/connect_tls.rs | 13 +++--- libs/proxy/tokio-postgres2/src/connection.rs | 24 ++++++----- libs/proxy/tokio-postgres2/src/error/mod.rs | 6 +-- .../tokio-postgres2/src/generic_client.rs | 3 +- libs/proxy/tokio-postgres2/src/lib.rs | 3 +- .../tokio-postgres2/src/maybe_tls_stream.rs | 4 +- libs/proxy/tokio-postgres2/src/prepare.rs | 23 ++++++----- libs/proxy/tokio-postgres2/src/query.rs | 30 +++++++------- libs/proxy/tokio-postgres2/src/row.rs | 15 +++---- .../proxy/tokio-postgres2/src/simple_query.rs | 24 ++++++----- libs/proxy/tokio-postgres2/src/statement.rs | 15 ++++--- libs/proxy/tokio-postgres2/src/tls.rs | 1 + libs/proxy/tokio-postgres2/src/transaction.rs | 3 +- proxy/Cargo.toml | 2 +- proxy/src/auth/backend/console_redirect.rs | 4 +- proxy/src/auth/backend/jwt.rs | 6 +-- proxy/src/auth/backend/local.rs | 2 +- proxy/src/auth/backend/mod.rs | 8 ++-- proxy/src/auth/credentials.rs | 7 +++- proxy/src/auth/mod.rs | 6 +-- proxy/src/binary/local_proxy.rs | 4 +- proxy/src/binary/pg_sni_router.rs | 8 ++-- proxy/src/binary/proxy.rs | 14 ++++--- proxy/src/cache/project_info.rs | 4 +- proxy/src/cache/timed_lru.rs | 4 +- proxy/src/cancellation.rs | 4 +- proxy/src/config.rs | 11 +++-- proxy/src/console_redirect_proxy.rs | 40 +++++++++++++----- 
proxy/src/context/mod.rs | 2 +- proxy/src/context/parquet.rs | 28 ++++++------- .../control_plane/client/cplane_proxy_v1.rs | 6 +-- proxy/src/control_plane/client/mock.rs | 6 +-- proxy/src/control_plane/client/mod.rs | 6 +-- proxy/src/control_plane/errors.rs | 2 +- proxy/src/control_plane/mgmt.rs | 2 +- proxy/src/control_plane/mod.rs | 2 +- proxy/src/http/health_server.rs | 2 +- proxy/src/http/mod.rs | 2 +- proxy/src/logging.rs | 2 +- proxy/src/metrics.rs | 6 +-- proxy/src/protocol2.rs | 2 +- proxy/src/proxy/connect_compute.rs | 4 +- proxy/src/proxy/copy_bidirectional.rs | 2 +- proxy/src/proxy/mod.rs | 12 +++--- proxy/src/proxy/tests/mod.rs | 6 +-- proxy/src/proxy/wake_compute.rs | 2 +- proxy/src/rate_limiter/leaky_bucket.rs | 2 +- proxy/src/rate_limiter/limit_algorithm.rs | 2 +- proxy/src/rate_limiter/limiter.rs | 2 +- proxy/src/redis/elasticache.rs | 2 +- proxy/src/redis/keys.rs | 2 +- proxy/src/sasl/stream.rs | 2 +- proxy/src/scram/countmin.rs | 2 +- proxy/src/scram/exchange.rs | 4 +- proxy/src/scram/messages.rs | 2 +- proxy/src/scram/mod.rs | 2 +- proxy/src/scram/signature.rs | 2 +- proxy/src/serverless/backend.rs | 10 ++--- proxy/src/serverless/cancel_set.rs | 6 +-- proxy/src/serverless/conn_pool.rs | 8 ++-- proxy/src/serverless/conn_pool_lib.rs | 2 +- proxy/src/serverless/http_conn_pool.rs | 2 +- proxy/src/serverless/json.rs | 2 +- proxy/src/serverless/local_conn_pool.rs | 15 ++++--- proxy/src/serverless/mod.rs | 12 +++--- proxy/src/serverless/sql_over_http.rs | 18 ++++---- proxy/src/serverless/websocket.rs | 12 +++--- proxy/src/signals.rs | 2 +- proxy/src/tls/postgres_rustls.rs | 4 +- proxy/src/tls/server_config.rs | 4 +- proxy/src/usage_metrics.rs | 14 +++---- 91 files changed, 374 insertions(+), 340 deletions(-) diff --git a/libs/proxy/postgres-protocol2/Cargo.toml b/libs/proxy/postgres-protocol2/Cargo.toml index f66a292d5e..7ebb05eec1 100644 --- a/libs/proxy/postgres-protocol2/Cargo.toml +++ b/libs/proxy/postgres-protocol2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "postgres-protocol2" version = "0.1.0" -edition = "2021" +edition = "2024" license = "MIT/Apache-2.0" [dependencies] diff --git a/libs/proxy/postgres-protocol2/src/authentication/sasl.rs b/libs/proxy/postgres-protocol2/src/authentication/sasl.rs index f2200a40ce..27e05e24ec 100644 --- a/libs/proxy/postgres-protocol2/src/authentication/sasl.rs +++ b/libs/proxy/postgres-protocol2/src/authentication/sasl.rs @@ -1,14 +1,12 @@ //! SASL-based authentication support. 
+use std::fmt::Write; +use std::{io, iter, mem, str}; + use hmac::{Hmac, Mac}; use rand::{self, Rng}; use sha2::digest::FixedOutput; use sha2::{Digest, Sha256}; -use std::fmt::Write; -use std::io; -use std::iter; -use std::mem; -use std::str; use tokio::task::yield_now; const NONCE_LENGTH: usize = 24; @@ -493,11 +491,9 @@ mod test { let nonce = "9IZ2O01zb9IgiIZ1WJ/zgpJB"; let client_first = "n,,n=,r=9IZ2O01zb9IgiIZ1WJ/zgpJB"; - let server_first = - "r=9IZ2O01zb9IgiIZ1WJ/zgpJBjx/oIRLs02gGSHcw1KEty3eY,s=fs3IXBy7U7+IvVjZ,i\ + let server_first = "r=9IZ2O01zb9IgiIZ1WJ/zgpJBjx/oIRLs02gGSHcw1KEty3eY,s=fs3IXBy7U7+IvVjZ,i\ =4096"; - let client_final = - "c=biws,r=9IZ2O01zb9IgiIZ1WJ/zgpJBjx/oIRLs02gGSHcw1KEty3eY,p=AmNKosjJzS3\ + let client_final = "c=biws,r=9IZ2O01zb9IgiIZ1WJ/zgpJBjx/oIRLs02gGSHcw1KEty3eY,p=AmNKosjJzS3\ 1NTlQYNs5BTeQjdHdk7lOflDo5re2an8="; let server_final = "v=U+ppxD5XUKtradnv8e2MkeupiA8FU87Sg8CXzXHDAzw="; diff --git a/libs/proxy/postgres-protocol2/src/lib.rs b/libs/proxy/postgres-protocol2/src/lib.rs index 6032440f9a..afbd1e92bd 100644 --- a/libs/proxy/postgres-protocol2/src/lib.rs +++ b/libs/proxy/postgres-protocol2/src/lib.rs @@ -11,9 +11,10 @@ //! set to `UTF8`. It will most likely not behave properly if that is not the case. #![warn(missing_docs, clippy::all)] +use std::io; + use byteorder::{BigEndian, ByteOrder}; use bytes::{BufMut, BytesMut}; -use std::io; pub mod authentication; pub mod escape; diff --git a/libs/proxy/postgres-protocol2/src/message/backend.rs b/libs/proxy/postgres-protocol2/src/message/backend.rs index 097964f9c1..d7eaef9509 100644 --- a/libs/proxy/postgres-protocol2/src/message/backend.rs +++ b/libs/proxy/postgres-protocol2/src/message/backend.rs @@ -1,13 +1,13 @@ #![allow(missing_docs)] +use std::io::{self, Read}; +use std::ops::Range; +use std::{cmp, str}; + use byteorder::{BigEndian, ByteOrder, ReadBytesExt}; use bytes::{Bytes, BytesMut}; use fallible_iterator::FallibleIterator; use memchr::memchr; -use std::cmp; -use std::io::{self, Read}; -use std::ops::Range; -use std::str; use crate::Oid; diff --git a/libs/proxy/postgres-protocol2/src/message/frontend.rs b/libs/proxy/postgres-protocol2/src/message/frontend.rs index 640f35ada3..b447290ea8 100644 --- a/libs/proxy/postgres-protocol2/src/message/frontend.rs +++ b/libs/proxy/postgres-protocol2/src/message/frontend.rs @@ -1,13 +1,13 @@ //! Frontend message serialization. #![allow(missing_docs)] +use std::error::Error; +use std::{io, marker}; + use byteorder::{BigEndian, ByteOrder}; use bytes::{Buf, BufMut, BytesMut}; -use std::error::Error; -use std::io; -use std::marker; -use crate::{write_nullable, FromUsize, IsNull, Oid}; +use crate::{FromUsize, IsNull, Oid, write_nullable}; #[inline] fn write_body(buf: &mut BytesMut, f: F) -> Result<(), E> diff --git a/libs/proxy/postgres-protocol2/src/password/mod.rs b/libs/proxy/postgres-protocol2/src/password/mod.rs index 38eb31dfcf..4cd9bfb060 100644 --- a/libs/proxy/postgres-protocol2/src/password/mod.rs +++ b/libs/proxy/postgres-protocol2/src/password/mod.rs @@ -6,12 +6,13 @@ //! side. This is good because it ensures the cleartext password won't //! end up in logs pg_stat displays, etc. 
-use crate::authentication::sasl; use hmac::{Hmac, Mac}; use rand::RngCore; use sha2::digest::FixedOutput; use sha2::{Digest, Sha256}; +use crate::authentication::sasl; + #[cfg(test)] mod test; diff --git a/libs/proxy/postgres-protocol2/src/types/mod.rs b/libs/proxy/postgres-protocol2/src/types/mod.rs index 78131c05bf..6a9b334bcb 100644 --- a/libs/proxy/postgres-protocol2/src/types/mod.rs +++ b/libs/proxy/postgres-protocol2/src/types/mod.rs @@ -1,11 +1,12 @@ //! Conversions to and from Postgres's binary format for various types. -use byteorder::{BigEndian, ReadBytesExt}; -use bytes::{BufMut, BytesMut}; -use fallible_iterator::FallibleIterator; use std::boxed::Box as StdBox; use std::error::Error; use std::str; +use byteorder::{BigEndian, ReadBytesExt}; +use bytes::{BufMut, BytesMut}; +use fallible_iterator::FallibleIterator; + use crate::Oid; #[cfg(test)] diff --git a/libs/proxy/postgres-types2/Cargo.toml b/libs/proxy/postgres-types2/Cargo.toml index 57efd94cd3..25ad23ba35 100644 --- a/libs/proxy/postgres-types2/Cargo.toml +++ b/libs/proxy/postgres-types2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "postgres-types2" version = "0.1.0" -edition = "2021" +edition = "2024" license = "MIT/Apache-2.0" [dependencies] diff --git a/libs/proxy/postgres-types2/src/lib.rs b/libs/proxy/postgres-types2/src/lib.rs index d4f3afdfd4..0ccd8c295f 100644 --- a/libs/proxy/postgres-types2/src/lib.rs +++ b/libs/proxy/postgres-types2/src/lib.rs @@ -4,19 +4,18 @@ //! unless you want to define your own `ToSql` or `FromSql` definitions. #![warn(clippy::all, missing_docs)] -use fallible_iterator::FallibleIterator; -use postgres_protocol2::types; use std::any::type_name; use std::error::Error; use std::fmt; use std::sync::Arc; -use crate::type_gen::{Inner, Other}; - +use bytes::BytesMut; +use fallible_iterator::FallibleIterator; #[doc(inline)] pub use postgres_protocol2::Oid; +use postgres_protocol2::types; -use bytes::BytesMut; +use crate::type_gen::{Inner, Other}; /// Generates a simple implementation of `ToSql::accepts` which accepts the /// types passed to it. 
diff --git a/libs/proxy/postgres-types2/src/private.rs b/libs/proxy/postgres-types2/src/private.rs index 774f9a301c..188b982812 100644 --- a/libs/proxy/postgres-types2/src/private.rs +++ b/libs/proxy/postgres-types2/src/private.rs @@ -1,7 +1,9 @@ -use crate::{FromSql, Type}; -pub use bytes::BytesMut; use std::error::Error; +pub use bytes::BytesMut; + +use crate::{FromSql, Type}; + pub fn read_be_i32(buf: &mut &[u8]) -> Result> { if buf.len() < 4 { return Err("invalid buffer size".into()); diff --git a/libs/proxy/tokio-postgres2/Cargo.toml b/libs/proxy/tokio-postgres2/Cargo.toml index 161c6b8309..540876742f 100644 --- a/libs/proxy/tokio-postgres2/Cargo.toml +++ b/libs/proxy/tokio-postgres2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "tokio-postgres2" version = "0.1.0" -edition = "2021" +edition = "2024" license = "MIT/Apache-2.0" [dependencies] diff --git a/libs/proxy/tokio-postgres2/src/cancel_query.rs b/libs/proxy/tokio-postgres2/src/cancel_query.rs index cddbf16336..b65fb571e6 100644 --- a/libs/proxy/tokio-postgres2/src/cancel_query.rs +++ b/libs/proxy/tokio-postgres2/src/cancel_query.rs @@ -1,10 +1,11 @@ +use std::io; + use tokio::net::TcpStream; use crate::client::SocketConfig; use crate::config::{Host, SslMode}; use crate::tls::MakeTlsConnect; -use crate::{cancel_query_raw, connect_socket, Error}; -use std::io; +use crate::{Error, cancel_query_raw, connect_socket}; pub(crate) async fn cancel_query( config: Option, @@ -22,7 +23,7 @@ where return Err(Error::connect(io::Error::new( io::ErrorKind::InvalidInput, "unknown host", - ))) + ))); } }; diff --git a/libs/proxy/tokio-postgres2/src/cancel_query_raw.rs b/libs/proxy/tokio-postgres2/src/cancel_query_raw.rs index 8c08296435..c720214e9b 100644 --- a/libs/proxy/tokio-postgres2/src/cancel_query_raw.rs +++ b/libs/proxy/tokio-postgres2/src/cancel_query_raw.rs @@ -1,10 +1,11 @@ -use crate::config::SslMode; -use crate::tls::TlsConnect; -use crate::{connect_tls, Error}; use bytes::BytesMut; use postgres_protocol2::message::frontend; use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; +use crate::config::SslMode; +use crate::tls::TlsConnect; +use crate::{Error, connect_tls}; + pub async fn cancel_query_raw( stream: S, mode: SslMode, diff --git a/libs/proxy/tokio-postgres2/src/cancel_token.rs b/libs/proxy/tokio-postgres2/src/cancel_token.rs index 718f903a92..f6526395ee 100644 --- a/libs/proxy/tokio-postgres2/src/cancel_token.rs +++ b/libs/proxy/tokio-postgres2/src/cancel_token.rs @@ -1,12 +1,12 @@ -use crate::config::SslMode; -use crate::tls::TlsConnect; - -use crate::{cancel_query, client::SocketConfig, tls::MakeTlsConnect}; -use crate::{cancel_query_raw, Error}; use serde::{Deserialize, Serialize}; use tokio::io::{AsyncRead, AsyncWrite}; use tokio::net::TcpStream; +use crate::client::SocketConfig; +use crate::config::SslMode; +use crate::tls::{MakeTlsConnect, TlsConnect}; +use crate::{Error, cancel_query, cancel_query_raw}; + /// The capability to request cancellation of in-progress queries on a /// connection. 
#[derive(Clone, Serialize, Deserialize)] diff --git a/libs/proxy/tokio-postgres2/src/client.rs b/libs/proxy/tokio-postgres2/src/client.rs index 46151ab924..39b1db75da 100644 --- a/libs/proxy/tokio-postgres2/src/client.rs +++ b/libs/proxy/tokio-postgres2/src/client.rs @@ -1,31 +1,28 @@ -use crate::codec::{BackendMessages, FrontendMessage}; - -use crate::config::Host; -use crate::config::SslMode; -use crate::connection::{Request, RequestMessages}; - -use crate::query::RowStream; -use crate::simple_query::SimpleQueryStream; - -use crate::types::{Oid, ToSql, Type}; - -use crate::{ - query, simple_query, slice_iter, CancelToken, Error, ReadyForQueryStatus, Row, - SimpleQueryMessage, Statement, Transaction, TransactionBuilder, -}; -use bytes::BytesMut; -use fallible_iterator::FallibleIterator; -use futures_util::{future, ready, TryStreamExt}; -use parking_lot::Mutex; -use postgres_protocol2::message::{backend::Message, frontend}; -use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::fmt; use std::sync::Arc; use std::task::{Context, Poll}; +use std::time::Duration; + +use bytes::BytesMut; +use fallible_iterator::FallibleIterator; +use futures_util::{TryStreamExt, future, ready}; +use parking_lot::Mutex; +use postgres_protocol2::message::backend::Message; +use postgres_protocol2::message::frontend; +use serde::{Deserialize, Serialize}; use tokio::sync::mpsc; -use std::time::Duration; +use crate::codec::{BackendMessages, FrontendMessage}; +use crate::config::{Host, SslMode}; +use crate::connection::{Request, RequestMessages}; +use crate::query::RowStream; +use crate::simple_query::SimpleQueryStream; +use crate::types::{Oid, ToSql, Type}; +use crate::{ + CancelToken, Error, ReadyForQueryStatus, Row, SimpleQueryMessage, Statement, Transaction, + TransactionBuilder, query, simple_query, slice_iter, +}; pub struct Responses { receiver: mpsc::Receiver, diff --git a/libs/proxy/tokio-postgres2/src/codec.rs b/libs/proxy/tokio-postgres2/src/codec.rs index 0ec46198ce..f1fd9b47b3 100644 --- a/libs/proxy/tokio-postgres2/src/codec.rs +++ b/libs/proxy/tokio-postgres2/src/codec.rs @@ -1,8 +1,9 @@ +use std::io; + use bytes::{Buf, Bytes, BytesMut}; use fallible_iterator::FallibleIterator; use postgres_protocol2::message::backend; use postgres_protocol2::message::frontend::CopyData; -use std::io; use tokio_util::codec::{Decoder, Encoder}; pub enum FrontendMessage { diff --git a/libs/proxy/tokio-postgres2/src/config.rs b/libs/proxy/tokio-postgres2/src/config.rs index 47cc45ac80..4c25491b67 100644 --- a/libs/proxy/tokio-postgres2/src/config.rs +++ b/libs/proxy/tokio-postgres2/src/config.rs @@ -1,21 +1,19 @@ //! Connection configuration. -use crate::connect::connect; -use crate::connect_raw::connect_raw; -use crate::connect_raw::RawConnection; -use crate::tls::MakeTlsConnect; -use crate::tls::TlsConnect; -use crate::{Client, Connection, Error}; -use postgres_protocol2::message::frontend::StartupMessageParams; -use serde::{Deserialize, Serialize}; -use std::fmt; -use std::str; use std::time::Duration; -use tokio::io::{AsyncRead, AsyncWrite}; +use std::{fmt, str}; pub use postgres_protocol2::authentication::sasl::ScramKeys; +use postgres_protocol2::message::frontend::StartupMessageParams; +use serde::{Deserialize, Serialize}; +use tokio::io::{AsyncRead, AsyncWrite}; use tokio::net::TcpStream; +use crate::connect::connect; +use crate::connect_raw::{RawConnection, connect_raw}; +use crate::tls::{MakeTlsConnect, TlsConnect}; +use crate::{Client, Connection, Error}; + /// TLS configuration. 
#[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)] #[non_exhaustive] diff --git a/libs/proxy/tokio-postgres2/src/connect.rs b/libs/proxy/tokio-postgres2/src/connect.rs index e0cb69748d..d2bd0dfbcd 100644 --- a/libs/proxy/tokio-postgres2/src/connect.rs +++ b/libs/proxy/tokio-postgres2/src/connect.rs @@ -1,3 +1,7 @@ +use postgres_protocol2::message::backend::Message; +use tokio::net::TcpStream; +use tokio::sync::mpsc; + use crate::client::SocketConfig; use crate::codec::BackendMessage; use crate::config::Host; @@ -5,9 +9,6 @@ use crate::connect_raw::connect_raw; use crate::connect_socket::connect_socket; use crate::tls::{MakeTlsConnect, TlsConnect}; use crate::{Client, Config, Connection, Error, RawConnection}; -use postgres_protocol2::message::backend::Message; -use tokio::net::TcpStream; -use tokio::sync::mpsc; pub async fn connect( mut tls: T, diff --git a/libs/proxy/tokio-postgres2/src/connect_raw.rs b/libs/proxy/tokio-postgres2/src/connect_raw.rs index 66db85e07d..20dc538cf2 100644 --- a/libs/proxy/tokio-postgres2/src/connect_raw.rs +++ b/libs/proxy/tokio-postgres2/src/connect_raw.rs @@ -1,22 +1,24 @@ +use std::collections::HashMap; +use std::io; +use std::pin::Pin; +use std::task::{Context, Poll}; + +use bytes::BytesMut; +use fallible_iterator::FallibleIterator; +use futures_util::{Sink, SinkExt, Stream, TryStreamExt, ready}; +use postgres_protocol2::authentication::sasl; +use postgres_protocol2::authentication::sasl::ScramSha256; +use postgres_protocol2::message::backend::{AuthenticationSaslBody, Message, NoticeResponseBody}; +use postgres_protocol2::message::frontend; +use tokio::io::{AsyncRead, AsyncWrite}; +use tokio_util::codec::Framed; + +use crate::Error; use crate::codec::{BackendMessage, BackendMessages, FrontendMessage, PostgresCodec}; use crate::config::{self, AuthKeys, Config}; use crate::connect_tls::connect_tls; use crate::maybe_tls_stream::MaybeTlsStream; use crate::tls::{TlsConnect, TlsStream}; -use crate::Error; -use bytes::BytesMut; -use fallible_iterator::FallibleIterator; -use futures_util::{ready, Sink, SinkExt, Stream, TryStreamExt}; -use postgres_protocol2::authentication::sasl; -use postgres_protocol2::authentication::sasl::ScramSha256; -use postgres_protocol2::message::backend::{AuthenticationSaslBody, Message, NoticeResponseBody}; -use postgres_protocol2::message::frontend; -use std::collections::HashMap; -use std::io; -use std::pin::Pin; -use std::task::{Context, Poll}; -use tokio::io::{AsyncRead, AsyncWrite}; -use tokio_util::codec::Framed; pub struct StartupStream { inner: Framed, PostgresCodec>, @@ -158,7 +160,7 @@ where | Some(Message::AuthenticationSspi) => { return Err(Error::authentication( "unsupported authentication method".into(), - )) + )); } Some(Message::ErrorResponse(body)) => return Err(Error::db(body)), Some(_) => return Err(Error::unexpected_message()), diff --git a/libs/proxy/tokio-postgres2/src/connect_socket.rs b/libs/proxy/tokio-postgres2/src/connect_socket.rs index 336a13317f..15411f7ef3 100644 --- a/libs/proxy/tokio-postgres2/src/connect_socket.rs +++ b/libs/proxy/tokio-postgres2/src/connect_socket.rs @@ -1,11 +1,13 @@ -use crate::config::Host; -use crate::Error; use std::future::Future; use std::io; use std::time::Duration; + use tokio::net::{self, TcpStream}; use tokio::time; +use crate::Error; +use crate::config::Host; + pub(crate) async fn connect_socket( host: &Host, port: u16, diff --git a/libs/proxy/tokio-postgres2/src/connect_tls.rs b/libs/proxy/tokio-postgres2/src/connect_tls.rs index 64b0b68abc..4dc929a9e2 
100644 --- a/libs/proxy/tokio-postgres2/src/connect_tls.rs +++ b/libs/proxy/tokio-postgres2/src/connect_tls.rs @@ -1,12 +1,13 @@ -use crate::config::SslMode; -use crate::maybe_tls_stream::MaybeTlsStream; -use crate::tls::private::ForcePrivateApi; -use crate::tls::TlsConnect; -use crate::Error; use bytes::BytesMut; use postgres_protocol2::message::frontend; use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt}; +use crate::Error; +use crate::config::SslMode; +use crate::maybe_tls_stream::MaybeTlsStream; +use crate::tls::TlsConnect; +use crate::tls::private::ForcePrivateApi; + pub async fn connect_tls( mut stream: S, mode: SslMode, @@ -19,7 +20,7 @@ where match mode { SslMode::Disable => return Ok(MaybeTlsStream::Raw(stream)), SslMode::Prefer if !tls.can_connect(ForcePrivateApi) => { - return Ok(MaybeTlsStream::Raw(stream)) + return Ok(MaybeTlsStream::Raw(stream)); } SslMode::Prefer | SslMode::Require => {} } diff --git a/libs/proxy/tokio-postgres2/src/connection.rs b/libs/proxy/tokio-postgres2/src/connection.rs index f478717e0d..60e39b3b44 100644 --- a/libs/proxy/tokio-postgres2/src/connection.rs +++ b/libs/proxy/tokio-postgres2/src/connection.rs @@ -1,22 +1,24 @@ -use crate::codec::{BackendMessage, BackendMessages, FrontendMessage, PostgresCodec}; -use crate::error::DbError; -use crate::maybe_tls_stream::MaybeTlsStream; -use crate::{AsyncMessage, Error, Notification}; -use bytes::BytesMut; -use fallible_iterator::FallibleIterator; -use futures_util::{ready, Sink, Stream}; -use log::{info, trace}; -use postgres_protocol2::message::backend::Message; -use postgres_protocol2::message::frontend; use std::collections::{HashMap, VecDeque}; use std::future::Future; use std::pin::Pin; use std::task::{Context, Poll}; + +use bytes::BytesMut; +use fallible_iterator::FallibleIterator; +use futures_util::{Sink, Stream, ready}; +use log::{info, trace}; +use postgres_protocol2::message::backend::Message; +use postgres_protocol2::message::frontend; use tokio::io::{AsyncRead, AsyncWrite}; use tokio::sync::mpsc; use tokio_util::codec::Framed; use tokio_util::sync::PollSender; +use crate::codec::{BackendMessage, BackendMessages, FrontendMessage, PostgresCodec}; +use crate::error::DbError; +use crate::maybe_tls_stream::MaybeTlsStream; +use crate::{AsyncMessage, Error, Notification}; + pub enum RequestMessages { Single(FrontendMessage), } @@ -139,7 +141,7 @@ where Some(response) => response, None => match messages.next().map_err(Error::parse)? { Some(Message::ErrorResponse(error)) => { - return Poll::Ready(Err(Error::db(error))) + return Poll::Ready(Err(Error::db(error))); } _ => return Poll::Ready(Err(Error::unexpected_message())), }, diff --git a/libs/proxy/tokio-postgres2/src/error/mod.rs b/libs/proxy/tokio-postgres2/src/error/mod.rs index 922c348525..b12e76e5bf 100644 --- a/libs/proxy/tokio-postgres2/src/error/mod.rs +++ b/libs/proxy/tokio-postgres2/src/error/mod.rs @@ -1,10 +1,10 @@ //! Errors. 
+use std::error::{self, Error as _Error}; +use std::{fmt, io}; + use fallible_iterator::FallibleIterator; use postgres_protocol2::message::backend::{ErrorFields, ErrorResponseBody}; -use std::error::{self, Error as _Error}; -use std::fmt; -use std::io; pub use self::sqlstate::*; diff --git a/libs/proxy/tokio-postgres2/src/generic_client.rs b/libs/proxy/tokio-postgres2/src/generic_client.rs index 042b5a675e..31c3d8fa3e 100644 --- a/libs/proxy/tokio-postgres2/src/generic_client.rs +++ b/libs/proxy/tokio-postgres2/src/generic_client.rs @@ -1,9 +1,10 @@ #![allow(async_fn_in_trait)] +use postgres_protocol2::Oid; + use crate::query::RowStream; use crate::types::Type; use crate::{Client, Error, Transaction}; -use postgres_protocol2::Oid; mod private { pub trait Sealed {} diff --git a/libs/proxy/tokio-postgres2/src/lib.rs b/libs/proxy/tokio-postgres2/src/lib.rs index 7426279167..c8ebba5487 100644 --- a/libs/proxy/tokio-postgres2/src/lib.rs +++ b/libs/proxy/tokio-postgres2/src/lib.rs @@ -1,6 +1,8 @@ //! An asynchronous, pipelined, PostgreSQL client. #![warn(clippy::all)] +use postgres_protocol2::message::backend::ReadyForQueryBody; + pub use crate::cancel_token::CancelToken; pub use crate::client::{Client, SocketConfig}; pub use crate::config::Config; @@ -17,7 +19,6 @@ pub use crate::tls::NoTls; pub use crate::transaction::Transaction; pub use crate::transaction_builder::{IsolationLevel, TransactionBuilder}; use crate::types::ToSql; -use postgres_protocol2::message::backend::ReadyForQueryBody; /// After executing a query, the connection will be in one of these states #[derive(Clone, Copy, Debug, PartialEq)] diff --git a/libs/proxy/tokio-postgres2/src/maybe_tls_stream.rs b/libs/proxy/tokio-postgres2/src/maybe_tls_stream.rs index 9a7e248997..4aa838613e 100644 --- a/libs/proxy/tokio-postgres2/src/maybe_tls_stream.rs +++ b/libs/proxy/tokio-postgres2/src/maybe_tls_stream.rs @@ -1,12 +1,14 @@ //! MaybeTlsStream. //! //! Represents a stream that may or may not be encrypted with TLS. -use crate::tls::{ChannelBinding, TlsStream}; use std::io; use std::pin::Pin; use std::task::{Context, Poll}; + use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; +use crate::tls::{ChannelBinding, TlsStream}; + /// A stream that may or may not be encrypted with TLS. pub enum MaybeTlsStream { /// An unencrypted stream. 
diff --git a/libs/proxy/tokio-postgres2/src/prepare.rs b/libs/proxy/tokio-postgres2/src/prepare.rs index 58bbb26cbc..b36d2e5f74 100644 --- a/libs/proxy/tokio-postgres2/src/prepare.rs +++ b/libs/proxy/tokio-postgres2/src/prepare.rs @@ -1,18 +1,19 @@ +use std::future::Future; +use std::pin::Pin; +use std::sync::Arc; + +use bytes::Bytes; +use fallible_iterator::FallibleIterator; +use futures_util::{TryStreamExt, pin_mut}; +use log::debug; +use postgres_protocol2::message::backend::Message; +use postgres_protocol2::message::frontend; + use crate::client::InnerClient; use crate::codec::FrontendMessage; use crate::connection::RequestMessages; use crate::types::{Field, Kind, Oid, Type}; -use crate::{query, slice_iter}; -use crate::{Column, Error, Statement}; -use bytes::Bytes; -use fallible_iterator::FallibleIterator; -use futures_util::{pin_mut, TryStreamExt}; -use log::debug; -use postgres_protocol2::message::backend::Message; -use postgres_protocol2::message::frontend; -use std::future::Future; -use std::pin::Pin; -use std::sync::Arc; +use crate::{Column, Error, Statement, query, slice_iter}; pub(crate) const TYPEINFO_QUERY: &str = "\ SELECT t.typname, t.typtype, t.typelem, r.rngsubtype, t.typbasetype, n.nspname, t.typrelid diff --git a/libs/proxy/tokio-postgres2/src/query.rs b/libs/proxy/tokio-postgres2/src/query.rs index e21631c85d..29f05fba79 100644 --- a/libs/proxy/tokio-postgres2/src/query.rs +++ b/libs/proxy/tokio-postgres2/src/query.rs @@ -1,22 +1,24 @@ -use crate::client::{InnerClient, Responses}; -use crate::codec::FrontendMessage; -use crate::connection::RequestMessages; -use crate::types::IsNull; -use crate::{Column, Error, ReadyForQueryStatus, Row, Statement}; -use bytes::{BufMut, Bytes, BytesMut}; -use fallible_iterator::FallibleIterator; -use futures_util::{ready, Stream}; -use log::{debug, log_enabled, Level}; -use pin_project_lite::pin_project; -use postgres_protocol2::message::backend::Message; -use postgres_protocol2::message::frontend; -use postgres_types2::{Format, ToSql, Type}; use std::fmt; use std::marker::PhantomPinned; use std::pin::Pin; use std::sync::Arc; use std::task::{Context, Poll}; +use bytes::{BufMut, Bytes, BytesMut}; +use fallible_iterator::FallibleIterator; +use futures_util::{Stream, ready}; +use log::{Level, debug, log_enabled}; +use pin_project_lite::pin_project; +use postgres_protocol2::message::backend::Message; +use postgres_protocol2::message::frontend; +use postgres_types2::{Format, ToSql, Type}; + +use crate::client::{InnerClient, Responses}; +use crate::codec::FrontendMessage; +use crate::connection::RequestMessages; +use crate::types::IsNull; +use crate::{Column, Error, ReadyForQueryStatus, Row, Statement}; + struct BorrowToSqlParamsDebug<'a>(&'a [&'a (dyn ToSql + Sync)]); impl fmt::Debug for BorrowToSqlParamsDebug<'_> { @@ -257,7 +259,7 @@ impl Stream for RowStream { this.statement.clone(), body, *this.output_format, - )?))) + )?))); } Message::EmptyQueryResponse | Message::PortalSuspended => {} Message::CommandComplete(body) => { diff --git a/libs/proxy/tokio-postgres2/src/row.rs b/libs/proxy/tokio-postgres2/src/row.rs index 10e130707d..5fc955eef4 100644 --- a/libs/proxy/tokio-postgres2/src/row.rs +++ b/libs/proxy/tokio-postgres2/src/row.rs @@ -1,17 +1,18 @@ //! Rows. 
+use std::ops::Range; +use std::sync::Arc; +use std::{fmt, str}; + +use fallible_iterator::FallibleIterator; +use postgres_protocol2::message::backend::DataRowBody; +use postgres_types2::{Format, WrongFormat}; + use crate::row::sealed::{AsName, Sealed}; use crate::simple_query::SimpleColumn; use crate::statement::Column; use crate::types::{FromSql, Type, WrongType}; use crate::{Error, Statement}; -use fallible_iterator::FallibleIterator; -use postgres_protocol2::message::backend::DataRowBody; -use postgres_types2::{Format, WrongFormat}; -use std::fmt; -use std::ops::Range; -use std::str; -use std::sync::Arc; mod sealed { pub trait Sealed {} diff --git a/libs/proxy/tokio-postgres2/src/simple_query.rs b/libs/proxy/tokio-postgres2/src/simple_query.rs index fb2550377b..f13d63983f 100644 --- a/libs/proxy/tokio-postgres2/src/simple_query.rs +++ b/libs/proxy/tokio-postgres2/src/simple_query.rs @@ -1,19 +1,21 @@ -use crate::client::{InnerClient, Responses}; -use crate::codec::FrontendMessage; -use crate::connection::RequestMessages; -use crate::{Error, ReadyForQueryStatus, SimpleQueryMessage, SimpleQueryRow}; -use bytes::Bytes; -use fallible_iterator::FallibleIterator; -use futures_util::{ready, Stream}; -use log::debug; -use pin_project_lite::pin_project; -use postgres_protocol2::message::backend::Message; -use postgres_protocol2::message::frontend; use std::marker::PhantomPinned; use std::pin::Pin; use std::sync::Arc; use std::task::{Context, Poll}; +use bytes::Bytes; +use fallible_iterator::FallibleIterator; +use futures_util::{Stream, ready}; +use log::debug; +use pin_project_lite::pin_project; +use postgres_protocol2::message::backend::Message; +use postgres_protocol2::message::frontend; + +use crate::client::{InnerClient, Responses}; +use crate::codec::FrontendMessage; +use crate::connection::RequestMessages; +use crate::{Error, ReadyForQueryStatus, SimpleQueryMessage, SimpleQueryRow}; + /// Information about a column of a single query row. 
#[derive(Debug)] pub struct SimpleColumn { diff --git a/libs/proxy/tokio-postgres2/src/statement.rs b/libs/proxy/tokio-postgres2/src/statement.rs index 591872fbc5..e4828db712 100644 --- a/libs/proxy/tokio-postgres2/src/statement.rs +++ b/libs/proxy/tokio-postgres2/src/statement.rs @@ -1,15 +1,14 @@ +use std::fmt; +use std::sync::{Arc, Weak}; + +use postgres_protocol2::Oid; +use postgres_protocol2::message::backend::Field; +use postgres_protocol2::message::frontend; + use crate::client::InnerClient; use crate::codec::FrontendMessage; use crate::connection::RequestMessages; use crate::types::Type; -use postgres_protocol2::{ - message::{backend::Field, frontend}, - Oid, -}; -use std::{ - fmt, - sync::{Arc, Weak}, -}; struct StatementInner { client: Weak, diff --git a/libs/proxy/tokio-postgres2/src/tls.rs b/libs/proxy/tokio-postgres2/src/tls.rs index dc8140719f..41b51368ff 100644 --- a/libs/proxy/tokio-postgres2/src/tls.rs +++ b/libs/proxy/tokio-postgres2/src/tls.rs @@ -5,6 +5,7 @@ use std::future::Future; use std::pin::Pin; use std::task::{Context, Poll}; use std::{fmt, io}; + use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; pub(crate) mod private { diff --git a/libs/proxy/tokio-postgres2/src/transaction.rs b/libs/proxy/tokio-postgres2/src/transaction.rs index 03a57e4947..eecbfc5873 100644 --- a/libs/proxy/tokio-postgres2/src/transaction.rs +++ b/libs/proxy/tokio-postgres2/src/transaction.rs @@ -1,8 +1,9 @@ +use postgres_protocol2::message::frontend; + use crate::codec::FrontendMessage; use crate::connection::RequestMessages; use crate::query::RowStream; use crate::{CancelToken, Client, Error, ReadyForQueryStatus}; -use postgres_protocol2::message::frontend; /// A representation of a PostgreSQL database transaction. /// diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 6a381bf094..5964b76ecf 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "proxy" version = "0.1.0" -edition.workspace = true +edition = "2024" license.workspace = true [features] diff --git a/proxy/src/auth/backend/console_redirect.rs b/proxy/src/auth/backend/console_redirect.rs index 7503b4eac9..dd48384c03 100644 --- a/proxy/src/auth/backend/console_redirect.rs +++ b/proxy/src/auth/backend/console_redirect.rs @@ -8,16 +8,16 @@ use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, info_span}; use super::ComputeCredentialKeys; -use crate::auth::backend::ComputeUserInfo; use crate::auth::IpPattern; +use crate::auth::backend::ComputeUserInfo; use crate::cache::Cached; use crate::config::AuthenticationConfig; use crate::context::RequestContext; use crate::control_plane::client::cplane_proxy_v1; use crate::control_plane::{self, CachedNodeInfo, NodeInfo}; use crate::error::{ReportableError, UserFacingError}; -use crate::proxy::connect_compute::ComputeConnectBackend; use crate::proxy::NeonOptions; +use crate::proxy::connect_compute::ComputeConnectBackend; use crate::stream::PqStream; use crate::types::RoleName; use crate::{auth, compute, waiters}; diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index 5d032c0deb..942f1e13d1 100644 --- a/proxy/src/auth/backend/jwt.rs +++ b/proxy/src/auth/backend/jwt.rs @@ -6,9 +6,9 @@ use std::time::{Duration, SystemTime}; use arc_swap::ArcSwapOption; use clashmap::ClashMap; use jose_jwk::crypto::KeyInfo; -use reqwest::{redirect, Client}; -use reqwest_retry::policies::ExponentialBackoff; +use reqwest::{Client, redirect}; use reqwest_retry::RetryTransientMiddleware; +use reqwest_retry::policies::ExponentialBackoff; use 
serde::de::Visitor; use serde::{Deserialize, Deserializer}; use serde_json::value::RawValue; @@ -498,8 +498,8 @@ fn verify_rsa_signature( alg: &jose_jwa::Algorithm, ) -> Result<(), JwtError> { use jose_jwa::{Algorithm, Signing}; - use rsa::pkcs1v15::{Signature, VerifyingKey}; use rsa::RsaPublicKey; + use rsa::pkcs1v15::{Signature, VerifyingKey}; let key = RsaPublicKey::try_from(key).map_err(JwtError::InvalidRsaKey)?; diff --git a/proxy/src/auth/backend/local.rs b/proxy/src/auth/backend/local.rs index d10f0e82b2..9c3a3772cd 100644 --- a/proxy/src/auth/backend/local.rs +++ b/proxy/src/auth/backend/local.rs @@ -8,8 +8,8 @@ use crate::auth::backend::jwt::FetchAuthRulesError; use crate::compute::ConnCfg; use crate::compute_ctl::ComputeCtlApi; use crate::context::RequestContext; -use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, MetricsAuxInfo}; use crate::control_plane::NodeInfo; +use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, MetricsAuxInfo}; use crate::http; use crate::intern::{BranchIdTag, EndpointIdTag, InternId, ProjectIdTag}; use crate::types::EndpointId; diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs index 8f1625278f..83feed5094 100644 --- a/proxy/src/auth/backend/mod.rs +++ b/proxy/src/auth/backend/mod.rs @@ -18,7 +18,7 @@ use tracing::{debug, info, warn}; use crate::auth::credentials::check_peer_addr_is_in_list; use crate::auth::{ - self, validate_password_and_exchange, AuthError, ComputeUserInfoMaybeEndpoint, IpPattern, + self, AuthError, ComputeUserInfoMaybeEndpoint, IpPattern, validate_password_and_exchange, }; use crate::cache::Cached; use crate::config::AuthenticationConfig; @@ -32,8 +32,8 @@ use crate::control_plane::{ use crate::intern::EndpointIdInt; use crate::metrics::Metrics; use crate::protocol2::ConnectionInfoExtra; -use crate::proxy::connect_compute::ComputeConnectBackend; use crate::proxy::NeonOptions; +use crate::proxy::connect_compute::ComputeConnectBackend; use crate::rate_limiter::{BucketRateLimiter, EndpointRateLimiter}; use crate::stream::Stream; use crate::types::{EndpointCacheKey, EndpointId, RoleName}; @@ -542,7 +542,7 @@ mod tests { use tokio::io::{AsyncRead, AsyncReadExt, AsyncWriteExt}; use super::jwt::JwkCache; - use super::{auth_quirks, AuthRateLimiter}; + use super::{AuthRateLimiter, auth_quirks}; use crate::auth::backend::MaskedIp; use crate::auth::{ComputeUserInfoMaybeEndpoint, IpPattern}; use crate::config::AuthenticationConfig; @@ -553,8 +553,8 @@ mod tests { }; use crate::proxy::NeonOptions; use crate::rate_limiter::{EndpointRateLimiter, RateBucketInfo}; - use crate::scram::threadpool::ThreadPool; use crate::scram::ServerSecret; + use crate::scram::threadpool::ThreadPool; use crate::stream::{PqStream, Stream}; struct Auth { diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index eff49a402a..c1b7718e4f 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -197,7 +197,10 @@ impl<'de> serde::de::Deserialize<'de> for IpPattern { type Value = IpPattern; fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(formatter, "comma separated list with ip address, ip address range, or ip address subnet mask") + write!( + formatter, + "comma separated list with ip address, ip address range, or ip address subnet mask" + ) } fn visit_str(self, v: &str) -> Result @@ -252,8 +255,8 @@ fn project_name_valid(name: &str) -> bool { #[cfg(test)] #[expect(clippy::unwrap_used)] mod tests { - use 
serde_json::json; use ComputeUserInfoParseError::*; + use serde_json::json; use super::*; diff --git a/proxy/src/auth/mod.rs b/proxy/src/auth/mod.rs index 6082695a6b..5670f8e43d 100644 --- a/proxy/src/auth/mod.rs +++ b/proxy/src/auth/mod.rs @@ -5,13 +5,13 @@ pub use backend::Backend; mod credentials; pub(crate) use credentials::{ - check_peer_addr_is_in_list, endpoint_sni, ComputeUserInfoMaybeEndpoint, - ComputeUserInfoParseError, IpPattern, + ComputeUserInfoMaybeEndpoint, ComputeUserInfoParseError, IpPattern, check_peer_addr_is_in_list, + endpoint_sni, }; mod password_hack; -pub(crate) use password_hack::parse_endpoint_param; use password_hack::PasswordHackPayload; +pub(crate) use password_hack::parse_endpoint_param; mod flow; use std::io; diff --git a/proxy/src/binary/local_proxy.rs b/proxy/src/binary/local_proxy.rs index 4ab11f828c..dedd225cba 100644 --- a/proxy/src/binary/local_proxy.rs +++ b/proxy/src/binary/local_proxy.rs @@ -4,7 +4,7 @@ use std::str::FromStr; use std::sync::Arc; use std::time::Duration; -use anyhow::{bail, ensure, Context}; +use anyhow::{Context, bail, ensure}; use camino::{Utf8Path, Utf8PathBuf}; use clap::Parser; use compute_api::spec::LocalProxySpec; @@ -19,7 +19,7 @@ use utils::sentry_init::init_sentry; use utils::{pid_file, project_build_tag, project_git_version}; use crate::auth::backend::jwt::JwkCache; -use crate::auth::backend::local::{LocalBackend, JWKS_ROLE_MAP}; +use crate::auth::backend::local::{JWKS_ROLE_MAP, LocalBackend}; use crate::auth::{self}; use crate::cancellation::CancellationHandler; use crate::config::{ diff --git a/proxy/src/binary/pg_sni_router.rs b/proxy/src/binary/pg_sni_router.rs index 94e771a61c..1aa290399c 100644 --- a/proxy/src/binary/pg_sni_router.rs +++ b/proxy/src/binary/pg_sni_router.rs @@ -5,24 +5,24 @@ /// the outside. Similar to an ingress controller for HTTPS. 
use std::{net::SocketAddr, sync::Arc}; -use anyhow::{anyhow, bail, ensure, Context}; +use anyhow::{Context, anyhow, bail, ensure}; use clap::Arg; -use futures::future::Either; use futures::TryFutureExt; +use futures::future::Either; use itertools::Itertools; use rustls::crypto::ring; use rustls::pki_types::PrivateKeyDer; use tokio::io::{AsyncRead, AsyncWrite}; use tokio::net::TcpListener; use tokio_util::sync::CancellationToken; -use tracing::{error, info, Instrument}; +use tracing::{Instrument, error, info}; use utils::project_git_version; use utils::sentry_init::init_sentry; use crate::context::RequestContext; use crate::metrics::{Metrics, ThreadPoolMetrics}; use crate::protocol2::ConnectionInfo; -use crate::proxy::{copy_bidirectional_client_compute, run_until_cancelled, ErrorSource}; +use crate::proxy::{ErrorSource, copy_bidirectional_client_compute, run_until_cancelled}; use crate::stream::{PqStream, Stream}; use crate::tls::TlsServerEndPoint; diff --git a/proxy/src/binary/proxy.rs b/proxy/src/binary/proxy.rs index b72799df54..eec0bf8f99 100644 --- a/proxy/src/binary/proxy.rs +++ b/proxy/src/binary/proxy.rs @@ -9,16 +9,16 @@ use remote_storage::RemoteStorageConfig; use tokio::net::TcpListener; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; -use tracing::{info, warn, Instrument}; +use tracing::{Instrument, info, warn}; use utils::sentry_init::init_sentry; use utils::{project_build_tag, project_git_version}; use crate::auth::backend::jwt::JwkCache; use crate::auth::backend::{AuthRateLimiter, ConsoleRedirectBackend, MaybeOwned}; -use crate::cancellation::{handle_cancel_messages, CancellationHandler}; +use crate::cancellation::{CancellationHandler, handle_cancel_messages}; use crate::config::{ - self, remote_storage_from_toml, AuthenticationConfig, CacheOptions, ComputeConfig, HttpConfig, - ProjectInfoCacheOptions, ProxyConfig, ProxyProtocolV2, + self, AuthenticationConfig, CacheOptions, ComputeConfig, HttpConfig, ProjectInfoCacheOptions, + ProxyConfig, ProxyProtocolV2, remote_storage_from_toml, }; use crate::context::parquet::ParquetUploadArgs; use crate::http::health_server::AppMetrics; @@ -30,8 +30,8 @@ use crate::redis::connection_with_credentials_provider::ConnectionWithCredential use crate::redis::kv_ops::RedisKVClient; use crate::redis::{elasticache, notifications}; use crate::scram::threadpool::ThreadPool; -use crate::serverless::cancel_set::CancelSet; use crate::serverless::GlobalConnPoolOptions; +use crate::serverless::cancel_set::CancelSet; use crate::tls::client_config::compute_client_config_with_root_certs; use crate::{auth, control_plane, http, serverless, usage_metrics}; @@ -331,7 +331,9 @@ pub async fn run() -> anyhow::Result<()> { ), ), (None, None) => { - warn!("irsa auth requires redis-host and redis-port to be set, continuing without regional_redis_client"); + warn!( + "irsa auth requires redis-host and redis-port to be set, continuing without regional_redis_client" + ); None } _ => { diff --git a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs index 7651eb71a2..e153e9f61f 100644 --- a/proxy/src/cache/project_info.rs +++ b/proxy/src/cache/project_info.rs @@ -1,12 +1,12 @@ use std::collections::HashSet; use std::convert::Infallible; -use std::sync::atomic::AtomicU64; use std::sync::Arc; +use std::sync::atomic::AtomicU64; use std::time::Duration; use async_trait::async_trait; use clashmap::ClashMap; -use rand::{thread_rng, Rng}; +use rand::{Rng, thread_rng}; use smol_str::SmolStr; use tokio::sync::Mutex; use tokio::time::Instant; diff 
--git a/proxy/src/cache/timed_lru.rs b/proxy/src/cache/timed_lru.rs index 06eaeb9a30..7cfe5100ea 100644 --- a/proxy/src/cache/timed_lru.rs +++ b/proxy/src/cache/timed_lru.rs @@ -11,11 +11,11 @@ use std::time::{Duration, Instant}; // This severely hinders its usage both in terms of creating wrappers and supported key types. // // On the other hand, `hashlink` has good download stats and appears to be maintained. -use hashlink::{linked_hash_map::RawEntryMut, LruCache}; +use hashlink::{LruCache, linked_hash_map::RawEntryMut}; use tracing::debug; use super::common::Cached; -use super::{timed_lru, Cache}; +use super::{Cache, timed_lru}; /// An implementation of timed LRU cache with fixed capacity. /// Key properties: diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index 422e6f741d..8263e5aa2a 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -3,8 +3,8 @@ use std::net::{IpAddr, SocketAddr}; use std::sync::Arc; use ipnet::{IpNet, Ipv4Net, Ipv6Net}; -use postgres_client::tls::MakeTlsConnect; use postgres_client::CancelToken; +use postgres_client::tls::MakeTlsConnect; use pq_proto::CancelKeyData; use serde::{Deserialize, Serialize}; use thiserror::Error; @@ -13,7 +13,7 @@ use tokio::sync::{mpsc, oneshot}; use tracing::{debug, info}; use crate::auth::backend::ComputeUserInfo; -use crate::auth::{check_peer_addr_is_in_list, AuthError}; +use crate::auth::{AuthError, check_peer_addr_is_in_list}; use crate::config::ComputeConfig; use crate::context::RequestContext; use crate::control_plane::ControlPlaneApi; diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 460e0cff54..1bcd22e98f 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -2,18 +2,18 @@ use std::str::FromStr; use std::sync::Arc; use std::time::Duration; -use anyhow::{bail, ensure, Context, Ok}; +use anyhow::{Context, Ok, bail, ensure}; use clap::ValueEnum; use remote_storage::RemoteStorageConfig; -use crate::auth::backend::jwt::JwkCache; use crate::auth::backend::AuthRateLimiter; +use crate::auth::backend::jwt::JwkCache; use crate::control_plane::locks::ApiLocks; use crate::rate_limiter::{RateBucketInfo, RateLimitAlgorithm, RateLimiterConfig}; use crate::scram::threadpool::ThreadPool; -use crate::serverless::cancel_set::CancelSet; use crate::serverless::GlobalConnPoolOptions; -pub use crate::tls::server_config::{configure_tls, TlsConfig}; +use crate::serverless::cancel_set::CancelSet; +pub use crate::tls::server_config::{TlsConfig, configure_tls}; use crate::types::Host; pub struct ProxyConfig { @@ -97,8 +97,7 @@ pub struct EndpointCacheConfig { impl EndpointCacheConfig { /// Default options for [`crate::control_plane::NodeInfoCache`]. /// Notice that by default the limiter is empty, which means that cache is disabled. - pub const CACHE_DEFAULT_OPTIONS: &'static str = - "initial_batch_size=1000,default_batch_size=10,xread_timeout=5m,stream_name=controlPlane,disable_cache=true,limiter_info=1000@1s,retry_interval=1s"; + pub const CACHE_DEFAULT_OPTIONS: &'static str = "initial_batch_size=1000,default_batch_size=10,xread_timeout=5m,stream_name=controlPlane,disable_cache=true,limiter_info=1000@1s,retry_interval=1s"; /// Parse cache options passed via cmdline. /// Example: [`Self::CACHE_DEFAULT_OPTIONS`]. 
diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index a2e7299d39..4662860b3f 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -3,7 +3,7 @@ use std::sync::Arc; use futures::{FutureExt, TryFutureExt}; use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; use tokio_util::sync::CancellationToken; -use tracing::{debug, error, info, Instrument}; +use tracing::{Instrument, debug, error, info}; use crate::auth::backend::ConsoleRedirectBackend; use crate::cancellation::CancellationHandler; @@ -11,12 +11,12 @@ use crate::config::{ProxyConfig, ProxyProtocolV2}; use crate::context::RequestContext; use crate::error::ReportableError; use crate::metrics::{Metrics, NumClientConnectionsGuard}; -use crate::protocol2::{read_proxy_protocol, ConnectHeader, ConnectionInfo}; -use crate::proxy::connect_compute::{connect_to_compute, TcpMechanism}; -use crate::proxy::handshake::{handshake, HandshakeData}; +use crate::protocol2::{ConnectHeader, ConnectionInfo, read_proxy_protocol}; +use crate::proxy::connect_compute::{TcpMechanism, connect_to_compute}; +use crate::proxy::handshake::{HandshakeData, handshake}; use crate::proxy::passthrough::ProxyPassthrough; use crate::proxy::{ - prepare_client_connection, run_until_cancelled, ClientRequestError, ErrorSource, + ClientRequestError, ErrorSource, prepare_client_connection, run_until_cancelled, }; pub async fn task_main( @@ -64,22 +64,34 @@ pub async fn task_main( debug!("healthcheck received"); return; } - Ok((_socket, ConnectHeader::Missing)) if config.proxy_protocol_v2 == ProxyProtocolV2::Required => { + Ok((_socket, ConnectHeader::Missing)) + if config.proxy_protocol_v2 == ProxyProtocolV2::Required => + { error!("missing required proxy protocol header"); return; } - Ok((_socket, ConnectHeader::Proxy(_))) if config.proxy_protocol_v2 == ProxyProtocolV2::Rejected => { + Ok((_socket, ConnectHeader::Proxy(_))) + if config.proxy_protocol_v2 == ProxyProtocolV2::Rejected => + { error!("proxy protocol header not supported"); return; } Ok((socket, ConnectHeader::Proxy(info))) => (socket, info), - Ok((socket, ConnectHeader::Missing)) => (socket, ConnectionInfo{ addr: peer_addr, extra: None }), + Ok((socket, ConnectHeader::Missing)) => ( + socket, + ConnectionInfo { + addr: peer_addr, + extra: None, + }, + ), }; match socket.inner.set_nodelay(true) { Ok(()) => {} Err(e) => { - error!("per-client task finished with an error: failed to set socket option: {e:#}"); + error!( + "per-client task finished with an error: failed to set socket option: {e:#}" + ); return; } } @@ -118,10 +130,16 @@ pub async fn task_main( match p.proxy_pass(&config.connect_to_compute).await { Ok(()) => {} Err(ErrorSource::Client(e)) => { - error!(?session_id, "per-client task finished with an IO error from the client: {e:#}"); + error!( + ?session_id, + "per-client task finished with an IO error from the client: {e:#}" + ); } Err(ErrorSource::Compute(e)) => { - error!(?session_id, "per-client task finished with an IO error from the compute: {e:#}"); + error!( + ?session_id, + "per-client task finished with an IO error from the compute: {e:#}" + ); } } } diff --git a/proxy/src/context/mod.rs b/proxy/src/context/mod.rs index 3236b2e1bf..74b48a1bea 100644 --- a/proxy/src/context/mod.rs +++ b/proxy/src/context/mod.rs @@ -8,7 +8,7 @@ use pq_proto::StartupMessageParams; use smol_str::SmolStr; use tokio::sync::mpsc; use tracing::field::display; -use tracing::{debug, error, info_span, Span}; +use tracing::{Span, debug, error, 
info_span}; use try_lock::TryLock; use uuid::Uuid; diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index 0537ae6a62..f029327266 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -8,7 +8,7 @@ use chrono::{Datelike, Timelike}; use futures::{Stream, StreamExt}; use parquet::basic::Compression; use parquet::file::metadata::RowGroupMetaDataPtr; -use parquet::file::properties::{WriterProperties, WriterPropertiesPtr, DEFAULT_PAGE_SIZE}; +use parquet::file::properties::{DEFAULT_PAGE_SIZE, WriterProperties, WriterPropertiesPtr}; use parquet::file::writer::SerializedFileWriter; use parquet::record::RecordWriter; use pq_proto::StartupMessageParams; @@ -17,10 +17,10 @@ use serde::ser::SerializeMap; use tokio::sync::mpsc; use tokio::time; use tokio_util::sync::CancellationToken; -use tracing::{debug, info, Span}; +use tracing::{Span, debug, info}; use utils::backoff; -use super::{RequestContextInner, LOG_CHAN}; +use super::{LOG_CHAN, RequestContextInner}; use crate::config::remote_storage_from_toml; use crate::context::LOG_CHAN_DISCONNECT; use crate::ext::TaskExt; @@ -425,20 +425,20 @@ mod tests { use futures::{Stream, StreamExt}; use itertools::Itertools; use parquet::basic::{Compression, ZstdLevel}; - use parquet::file::properties::{WriterProperties, DEFAULT_PAGE_SIZE}; + use parquet::file::properties::{DEFAULT_PAGE_SIZE, WriterProperties}; use parquet::file::reader::FileReader; use parquet::file::serialized_reader::SerializedFileReader; use rand::rngs::StdRng; use rand::{Rng, SeedableRng}; use remote_storage::{ - GenericRemoteStorage, RemoteStorageConfig, RemoteStorageKind, S3Config, DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT, + GenericRemoteStorage, RemoteStorageConfig, RemoteStorageKind, S3Config, }; use tokio::sync::mpsc; use tokio::time; use walkdir::WalkDir; - use super::{worker_inner, ParquetConfig, ParquetUploadArgs, RequestData}; + use super::{ParquetConfig, ParquetUploadArgs, RequestData, worker_inner}; #[derive(Parser)] struct ProxyCliArgs { @@ -514,26 +514,26 @@ mod tests { fn generate_request_data(rng: &mut impl Rng) -> RequestData { RequestData { - session_id: uuid::Builder::from_random_bytes(rng.gen()).into_uuid(), - peer_addr: Ipv4Addr::from(rng.gen::<[u8; 4]>()).to_string(), + session_id: uuid::Builder::from_random_bytes(rng.r#gen()).into_uuid(), + peer_addr: Ipv4Addr::from(rng.r#gen::<[u8; 4]>()).to_string(), timestamp: chrono::DateTime::from_timestamp_millis( rng.gen_range(1703862754..1803862754), ) .unwrap() .naive_utc(), application_name: Some("test".to_owned()), - username: Some(hex::encode(rng.gen::<[u8; 4]>())), - endpoint_id: Some(hex::encode(rng.gen::<[u8; 16]>())), - database: Some(hex::encode(rng.gen::<[u8; 16]>())), - project: Some(hex::encode(rng.gen::<[u8; 16]>())), - branch: Some(hex::encode(rng.gen::<[u8; 16]>())), + username: Some(hex::encode(rng.r#gen::<[u8; 4]>())), + endpoint_id: Some(hex::encode(rng.r#gen::<[u8; 16]>())), + database: Some(hex::encode(rng.r#gen::<[u8; 16]>())), + project: Some(hex::encode(rng.r#gen::<[u8; 16]>())), + branch: Some(hex::encode(rng.r#gen::<[u8; 16]>())), pg_options: None, auth_method: None, jwt_issuer: None, protocol: ["tcp", "ws", "http"][rng.gen_range(0..3)], region: "us-east-1", error: None, - success: rng.gen(), + success: rng.r#gen(), cold_start_info: "no", duration_us: rng.gen_range(0..30_000_000), disconnect_timestamp: None, diff --git a/proxy/src/control_plane/client/cplane_proxy_v1.rs 
b/proxy/src/control_plane/client/cplane_proxy_v1.rs index ef6621fc59..977fcf4727 100644 --- a/proxy/src/control_plane/client/cplane_proxy_v1.rs +++ b/proxy/src/control_plane/client/cplane_proxy_v1.rs @@ -3,16 +3,16 @@ use std::sync::Arc; use std::time::Duration; -use ::http::header::AUTHORIZATION; use ::http::HeaderName; +use ::http::header::AUTHORIZATION; use futures::TryFutureExt; use postgres_client::config::SslMode; use tokio::time::Instant; -use tracing::{debug, info, info_span, warn, Instrument}; +use tracing::{Instrument, debug, info, info_span, warn}; use super::super::messages::{ControlPlaneErrorMessage, GetEndpointAccessControl, WakeCompute}; -use crate::auth::backend::jwt::AuthRule; use crate::auth::backend::ComputeUserInfo; +use crate::auth::backend::jwt::AuthRule; use crate::cache::Cached; use crate::context::RequestContext; use crate::control_plane::caches::ApiCaches; diff --git a/proxy/src/control_plane/client/mock.rs b/proxy/src/control_plane/client/mock.rs index 1e6cde8fb0..7da5464aa5 100644 --- a/proxy/src/control_plane/client/mock.rs +++ b/proxy/src/control_plane/client/mock.rs @@ -6,11 +6,11 @@ use std::sync::Arc; use futures::TryFutureExt; use thiserror::Error; use tokio_postgres::Client; -use tracing::{error, info, info_span, warn, Instrument}; +use tracing::{Instrument, error, info, info_span, warn}; -use crate::auth::backend::jwt::AuthRule; -use crate::auth::backend::ComputeUserInfo; use crate::auth::IpPattern; +use crate::auth::backend::ComputeUserInfo; +use crate::auth::backend::jwt::AuthRule; use crate::cache::Cached; use crate::context::RequestContext; use crate::control_plane::client::{ diff --git a/proxy/src/control_plane/client/mod.rs b/proxy/src/control_plane/client/mod.rs index c28ff4789d..746595de38 100644 --- a/proxy/src/control_plane/client/mod.rs +++ b/proxy/src/control_plane/client/mod.rs @@ -10,15 +10,15 @@ use clashmap::ClashMap; use tokio::time::Instant; use tracing::{debug, info}; -use crate::auth::backend::jwt::{AuthRule, FetchAuthRules, FetchAuthRulesError}; use crate::auth::backend::ComputeUserInfo; +use crate::auth::backend::jwt::{AuthRule, FetchAuthRules, FetchAuthRulesError}; use crate::cache::endpoints::EndpointsCache; use crate::cache::project_info::ProjectInfoCacheImpl; use crate::config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions}; use crate::context::RequestContext; use crate::control_plane::{ - errors, CachedAccessBlockerFlags, CachedAllowedIps, CachedAllowedVpcEndpointIds, - CachedNodeInfo, CachedRoleSecret, ControlPlaneApi, NodeInfoCache, + CachedAccessBlockerFlags, CachedAllowedIps, CachedAllowedVpcEndpointIds, CachedNodeInfo, + CachedRoleSecret, ControlPlaneApi, NodeInfoCache, errors, }; use crate::error::ReportableError; use crate::metrics::ApiLockMetrics; diff --git a/proxy/src/control_plane/errors.rs b/proxy/src/control_plane/errors.rs index d6f565e34a..bc30cffd27 100644 --- a/proxy/src/control_plane/errors.rs +++ b/proxy/src/control_plane/errors.rs @@ -2,7 +2,7 @@ use thiserror::Error; use crate::control_plane::client::ApiLockError; use crate::control_plane::messages::{self, ControlPlaneErrorMessage, Reason}; -use crate::error::{io_error, ErrorKind, ReportableError, UserFacingError}; +use crate::error::{ErrorKind, ReportableError, UserFacingError, io_error}; use crate::proxy::retry::CouldRetry; /// A go-to error message which doesn't leak any detail. 
diff --git a/proxy/src/control_plane/mgmt.rs b/proxy/src/control_plane/mgmt.rs index 2f7359240d..df31abcc8c 100644 --- a/proxy/src/control_plane/mgmt.rs +++ b/proxy/src/control_plane/mgmt.rs @@ -6,7 +6,7 @@ use postgres_backend::{AuthType, PostgresBackend, PostgresBackendTCP, QueryError use pq_proto::{BeMessage, SINGLE_COL_ROWDESC}; use tokio::net::{TcpListener, TcpStream}; use tokio_util::sync::CancellationToken; -use tracing::{error, info, info_span, Instrument}; +use tracing::{Instrument, error, info, info_span}; use crate::control_plane::messages::{DatabaseInfo, KickSession}; use crate::waiters::{self, Waiter, Waiters}; diff --git a/proxy/src/control_plane/mod.rs b/proxy/src/control_plane/mod.rs index 89ec4f9b33..d592223be1 100644 --- a/proxy/src/control_plane/mod.rs +++ b/proxy/src/control_plane/mod.rs @@ -11,9 +11,9 @@ pub(crate) mod errors; use std::sync::Arc; +use crate::auth::IpPattern; use crate::auth::backend::jwt::AuthRule; use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo}; -use crate::auth::IpPattern; use crate::cache::project_info::ProjectInfoCacheImpl; use crate::cache::{Cached, TimedLru}; use crate::config::ComputeConfig; diff --git a/proxy/src/http/health_server.rs b/proxy/src/http/health_server.rs index 141f319567..5278fe2a3e 100644 --- a/proxy/src/http/health_server.rs +++ b/proxy/src/http/health_server.rs @@ -9,8 +9,8 @@ use http_utils::json::json_response; use http_utils::{RouterBuilder, RouterService}; use hyper0::header::CONTENT_TYPE; use hyper0::{Body, Request, Response, StatusCode}; -use measured::text::BufferedTextEncoder; use measured::MetricGroup; +use measured::text::BufferedTextEncoder; use metrics::NeonMetrics; use tracing::{info, info_span}; diff --git a/proxy/src/http/mod.rs b/proxy/src/http/mod.rs index ed88c77256..96f600d836 100644 --- a/proxy/src/http/mod.rs +++ b/proxy/src/http/mod.rs @@ -13,8 +13,8 @@ use hyper::body::Body; pub(crate) use reqwest::{Request, Response}; use reqwest_middleware::RequestBuilder; pub(crate) use reqwest_middleware::{ClientWithMiddleware, Error}; -pub(crate) use reqwest_retry::policies::ExponentialBackoff; pub(crate) use reqwest_retry::RetryTransientMiddleware; +pub(crate) use reqwest_retry::policies::ExponentialBackoff; use thiserror::Error; use crate::metrics::{ConsoleRequest, Metrics}; diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs index fbd4811b54..3c34918d84 100644 --- a/proxy/src/logging.rs +++ b/proxy/src/logging.rs @@ -8,7 +8,7 @@ use opentelemetry::trace::TraceContextExt; use scopeguard::defer; use serde::ser::{SerializeMap, Serializer}; use tracing::subscriber::Interest; -use tracing::{callsite, span, Event, Metadata, Span, Subscriber}; +use tracing::{Event, Metadata, Span, Subscriber, callsite, span}; use tracing_opentelemetry::OpenTelemetrySpanExt; use tracing_subscriber::filter::{EnvFilter, LevelFilter}; use tracing_subscriber::fmt::format::{Format, Full}; diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index f3447e063e..db1f096de1 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -543,11 +543,7 @@ impl Drop for LatencyTimer { impl From for Bool { fn from(value: bool) -> Self { - if value { - Bool::True - } else { - Bool::False - } + if value { Bool::True } else { Bool::False } } } diff --git a/proxy/src/protocol2.rs b/proxy/src/protocol2.rs index 99d645878f..41180fa6c1 100644 --- a/proxy/src/protocol2.rs +++ b/proxy/src/protocol2.rs @@ -407,7 +407,7 @@ mod tests { use tokio::io::AsyncReadExt; use crate::protocol2::{ - read_proxy_protocol, ConnectHeader, LOCAL_V2, 
PROXY_V2, TCP_OVER_IPV4, UDP_OVER_IPV6, + ConnectHeader, LOCAL_V2, PROXY_V2, TCP_OVER_IPV4, UDP_OVER_IPV6, read_proxy_protocol, }; #[tokio::test] diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index 26fb1754bf..b8b39fa121 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -5,7 +5,7 @@ use tracing::{debug, info, warn}; use super::retry::ShouldRetryWakeCompute; use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo}; -use crate::compute::{self, PostgresConnection, COULD_NOT_CONNECT}; +use crate::compute::{self, COULD_NOT_CONNECT, PostgresConnection}; use crate::config::{ComputeConfig, RetryConfig}; use crate::context::RequestContext; use crate::control_plane::errors::WakeComputeError; @@ -15,7 +15,7 @@ use crate::error::ReportableError; use crate::metrics::{ ConnectOutcome, ConnectionFailureKind, Metrics, RetriesMetricGroup, RetryType, }; -use crate::proxy::retry::{retry_after, should_retry, CouldRetry}; +use crate::proxy::retry::{CouldRetry, retry_after, should_retry}; use crate::proxy::wake_compute::wake_compute; use crate::types::Host; diff --git a/proxy/src/proxy/copy_bidirectional.rs b/proxy/src/proxy/copy_bidirectional.rs index 861f1766e8..6f8b972348 100644 --- a/proxy/src/proxy/copy_bidirectional.rs +++ b/proxy/src/proxy/copy_bidirectional.rs @@ -1,7 +1,7 @@ use std::future::poll_fn; use std::io; use std::pin::Pin; -use std::task::{ready, Context, Poll}; +use std::task::{Context, Poll, ready}; use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; use tracing::info; diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index 49566e5172..0c6d352600 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -9,28 +9,28 @@ pub(crate) mod retry; pub(crate) mod wake_compute; use std::sync::Arc; -pub use copy_bidirectional::{copy_bidirectional_client_compute, ErrorSource}; +pub use copy_bidirectional::{ErrorSource, copy_bidirectional_client_compute}; use futures::{FutureExt, TryFutureExt}; use itertools::Itertools; use once_cell::sync::OnceCell; use pq_proto::{BeMessage as Be, CancelKeyData, StartupMessageParams}; use regex::Regex; use serde::{Deserialize, Serialize}; -use smol_str::{format_smolstr, SmolStr, ToSmolStr}; +use smol_str::{SmolStr, ToSmolStr, format_smolstr}; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; use tokio_util::sync::CancellationToken; -use tracing::{debug, error, info, warn, Instrument}; +use tracing::{Instrument, debug, error, info, warn}; -use self::connect_compute::{connect_to_compute, TcpMechanism}; +use self::connect_compute::{TcpMechanism, connect_to_compute}; use self::passthrough::ProxyPassthrough; use crate::cancellation::{self, CancellationHandler}; use crate::config::{ProxyConfig, ProxyProtocolV2, TlsConfig}; use crate::context::RequestContext; use crate::error::ReportableError; use crate::metrics::{Metrics, NumClientConnectionsGuard}; -use crate::protocol2::{read_proxy_protocol, ConnectHeader, ConnectionInfo, ConnectionInfoExtra}; -use crate::proxy::handshake::{handshake, HandshakeData}; +use crate::protocol2::{ConnectHeader, ConnectionInfo, ConnectionInfoExtra, read_proxy_protocol}; +use crate::proxy::handshake::{HandshakeData, handshake}; use crate::rate_limiter::EndpointRateLimiter; use crate::stream::{PqStream, Stream}; use crate::types::EndpointCacheKey; diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index d8c00a9b41..171f539b1e 100644 --- a/proxy/src/proxy/tests/mod.rs +++ 
b/proxy/src/proxy/tests/mod.rs @@ -5,12 +5,12 @@ mod mitm; use std::time::Duration; -use anyhow::{bail, Context}; +use anyhow::{Context, bail}; use async_trait::async_trait; use http::StatusCode; use postgres_client::config::SslMode; use postgres_client::tls::{MakeTlsConnect, NoTls}; -use retry::{retry_after, ShouldRetryWakeCompute}; +use retry::{ShouldRetryWakeCompute, retry_after}; use rstest::rstest; use rustls::crypto::ring; use rustls::pki_types; @@ -334,8 +334,8 @@ async fn scram_auth_mock() -> anyhow::Result<()> { generate_tls_config("generic-project-name.localhost", "localhost")?; let proxy = tokio::spawn(dummy_proxy(client, Some(server_config), Scram::mock())); - use rand::distributions::Alphanumeric; use rand::Rng; + use rand::distributions::Alphanumeric; let password: String = rand::thread_rng() .sample_iter(&Alphanumeric) .take(rand::random::() as usize) diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs index 4e9206feff..9d8915e24a 100644 --- a/proxy/src/proxy/wake_compute.rs +++ b/proxy/src/proxy/wake_compute.rs @@ -3,8 +3,8 @@ use tracing::{error, info}; use super::connect_compute::ComputeConnectBackend; use crate::config::RetryConfig; use crate::context::RequestContext; -use crate::control_plane::errors::{ControlPlaneError, WakeComputeError}; use crate::control_plane::CachedNodeInfo; +use crate::control_plane::errors::{ControlPlaneError, WakeComputeError}; use crate::error::ReportableError; use crate::metrics::{ ConnectOutcome, ConnectionFailuresBreakdownGroup, Metrics, RetriesMetricGroup, RetryType, diff --git a/proxy/src/rate_limiter/leaky_bucket.rs b/proxy/src/rate_limiter/leaky_bucket.rs index 9645eaf725..b3853d48e4 100644 --- a/proxy/src/rate_limiter/leaky_bucket.rs +++ b/proxy/src/rate_limiter/leaky_bucket.rs @@ -3,7 +3,7 @@ use std::sync::atomic::{AtomicUsize, Ordering}; use ahash::RandomState; use clashmap::ClashMap; -use rand::{thread_rng, Rng}; +use rand::{Rng, thread_rng}; use tokio::time::Instant; use tracing::info; use utils::leaky_bucket::LeakyBucketState; diff --git a/proxy/src/rate_limiter/limit_algorithm.rs b/proxy/src/rate_limiter/limit_algorithm.rs index b74a9ab17e..f8eeb89f05 100644 --- a/proxy/src/rate_limiter/limit_algorithm.rs +++ b/proxy/src/rate_limiter/limit_algorithm.rs @@ -5,8 +5,8 @@ use std::time::Duration; use parking_lot::Mutex; use tokio::sync::Notify; -use tokio::time::error::Elapsed; use tokio::time::Instant; +use tokio::time::error::Elapsed; use self::aimd::Aimd; diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index ef6c39f230..71e2a92da6 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -1,8 +1,8 @@ use std::borrow::Cow; use std::collections::hash_map::RandomState; use std::hash::{BuildHasher, Hash}; -use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Mutex; +use std::sync::atomic::{AtomicUsize, Ordering}; use anyhow::bail; use clashmap::ClashMap; diff --git a/proxy/src/redis/elasticache.rs b/proxy/src/redis/elasticache.rs index bf6dde9332..58e3c889a7 100644 --- a/proxy/src/redis/elasticache.rs +++ b/proxy/src/redis/elasticache.rs @@ -1,6 +1,7 @@ use std::sync::Arc; use std::time::{Duration, SystemTime}; +use aws_config::Region; use aws_config::environment::EnvironmentVariableCredentialsProvider; use aws_config::imds::credentials::ImdsCredentialsProvider; use aws_config::meta::credentials::CredentialsProviderChain; @@ -8,7 +9,6 @@ use aws_config::meta::region::RegionProviderChain; use 
aws_config::profile::ProfileFileCredentialsProvider; use aws_config::provider_config::ProviderConfig; use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider; -use aws_config::Region; use aws_sdk_iam::config::ProvideCredentials; use aws_sigv4::http_request::{ self, SignableBody, SignableRequest, SignatureLocation, SigningSettings, diff --git a/proxy/src/redis/keys.rs b/proxy/src/redis/keys.rs index dcb9a59f87..7527bca6d0 100644 --- a/proxy/src/redis/keys.rs +++ b/proxy/src/redis/keys.rs @@ -1,7 +1,7 @@ use std::io::ErrorKind; use anyhow::Ok; -use pq_proto::{id_to_cancel_key, CancelKeyData}; +use pq_proto::{CancelKeyData, id_to_cancel_key}; use serde::{Deserialize, Serialize}; pub mod keyspace { diff --git a/proxy/src/sasl/stream.rs b/proxy/src/sasl/stream.rs index ac77556566..46e6a439e5 100644 --- a/proxy/src/sasl/stream.rs +++ b/proxy/src/sasl/stream.rs @@ -5,8 +5,8 @@ use std::io; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::info; -use super::messages::ServerMessage; use super::Mechanism; +use super::messages::ServerMessage; use crate::stream::PqStream; /// Abstracts away all peculiarities of the libpq's protocol. diff --git a/proxy/src/scram/countmin.rs b/proxy/src/scram/countmin.rs index 87ab6e0d5f..9d56c465ec 100644 --- a/proxy/src/scram/countmin.rs +++ b/proxy/src/scram/countmin.rs @@ -90,7 +90,7 @@ mod tests { // number of insert operations let m = rng.gen_range(1..100); - let id = uuid::Builder::from_random_bytes(rng.gen()).into_uuid(); + let id = uuid::Builder::from_random_bytes(rng.r#gen()).into_uuid(); ids.push((id, n, m)); // N = sum(actual) diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs index 77853db3db..abd5aeae5b 100644 --- a/proxy/src/scram/exchange.rs +++ b/proxy/src/scram/exchange.rs @@ -5,6 +5,7 @@ use std::convert::Infallible; use hmac::{Hmac, Mac}; use sha2::Sha256; +use super::ScramKey; use super::messages::{ ClientFinalMessage, ClientFirstMessage, OwnedServerFirstMessage, SCRAM_RAW_NONCE_LEN, }; @@ -12,7 +13,6 @@ use super::pbkdf2::Pbkdf2; use super::secret::ServerSecret; use super::signature::SignatureBuilder; use super::threadpool::ThreadPool; -use super::ScramKey; use crate::intern::EndpointIdInt; use crate::sasl::{self, ChannelBinding, Error as SaslError}; @@ -208,8 +208,8 @@ impl sasl::Mechanism for Exchange<'_> { type Output = super::ScramKey; fn exchange(mut self, input: &str) -> sasl::Result> { - use sasl::Step; use ExchangeState; + use sasl::Step; match &self.state { ExchangeState::Initial(init) => { match init.transition(self.secret, &self.tls_server_end_point, input)? 
{ diff --git a/proxy/src/scram/messages.rs b/proxy/src/scram/messages.rs index 0e54e7ded9..7b0b861ce9 100644 --- a/proxy/src/scram/messages.rs +++ b/proxy/src/scram/messages.rs @@ -4,7 +4,7 @@ use std::fmt; use std::ops::Range; use super::base64_decode_array; -use super::key::{ScramKey, SCRAM_KEY_LEN}; +use super::key::{SCRAM_KEY_LEN, ScramKey}; use super::signature::SignatureBuilder; use crate::sasl::ChannelBinding; diff --git a/proxy/src/scram/mod.rs b/proxy/src/scram/mod.rs index cfa571cbe1..24f991d4d9 100644 --- a/proxy/src/scram/mod.rs +++ b/proxy/src/scram/mod.rs @@ -15,7 +15,7 @@ mod secret; mod signature; pub mod threadpool; -pub(crate) use exchange::{exchange, Exchange}; +pub(crate) use exchange::{Exchange, exchange}; use hmac::{Hmac, Mac}; pub(crate) use key::ScramKey; pub(crate) use secret::ServerSecret; diff --git a/proxy/src/scram/signature.rs b/proxy/src/scram/signature.rs index d3255cf2ca..a5b1c3e9f4 100644 --- a/proxy/src/scram/signature.rs +++ b/proxy/src/scram/signature.rs @@ -1,6 +1,6 @@ //! Tools for client/server signature management. -use super::key::{ScramKey, SCRAM_KEY_LEN}; +use super::key::{SCRAM_KEY_LEN, ScramKey}; /// A collection of message parts needed to derive the client's signature. #[derive(Debug)] diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 70dd7bc0e7..72029102e0 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -7,27 +7,27 @@ use ed25519_dalek::SigningKey; use hyper_util::rt::{TokioExecutor, TokioIo, TokioTimer}; use jose_jwk::jose_b64; use rand::rngs::OsRng; -use tokio::net::{lookup_host, TcpStream}; +use tokio::net::{TcpStream, lookup_host}; use tracing::field::display; use tracing::{debug, info}; use super::conn_pool::poll_client; use super::conn_pool_lib::{Client, ConnInfo, EndpointConnPool, GlobalConnPool}; -use super::http_conn_pool::{self, poll_http2_client, HttpConnPool, Send}; -use super::local_conn_pool::{self, LocalConnPool, EXT_NAME, EXT_SCHEMA, EXT_VERSION}; +use super::http_conn_pool::{self, HttpConnPool, Send, poll_http2_client}; +use super::local_conn_pool::{self, EXT_NAME, EXT_SCHEMA, EXT_VERSION, LocalConnPool}; use crate::auth::backend::local::StaticAuthRules; use crate::auth::backend::{ComputeCredentials, ComputeUserInfo}; -use crate::auth::{self, check_peer_addr_is_in_list, AuthError}; +use crate::auth::{self, AuthError, check_peer_addr_is_in_list}; use crate::compute; use crate::compute_ctl::{ ComputeCtlError, ExtensionInstallRequest, Privilege, SetRoleGrantsRequest, }; use crate::config::{ComputeConfig, ProxyConfig}; use crate::context::RequestContext; +use crate::control_plane::CachedNodeInfo; use crate::control_plane::client::ApiLockError; use crate::control_plane::errors::{GetAuthInfoError, WakeComputeError}; use crate::control_plane::locks::ApiLocks; -use crate::control_plane::CachedNodeInfo; use crate::error::{ErrorKind, ReportableError, UserFacingError}; use crate::intern::EndpointIdInt; use crate::protocol2::ConnectionInfoExtra; diff --git a/proxy/src/serverless/cancel_set.rs b/proxy/src/serverless/cancel_set.rs index 6db986f1f7..ba8945afc5 100644 --- a/proxy/src/serverless/cancel_set.rs +++ b/proxy/src/serverless/cancel_set.rs @@ -6,7 +6,7 @@ use std::time::Duration; use indexmap::IndexMap; use parking_lot::Mutex; -use rand::{thread_rng, Rng}; +use rand::{Rng, thread_rng}; use rustc_hash::FxHasher; use tokio::time::Instant; use tokio_util::sync::CancellationToken; @@ -40,7 +40,7 @@ impl CancelSet { pub(crate) fn take(&self) -> Option { for _ in 
0..4 { - if let Some(token) = self.take_raw(thread_rng().gen()) { + if let Some(token) = self.take_raw(thread_rng().r#gen()) { return Some(token); } tracing::trace!("failed to get cancel token"); @@ -68,7 +68,7 @@ impl CancelShard { fn take(&mut self, rng: usize) -> Option { NonZeroUsize::new(self.tokens.len()).and_then(|len| { // 10 second grace period so we don't cancel new connections - if self.tokens.get_index(rng % len)?.1 .0.elapsed() < Duration::from_secs(10) { + if self.tokens.get_index(rng % len)?.1.0.elapsed() < Duration::from_secs(10) { return None; } diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index 447103edce..6a9089fc2a 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -1,17 +1,17 @@ use std::fmt; use std::pin::pin; use std::sync::{Arc, Weak}; -use std::task::{ready, Poll}; +use std::task::{Poll, ready}; -use futures::future::poll_fn; use futures::Future; -use postgres_client::tls::NoTlsStream; +use futures::future::poll_fn; use postgres_client::AsyncMessage; +use postgres_client::tls::NoTlsStream; use smallvec::SmallVec; use tokio::net::TcpStream; use tokio::time::Instant; use tokio_util::sync::CancellationToken; -use tracing::{error, info, info_span, warn, Instrument}; +use tracing::{Instrument, error, info, info_span, warn}; #[cfg(test)] use { super::conn_pool_lib::GlobalConnPoolOptions, diff --git a/proxy/src/serverless/conn_pool_lib.rs b/proxy/src/serverless/conn_pool_lib.rs index 9e21491655..933204994b 100644 --- a/proxy/src/serverless/conn_pool_lib.rs +++ b/proxy/src/serverless/conn_pool_lib.rs @@ -10,7 +10,7 @@ use parking_lot::RwLock; use postgres_client::ReadyForQueryStatus; use rand::Rng; use smol_str::ToSmolStr; -use tracing::{debug, info, Span}; +use tracing::{Span, debug, info}; use super::backend::HttpConnError; use super::conn_pool::ClientDataRemote; diff --git a/proxy/src/serverless/http_conn_pool.rs b/proxy/src/serverless/http_conn_pool.rs index fa21f24a1c..338a79b4b3 100644 --- a/proxy/src/serverless/http_conn_pool.rs +++ b/proxy/src/serverless/http_conn_pool.rs @@ -7,7 +7,7 @@ use hyper_util::rt::{TokioExecutor, TokioIo}; use parking_lot::RwLock; use smol_str::ToSmolStr; use tokio::net::TcpStream; -use tracing::{debug, error, info, info_span, Instrument}; +use tracing::{Instrument, debug, error, info, info_span}; use super::backend::HttpConnError; use super::conn_pool_lib::{ diff --git a/proxy/src/serverless/json.rs b/proxy/src/serverless/json.rs index ab012bd020..fbd12ad9cb 100644 --- a/proxy/src/serverless/json.rs +++ b/proxy/src/serverless/json.rs @@ -1,5 +1,5 @@ -use postgres_client::types::{Kind, Type}; use postgres_client::Row; +use postgres_client::types::{Kind, Type}; use serde_json::{Map, Value}; // diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs index 137a2d6377..8426a0810e 100644 --- a/proxy/src/serverless/local_conn_pool.rs +++ b/proxy/src/serverless/local_conn_pool.rs @@ -11,24 +11,24 @@ use std::collections::HashMap; use std::pin::pin; -use std::sync::atomic::AtomicUsize; use std::sync::Arc; -use std::task::{ready, Poll}; +use std::sync::atomic::AtomicUsize; +use std::task::{Poll, ready}; use std::time::Duration; use ed25519_dalek::{Signature, Signer, SigningKey}; -use futures::future::poll_fn; use futures::Future; +use futures::future::poll_fn; use indexmap::IndexMap; use jose_jwk::jose_b64::base64ct::{Base64UrlUnpadded, Encoding}; use parking_lot::RwLock; -use postgres_client::tls::NoTlsStream; use 
postgres_client::AsyncMessage; +use postgres_client::tls::NoTlsStream; use serde_json::value::RawValue; use tokio::net::TcpStream; use tokio::time::Instant; use tokio_util::sync::CancellationToken; -use tracing::{debug, error, info, info_span, warn, Instrument}; +use tracing::{Instrument, debug, error, info, info_span, warn}; use super::backend::HttpConnError; use super::conn_pool_lib::{ @@ -389,6 +389,9 @@ mod tests { // }); // println!("{}", serde_json::to_string(&jwk).unwrap()); - assert_eq!(jwt, "eyJhbGciOiJFZERTQSJ9.eyJmb28iOiJiYXIiLCJqdGkiOjIsIm5lc3RlZCI6eyJqdGkiOiJ0cmlja3kgbmVzdGluZyJ9fQ.Cvyc2By33KI0f0obystwdy8PN111L3Sc9_Mr2CU3XshtSqSdxuRxNEZGbb_RvyJf2IzheC_s7aBZ-jLeQ9N0Bg"); + assert_eq!( + jwt, + "eyJhbGciOiJFZERTQSJ9.eyJmb28iOiJiYXIiLCJqdGkiOjIsIm5lc3RlZCI6eyJqdGkiOiJ0cmlja3kgbmVzdGluZyJ9fQ.Cvyc2By33KI0f0obystwdy8PN111L3Sc9_Mr2CU3XshtSqSdxuRxNEZGbb_RvyJf2IzheC_s7aBZ-jLeQ9N0Bg" + ); } } diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs index 8289500159..dd0fb9c5b4 100644 --- a/proxy/src/serverless/mod.rs +++ b/proxy/src/serverless/mod.rs @@ -15,7 +15,7 @@ mod sql_over_http; mod websocket; use std::net::{IpAddr, SocketAddr}; -use std::pin::{pin, Pin}; +use std::pin::{Pin, pin}; use std::sync::Arc; use anyhow::Context; @@ -23,8 +23,8 @@ use async_trait::async_trait; use atomic_take::AtomicTake; use bytes::Bytes; pub use conn_pool_lib::GlobalConnPoolOptions; -use futures::future::{select, Either}; use futures::TryFutureExt; +use futures::future::{Either, select}; use http::{Method, Response, StatusCode}; use http_body_util::combinators::BoxBody; use http_body_util::{BodyExt, Empty}; @@ -32,23 +32,23 @@ use http_utils::error::ApiError; use hyper::body::Incoming; use hyper_util::rt::TokioExecutor; use hyper_util::server::conn::auto::Builder; -use rand::rngs::StdRng; use rand::SeedableRng; -use sql_over_http::{uuid_to_header_value, NEON_REQUEST_ID}; +use rand::rngs::StdRng; +use sql_over_http::{NEON_REQUEST_ID, uuid_to_header_value}; use tokio::io::{AsyncRead, AsyncWrite}; use tokio::net::{TcpListener, TcpStream}; use tokio::time::timeout; use tokio_rustls::TlsAcceptor; use tokio_util::sync::CancellationToken; use tokio_util::task::TaskTracker; -use tracing::{info, warn, Instrument}; +use tracing::{Instrument, info, warn}; use crate::cancellation::CancellationHandler; use crate::config::{ProxyConfig, ProxyProtocolV2}; use crate::context::RequestContext; use crate::ext::TaskExt; use crate::metrics::Metrics; -use crate::protocol2::{read_proxy_protocol, ChainRW, ConnectHeader, ConnectionInfo}; +use crate::protocol2::{ChainRW, ConnectHeader, ConnectionInfo, read_proxy_protocol}; use crate::proxy::run_until_cancelled; use crate::rate_limiter::EndpointRateLimiter; use crate::serverless::backend::PoolingBackend; diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 7c21d90ed8..8babfb5cd2 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -2,23 +2,23 @@ use std::pin::pin; use std::sync::Arc; use bytes::Bytes; -use futures::future::{select, try_join, Either}; +use futures::future::{Either, select, try_join}; use futures::{StreamExt, TryFutureExt}; -use http::header::AUTHORIZATION; use http::Method; +use http::header::AUTHORIZATION; use http_body_util::combinators::BoxBody; use http_body_util::{BodyExt, Full}; use http_utils::error::ApiError; use hyper::body::Incoming; use hyper::http::{HeaderName, HeaderValue}; -use hyper::{header, HeaderMap, Request, Response, StatusCode}; +use 
hyper::{HeaderMap, Request, Response, StatusCode, header}; use indexmap::IndexMap; use postgres_client::error::{DbError, ErrorPosition, SqlState}; use postgres_client::{GenericClient, IsolationLevel, NoTls, ReadyForQueryStatus, Transaction}; use pq_proto::StartupMessageParamsBuilder; use serde::Serialize; -use serde_json::value::RawValue; use serde_json::Value; +use serde_json::value::RawValue; use tokio::time::{self, Instant}; use tokio_util::sync::CancellationToken; use tracing::{debug, error, info}; @@ -31,15 +31,15 @@ use super::conn_pool::{AuthData, ConnInfoWithAuth}; use super::conn_pool_lib::{self, ConnInfo}; use super::error::HttpCodeError; use super::http_util::json_response; -use super::json::{json_to_pg_text, pg_text_row_to_json, JsonConversionError}; +use super::json::{JsonConversionError, json_to_pg_text, pg_text_row_to_json}; use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo}; -use crate::auth::{endpoint_sni, ComputeUserInfoParseError}; +use crate::auth::{ComputeUserInfoParseError, endpoint_sni}; use crate::config::{AuthenticationConfig, HttpConfig, ProxyConfig, TlsConfig}; use crate::context::RequestContext; use crate::error::{ErrorKind, ReportableError, UserFacingError}; -use crate::http::{read_body_with_limit, ReadBodyError}; +use crate::http::{ReadBodyError, read_body_with_limit}; use crate::metrics::{HttpDirection, Metrics}; -use crate::proxy::{run_until_cancelled, NeonOptions}; +use crate::proxy::{NeonOptions, run_until_cancelled}; use crate::serverless::backend::HttpConnError; use crate::types::{DbName, RoleName}; use crate::usage_metrics::{MetricCounter, MetricCounterRecorder, TrafficDirection}; @@ -1021,7 +1021,7 @@ async fn query_to_json( data: QueryData, current_size: &mut usize, parsed_headers: HttpHeaders, -) -> Result<(ReadyForQueryStatus, impl Serialize), SqlOverHttpError> { +) -> Result<(ReadyForQueryStatus, impl Serialize + use), SqlOverHttpError> { let query_start = Instant::now(); let query_params = data.params; diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index 585a7d63b2..c4baeeb5cc 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -1,6 +1,6 @@ use std::pin::Pin; use std::sync::Arc; -use std::task::{ready, Context, Poll}; +use std::task::{Context, Poll, ready}; use anyhow::Context as _; use bytes::{Buf, BufMut, Bytes, BytesMut}; @@ -15,9 +15,9 @@ use tracing::warn; use crate::cancellation::CancellationHandler; use crate::config::ProxyConfig; use crate::context::RequestContext; -use crate::error::{io_error, ReportableError}; +use crate::error::{ReportableError, io_error}; use crate::metrics::Metrics; -use crate::proxy::{handle_client, ClientMode, ErrorSource}; +use crate::proxy::{ClientMode, ErrorSource, handle_client}; use crate::rate_limiter::EndpointRateLimiter; pin_project! 
{ @@ -184,11 +184,11 @@ mod tests { use framed_websockets::WebSocketServer; use futures::{SinkExt, StreamExt}; - use tokio::io::{duplex, AsyncReadExt, AsyncWriteExt}; + use tokio::io::{AsyncReadExt, AsyncWriteExt, duplex}; use tokio::task::JoinSet; - use tokio_tungstenite::tungstenite::protocol::Role; - use tokio_tungstenite::tungstenite::Message; use tokio_tungstenite::WebSocketStream; + use tokio_tungstenite::tungstenite::Message; + use tokio_tungstenite::tungstenite::protocol::Role; use super::WebSocketRw; diff --git a/proxy/src/signals.rs b/proxy/src/signals.rs index 0b675683c0..32b2344a1c 100644 --- a/proxy/src/signals.rs +++ b/proxy/src/signals.rs @@ -12,7 +12,7 @@ pub async fn handle( where F: FnMut(), { - use tokio::signal::unix::{signal, SignalKind}; + use tokio::signal::unix::{SignalKind, signal}; let mut hangup = signal(SignalKind::hangup())?; let mut interrupt = signal(SignalKind::interrupt())?; diff --git a/proxy/src/tls/postgres_rustls.rs b/proxy/src/tls/postgres_rustls.rs index 0ad279b635..f09e916a1d 100644 --- a/proxy/src/tls/postgres_rustls.rs +++ b/proxy/src/tls/postgres_rustls.rs @@ -2,8 +2,8 @@ use std::convert::TryFrom; use std::sync::Arc; use postgres_client::tls::MakeTlsConnect; -use rustls::pki_types::ServerName; use rustls::ClientConfig; +use rustls::pki_types::ServerName; use tokio::io::{AsyncRead, AsyncWrite}; mod private { @@ -15,8 +15,8 @@ mod private { use postgres_client::tls::{ChannelBinding, TlsConnect}; use rustls::pki_types::ServerName; use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; - use tokio_rustls::client::TlsStream; use tokio_rustls::TlsConnector; + use tokio_rustls::client::TlsStream; use crate::tls::TlsServerEndPoint; diff --git a/proxy/src/tls/server_config.rs b/proxy/src/tls/server_config.rs index 2cc1657eea..903c0b712b 100644 --- a/proxy/src/tls/server_config.rs +++ b/proxy/src/tls/server_config.rs @@ -1,12 +1,12 @@ use std::collections::{HashMap, HashSet}; use std::sync::Arc; -use anyhow::{bail, Context}; +use anyhow::{Context, bail}; use itertools::Itertools; use rustls::crypto::ring::{self, sign}; use rustls::pki_types::{CertificateDer, PrivateKeyDer}; -use super::{TlsServerEndPoint, PG_ALPN_PROTOCOL}; +use super::{PG_ALPN_PROTOCOL, TlsServerEndPoint}; pub struct TlsConfig { pub config: Arc, diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs index 6a23f0e129..004d268fa1 100644 --- a/proxy/src/usage_metrics.rs +++ b/proxy/src/usage_metrics.rs @@ -2,17 +2,17 @@ //! and push them to a HTTP endpoint. 
use std::borrow::Cow; use std::convert::Infallible; -use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; use std::sync::Arc; +use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; use std::time::Duration; -use anyhow::{bail, Context}; +use anyhow::{Context, bail}; use async_compression::tokio::write::GzipEncoder; use bytes::Bytes; use chrono::{DateTime, Datelike, Timelike, Utc}; -use clashmap::mapref::entry::Entry; use clashmap::ClashMap; -use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE}; +use clashmap::mapref::entry::Entry; +use consumption_metrics::{CHUNK_SIZE, Event, EventChunk, EventType, idempotency_key}; use once_cell::sync::Lazy; use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel}; use serde::{Deserialize, Serialize}; @@ -62,11 +62,7 @@ mod none_as_empty_string { d: D, ) -> Result, D::Error> { let s = SmolStr::deserialize(d)?; - if s.is_empty() { - Ok(None) - } else { - Ok(Some(s)) - } + if s.is_empty() { Ok(None) } else { Ok(Some(s)) } } } From 5fad4a4ceeb990523336aefb3aa1fe6e0fac7eae Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Mon, 24 Feb 2025 10:30:21 -0500 Subject: [PATCH 563/583] feat(storcon): chaos injection of force exit (#10934) ## Problem close https://github.com/neondatabase/cloud/issues/24485 ## Summary of changes This patch adds a new chaos injection mode for the storcon. The chaos injector reads the crontab and exits immediately at the configured time. --------- Signed-off-by: Alex Chi Z --- Cargo.lock | 16 +++- Cargo.toml | 1 + storage_controller/Cargo.toml | 1 + storage_controller/src/main.rs | 10 ++- .../src/service/chaos_injector.rs | 82 ++++++++++++++++--- 5 files changed, 93 insertions(+), 17 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 038727f1a8..47552174d2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1546,6 +1546,17 @@ dependencies = [ "itertools 0.10.5", ] +[[package]] +name = "cron" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5877d3fbf742507b66bc2a1945106bd30dd8504019d596901ddd012a4dd01740" +dependencies = [ + "chrono", + "once_cell", + "winnow", +] + [[package]] name = "crossbeam-channel" version = "0.5.8" @@ -6446,6 +6457,7 @@ dependencies = [ "chrono", "clap", "control_plane", + "cron", "diesel", "diesel-async", "diesel_migrations", @@ -8138,9 +8150,9 @@ checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" [[package]] name = "winnow" -version = "0.6.13" +version = "0.6.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59b5e5f6c299a3c7890b876a2a587f3115162487e704907d9b6cd29473052ba1" +checksum = "1e90edd2ac1aa278a5c4599b1d89cf03074b610800f866d4026dc199d7929a28" dependencies = [ "memchr", ] diff --git a/Cargo.toml b/Cargo.toml index 21310ce6ec..e6ca3c982c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -77,6 +77,7 @@ byteorder = "1.4" bytes = "1.9" camino = "1.1.6" cfg-if = "1.0.0" +cron = "0.15" chrono = { version = "0.4", default-features = false, features = ["clock"] } clap = { version = "4.0", features = ["derive", "env"] } clashmap = { version = "1.0", features = ["raw-api"] } diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index 08c80bc141..8e82996db1 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -18,6 +18,7 @@ anyhow.workspace = true bytes.workspace = true chrono.workspace = true clap.workspace = true +cron.workspace = true fail.workspace = true 
futures.workspace = true hex.workspace = true diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 18922b9e05..4152e40a76 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -115,10 +115,14 @@ struct Cli { #[arg(long)] neon_local_repo_dir: Option, - /// Chaos testing + /// Chaos testing: exercise tenant migrations #[arg(long)] chaos_interval: Option, + /// Chaos testing: exercise an immediate exit + #[arg(long)] + chaos_exit_crontab: Option, + // Maximum acceptable lag for the secondary location while draining // a pageserver #[arg(long)] @@ -382,10 +386,12 @@ async fn async_main() -> anyhow::Result<()> { let service = service.clone(); let cancel = CancellationToken::new(); let cancel_bg = cancel.clone(); + let chaos_exit_crontab = args.chaos_exit_crontab; ( tokio::task::spawn( async move { - let mut chaos_injector = ChaosInjector::new(service, interval.into()); + let mut chaos_injector = + ChaosInjector::new(service, interval.into(), chaos_exit_crontab); chaos_injector.run(cancel_bg).await } .instrument(tracing::info_span!("chaos_injector")), diff --git a/storage_controller/src/service/chaos_injector.rs b/storage_controller/src/service/chaos_injector.rs index aa0ee0df5a..25a0fab5ca 100644 --- a/storage_controller/src/service/chaos_injector.rs +++ b/storage_controller/src/service/chaos_injector.rs @@ -16,29 +16,80 @@ use super::{Node, Scheduler, Service, TenantShard}; pub struct ChaosInjector { service: Arc, interval: Duration, + chaos_exit_crontab: Option, +} + +fn cron_to_next_duration(cron: &cron::Schedule) -> anyhow::Result { + use chrono::Utc; + let next = cron.upcoming(Utc).next().unwrap(); + let duration = (next - Utc::now()).to_std()?; + Ok(tokio::time::sleep(duration)) +} + +async fn maybe_sleep(sleep: Option) -> Option<()> { + if let Some(sleep) = sleep { + sleep.await; + Some(()) + } else { + None + } } impl ChaosInjector { - pub fn new(service: Arc, interval: Duration) -> Self { - Self { service, interval } + pub fn new( + service: Arc, + interval: Duration, + chaos_exit_crontab: Option, + ) -> Self { + Self { + service, + interval, + chaos_exit_crontab, + } } pub async fn run(&mut self, cancel: CancellationToken) { let mut interval = tokio::time::interval(self.interval); - - loop { - tokio::select! { - _ = interval.tick() => {} - _ = cancel.cancelled() => { - tracing::info!("Shutting down"); - return; + let cron_interval = { + if let Some(ref chaos_exit_crontab) = self.chaos_exit_crontab { + match cron_to_next_duration(chaos_exit_crontab) { + Ok(interval_exit) => Some(interval_exit), + Err(e) => { + tracing::error!("Error processing the cron schedule: {e}"); + None + } } + } else { + None } - - self.inject_chaos().await; - - tracing::info!("Chaos iteration..."); + }; + enum ChaosEvent { + ShuffleTenant, + ForceKill, } + let chaos_type = tokio::select! 
{ _ = interval.tick() => { ChaosEvent::ShuffleTenant } Some(_) = maybe_sleep(cron_interval) => { ChaosEvent::ForceKill } _ = cancel.cancelled() => { tracing::info!("Shutting down"); return; } }; + + match chaos_type { + ChaosEvent::ShuffleTenant => { + self.inject_chaos().await; + } + ChaosEvent::ForceKill => { + self.force_kill().await; + } + } + + tracing::info!("Chaos iteration..."); } /// If a shard has a secondary and attached location, then re-assign the secondary to be @@ -95,6 +146,11 @@ impl ChaosInjector { ); } + async fn force_kill(&mut self) { + tracing::warn!("Injecting chaos: force kill"); + std::process::exit(1); + } + async fn inject_chaos(&mut self) { // Pick some shards to interfere with let batch_size = 128; From df362de0ddef6bd90d014eea12529c59c2ee1b76 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 24 Feb 2025 17:38:13 +0100 Subject: [PATCH 564/583] Reject basebackup requests for archived timelines (#10828) For archived timelines, we would like to prevent all non-pageserver-issued getpage requests, as users are not supposed to send these. Instead, they should unarchive a timeline before issuing any external read traffic. As this is non-trivial to do, at least prevent launches of new computes by erroring on basebackup requests for archived timelines. In #10688, we started issuing a warning instead of an error, because an error would mean a stuck project. Now that we can confirm the warning is not present in the logs for about a week, we can issue errors. Follow-up of #10688 Related: #9548 --- pageserver/src/page_service.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 7285697040..b9b8e32753 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -2097,9 +2097,8 @@ impl PageServerHandler { set_tracing_field_shard_id(&timeline); if timeline.is_archived() == Some(true) { - // TODO after a grace period, turn this log line into a hard error - tracing::warn!("timeline {tenant_id}/{timeline_id} is archived, but got basebackup request for it."); - //return Err(QueryError::NotFound("timeline is archived".into())) + tracing::info!("timeline {tenant_id}/{timeline_id} is archived, but got basebackup request for it."); + return Err(QueryError::NotFound("timeline is archived".into())); } let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn(); From 40acb0c06df769d69e31bb99174a3c4162b164f8 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 24 Feb 2025 19:15:07 +0200 Subject: [PATCH 565/583] Fix usage of WaitEventSetWait() with timeout (#10947) WaitEventSetWait() returns the number of "events" that happened, and only that many events in the WaitEvent array are updated. When the timeout is reached, it returns 0 and does not modify the WaitEvent array at all. We were reading 'event.events' without checking the return value, which would be uninitialized when the timeout was hit. No test included, as this is harmless at the moment. But this caused the test I'm including in PR #10882 to fail. That PR changes the logic to loop back to retry the PQgetCopyData() call if WL_SOCKET_READABLE was set. Currently, making an extra call to PQconsumeInput() is harmless, but with that change in logic, it turns into a busy-wait.
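To make the contract concrete, here is a minimal sketch of the corrected wait pattern against the stock PostgreSQL WaitEventSet API; the `wes` and `timeout_ms` names and the WAIT_EVENT_CLIENT_READ wait-event tag are illustrative placeholders, not the exact libpagestore.c code (the real fix follows in the diff below):

    WaitEvent   occurred_event;
    int         noccurred;

    /* Sleep until there is something to do, or until the timeout expires. */
    noccurred = WaitEventSetWait(wes, timeout_ms, &occurred_event, 1,
                                 WAIT_EVENT_CLIENT_READ);
    ResetLatch(MyLatch);
    CHECK_FOR_INTERRUPTS();

    /*
     * Only inspect occurred_event if at least one event was reported.
     * On timeout, WaitEventSetWait() returns 0 and leaves the WaitEvent
     * array untouched (uninitialized here).
     */
    if (noccurred > 0 && (occurred_event.events & WL_SOCKET_READABLE) != 0)
    {
        /* The socket is readable: now it is safe to consume input. */
    }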
--- pgxn/neon/libpagestore.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index fc1aecd340..f5801b379b 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -716,20 +716,21 @@ retry: if (ret == 0) { - WaitEvent event; + WaitEvent occurred_event; + int noccurred; long timeout; timeout = Max(0, LOG_INTERVAL_MS - INSTR_TIME_GET_MILLISEC(since_last_log)); /* Sleep until there's something to do */ - (void) WaitEventSetWait(shard->wes_read, timeout, &event, 1, - WAIT_EVENT_NEON_PS_READ); + noccurred = WaitEventSetWait(shard->wes_read, timeout, &occurred_event, 1, + WAIT_EVENT_NEON_PS_READ); ResetLatch(MyLatch); CHECK_FOR_INTERRUPTS(); /* Data available in socket? */ - if (event.events & WL_SOCKET_READABLE) + if (noccurred > 0 && (occurred_event.events & WL_SOCKET_READABLE) != 0) { if (!PQconsumeInput(pageserver_conn)) { From 1f0dea9a1a6b7f5f575f3f64ac0725f85e9535f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?JC=20Gr=C3=BCnhage?= Date: Mon, 24 Feb 2025 18:45:23 +0100 Subject: [PATCH 566/583] feat(ci): push container images to ghcr.io as well (#10945) ## Problem There are new rate limits coming on Docker Hub. To reduce our reliance on Docker Hub and the problems the limits are going to cause for us, we want to prepare for this by also pushing our container images to ghcr.io. ## Summary of changes Push our images to ghcr.io as well, not just Docker Hub. --- .../workflows/_push-to-container-registry.yml | 35 +++++++------- .github/workflows/build_and_test.yml | 48 ++++++++++--------- .github/workflows/pin-build-tools-image.yml | 12 +++-- scripts/generate_image_maps.py | 1 + 4 files changed, 52 insertions(+), 44 deletions(-) diff --git a/.github/workflows/_push-to-container-registry.yml b/.github/workflows/_push-to-container-registry.yml index c938f62ad5..403d078988 100644 --- a/.github/workflows/_push-to-container-registry.yml +++ b/.github/workflows/_push-to-container-registry.yml @@ -11,8 +11,12 @@ on: description: AWS region to log in to. Required when pushing to ECR. required: false type: string - aws-account-ids: - description: Comma separated AWS account IDs to log in to for pushing to ECR. Required when pushing to ECR. + aws-account-id: + description: AWS account ID to log in to for pushing to ECR. Required when pushing to ECR. + required: false + type: string + aws-role-to-assume: + description: AWS role to assume to for pushing to ECR. Required when pushing to ECR. required: false type: string azure-client-id: @@ -31,16 +35,6 @@ on: description: ACR registry name. Required when pushing to ACR. required: false type: string - secrets: - docker-hub-username: - description: Docker Hub username. Required when pushing to Docker Hub. - required: false - docker-hub-password: - description: Docker Hub password. Required when pushing to Docker Hub. - required: false - aws-role-to-assume: - description: AWS role to assume. Required when pushing to ECR.
- required: false permissions: {} @@ -53,6 +47,7 @@ jobs: runs-on: ubuntu-22.04 permissions: id-token: write # Required for aws/azure login + packages: write # required for pushing to GHCR steps: - uses: actions/checkout@v4 with: @@ -67,14 +62,14 @@ jobs: uses: aws-actions/configure-aws-credentials@v4 with: aws-region: "${{ inputs.aws-region }}" - role-to-assume: "${{ secrets.aws-role-to-assume }}" + role-to-assume: "arn:aws:iam::${{ inputs.aws-account-id }}:role/${{ inputs.aws-role-to-assume }}" role-duration-seconds: 3600 - name: Login to ECR if: contains(inputs.image-map, 'amazonaws.com/') uses: aws-actions/amazon-ecr-login@v2 with: - registries: "${{ inputs.aws-account-ids }}" + registries: "${{ inputs.aws-account-id }}" - name: Configure Azure credentials if: contains(inputs.image-map, 'azurecr.io/') @@ -89,11 +84,19 @@ jobs: run: | az acr login --name=${{ inputs.acr-registry-name }} + - name: Login to GHCR + if: contains(inputs.image-map, 'ghcr.io/') + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + - name: Log in to Docker Hub uses: docker/login-action@v3 with: - username: ${{ secrets.docker-hub-username }} - password: ${{ secrets.docker-hub-password }} + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - name: Copy docker images to target registries run: python scripts/push_with_image_map.py diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 8f3392ceea..1b706b3f16 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -866,68 +866,72 @@ jobs: push-neon-image-dev: needs: [ generate-image-maps, neon-image ] uses: ./.github/workflows/_push-to-container-registry.yml + permissions: + id-token: write # Required for aws/azure login + packages: write # required for pushing to GHCR with: image-map: '${{ needs.generate-image-maps.outputs.neon-dev }}' aws-region: ${{ vars.AWS_ECR_REGION }} - aws-account-ids: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}" + aws-account-id: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}" + aws-role-to-assume: "gha-oidc-neon-admin" azure-client-id: ${{ vars.AZURE_DEV_CLIENT_ID }} azure-subscription-id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }} azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} acr-registry-name: ${{ vars.AZURE_DEV_REGISTRY_NAME }} - secrets: - aws-role-to-assume: "${{ vars.DEV_AWS_OIDC_ROLE_ARN }}" - docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + secrets: inherit push-compute-image-dev: needs: [ generate-image-maps, vm-compute-node-image ] uses: ./.github/workflows/_push-to-container-registry.yml + permissions: + id-token: write # Required for aws/azure login + packages: write # required for pushing to GHCR with: image-map: '${{ needs.generate-image-maps.outputs.compute-dev }}' aws-region: ${{ vars.AWS_ECR_REGION }} - aws-account-ids: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}" + aws-account-id: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}" + aws-role-to-assume: "gha-oidc-neon-admin" azure-client-id: ${{ vars.AZURE_DEV_CLIENT_ID }} azure-subscription-id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }} azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} acr-registry-name: ${{ vars.AZURE_DEV_REGISTRY_NAME }} - secrets: - aws-role-to-assume: "${{ vars.DEV_AWS_OIDC_ROLE_ARN }}" - docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + 
secrets: inherit push-neon-image-prod: if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' needs: [ generate-image-maps, neon-image, test-images ] uses: ./.github/workflows/_push-to-container-registry.yml + permissions: + id-token: write # Required for aws/azure login + packages: write # required for pushing to GHCR with: image-map: '${{ needs.generate-image-maps.outputs.neon-prod }}' aws-region: ${{ vars.AWS_ECR_REGION }} - aws-account-ids: "${{ vars.NEON_PROD_AWS_ACCOUNT_ID }}" + aws-account-id: "${{ vars.NEON_PROD_AWS_ACCOUNT_ID }}" + aws-role-to-assume: "gha-oidc-neon-admin" azure-client-id: ${{ vars.AZURE_PROD_CLIENT_ID }} azure-subscription-id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }} azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} acr-registry-name: ${{ vars.AZURE_PROD_REGISTRY_NAME }} - secrets: - aws-role-to-assume: "${{ secrets.PROD_GHA_OIDC_ROLE }}" - docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + secrets: inherit push-compute-image-prod: if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' needs: [ generate-image-maps, vm-compute-node-image, test-images ] uses: ./.github/workflows/_push-to-container-registry.yml + permissions: + id-token: write # Required for aws/azure login + packages: write # required for pushing to GHCR with: image-map: '${{ needs.generate-image-maps.outputs.compute-prod }}' aws-region: ${{ vars.AWS_ECR_REGION }} - aws-account-ids: "${{ vars.NEON_PROD_AWS_ACCOUNT_ID }}" + aws-account-id: "${{ vars.NEON_PROD_AWS_ACCOUNT_ID }}" + aws-role-to-assume: "gha-oidc-neon-admin" azure-client-id: ${{ vars.AZURE_PROD_CLIENT_ID }} azure-subscription-id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }} azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} acr-registry-name: ${{ vars.AZURE_PROD_REGISTRY_NAME }} - secrets: - aws-role-to-assume: "${{ secrets.PROD_GHA_OIDC_ROLE }}" - docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + secrets: inherit # This is a bit of a special case so we're not using a generated image map. 
add-latest-tag-to-neon-extensions-test-image: @@ -940,9 +944,7 @@ jobs: "docker.io/neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }}": ["docker.io/neondatabase/neon-test-extensions-v16:latest"], "docker.io/neondatabase/neon-test-extensions-v17:${{ needs.tag.outputs.build-tag }}": ["docker.io/neondatabase/neon-test-extensions-v17:latest"] } - secrets: - docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + secrets: inherit trigger-custom-extensions-build-and-wait: needs: [ check-permissions, tag ] diff --git a/.github/workflows/pin-build-tools-image.yml b/.github/workflows/pin-build-tools-image.yml index b305b662ee..d2588ba0bf 100644 --- a/.github/workflows/pin-build-tools-image.yml +++ b/.github/workflows/pin-build-tools-image.yml @@ -65,6 +65,7 @@ jobs: permissions: id-token: write # Required for aws/azure login + packages: write # required for pushing to GHCR uses: ./.github/workflows/_push-to-container-registry.yml with: @@ -72,12 +73,15 @@ jobs: { "docker.io/neondatabase/build-tools:${{ inputs.from-tag }}-bullseye": [ "docker.io/neondatabase/build-tools:pinned-bullseye", + "ghcr.io/neondatabase/build-tools:pinned-bullseye", "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/build-tools:pinned-bullseye", "${{ vars.AZURE_DEV_REGISTRY_NAME }}.azurecr.io/neondatabase/build-tools:pinned-bullseye" ], "docker.io/neondatabase/build-tools:${{ inputs.from-tag }}-bookworm": [ "docker.io/neondatabase/build-tools:pinned-bookworm", "docker.io/neondatabase/build-tools:pinned", + "ghcr.io/neondatabase/build-tools:pinned-bookworm", + "ghcr.io/neondatabase/build-tools:pinned", "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/build-tools:pinned-bookworm", "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/build-tools:pinned", "${{ vars.AZURE_DEV_REGISTRY_NAME }}.azurecr.io/neondatabase/build-tools:pinned-bookworm", @@ -85,12 +89,10 @@ jobs: ] } aws-region: ${{ vars.AWS_ECR_REGION }} - aws-account-ids: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}" + aws-account-id: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}" + aws-role-to-assume: "gha-oidc-neon-admin" azure-client-id: ${{ vars.AZURE_DEV_CLIENT_ID }} azure-subscription-id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }} azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} acr-registry-name: ${{ vars.AZURE_DEV_REGISTRY_NAME }} - secrets: - aws-role-to-assume: "${{ vars.DEV_AWS_OIDC_ROLE_ARN }}" - docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + secrets: inherit diff --git a/scripts/generate_image_maps.py b/scripts/generate_image_maps.py index 915eb33673..39ece5b38f 100644 --- a/scripts/generate_image_maps.py +++ b/scripts/generate_image_maps.py @@ -27,6 +27,7 @@ components = { registries = { "dev": [ "docker.io/neondatabase", + "ghcr.io/neondatabase", f"{dev_aws}.dkr.ecr.{aws_region}.amazonaws.com", f"{dev_acr}.azurecr.io/neondatabase", ], From 8fd0f89b9499f38c8abf634ce068c235ef0305eb Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Mon, 24 Feb 2025 18:50:49 +0100 Subject: [PATCH 567/583] rename libduckdb.so in pg_duckdb context to avoid conflict with pg_mooncake (#10915) ## Problem Introducing pg_duckdb caused a conflict with pg_mooncake. Both use libduckdb.so in different versions. 
## Summary of changes - Rename the libduckdb.so to libduckdb_pg_duckdb.so in the context of pg_duckdb so that it doesn't conflict with libduckdb.so referenced by pg_mooncake. - use a version map to rename the duckdb symbols to a version specific name - DUCKDB_1.1.3 for pg_mooncake - DUCKDB_1.2.0 for pg_duckdb For the concept of version maps see - https://www.man7.org/conf/lca2006/shared_libraries/slide19a.html - https://peeterjoot.com/2019/09/20/an-example-of-linux-glibc-symbol-versioning/ - https://akkadia.org/drepper/dsohowto.pdf --- compute/compute-node.Dockerfile | 13 +++--- compute/patches/duckdb_v113.patch | 25 +++++++++++ compute/patches/duckdb_v120.patch | 67 ++++++++++++++++++++++++++++ compute/patches/pg_duckdb_v031.patch | 22 +++++++++ 4 files changed, 121 insertions(+), 6 deletions(-) create mode 100644 compute/patches/duckdb_v113.patch create mode 100644 compute/patches/duckdb_v120.patch diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index ef4c22612d..a74291fdb4 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1458,9 +1458,11 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) && \ FROM build-deps AS pg_mooncake-src ARG PG_VERSION WORKDIR /ext-src +COPY compute/patches/duckdb_v113.patch . RUN wget https://github.com/Mooncake-Labs/pg_mooncake/releases/download/v0.1.2/pg_mooncake-0.1.2.tar.gz -O pg_mooncake.tar.gz && \ echo "4550473784fcdd2e1e18062bc01eb9c286abd27cdf5e11a4399be6c0a426ba90 pg_mooncake.tar.gz" | sha256sum --check && \ mkdir pg_mooncake-src && cd pg_mooncake-src && tar xzf ../pg_mooncake.tar.gz --strip-components=1 -C . && \ + cd third_party/duckdb && patch -p1 < /ext-src/duckdb_v113.patch && cd ../.. && \ echo "make -f pg_mooncake-src/Makefile.build installcheck TEST_DIR=./test SQL_DIR=./sql SRC_DIR=./src" > neon-test.sh && \ chmod a+x neon-test.sh @@ -1480,6 +1482,7 @@ RUN make release -j $(getconf _NPROCESSORS_ONLN) && \ FROM build-deps AS pg_duckdb-src WORKDIR /ext-src COPY compute/patches/pg_duckdb_v031.patch . +COPY compute/patches/duckdb_v120.patch . # pg_duckdb build requires source dir to be a git repo to get submodules # allow neon_superuser to execute some functions that in pg_duckdb are available to superuser only: # - extension management function duckdb.install_extension() @@ -1487,7 +1490,9 @@ COPY compute/patches/pg_duckdb_v031.patch . RUN git clone --depth 1 --branch v0.3.1 https://github.com/duckdb/pg_duckdb.git pg_duckdb-src && \ cd pg_duckdb-src && \ git submodule update --init --recursive && \ - patch -p1 < /ext-src/pg_duckdb_v031.patch + patch -p1 < /ext-src/pg_duckdb_v031.patch && \ + cd third_party/duckdb && \ + patch -p1 < /ext-src/duckdb_v120.patch FROM pg-build AS pg_duckdb-build ARG PG_VERSION @@ -1676,11 +1681,7 @@ COPY --from=pg_anon-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_ivm-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_partman-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_mooncake-build /usr/local/pgsql/ /usr/local/pgsql/ - -# Disabled temporarily, because it clashed with pg_mooncake. pg_mooncake -# also depends on libduckdb, but a different version. 
-#COPY --from=pg_duckdb-build /usr/local/pgsql/ /usr/local/pgsql/ - +COPY --from=pg_duckdb-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_repack-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pgaudit-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pgauditlogtofile-build /usr/local/pgsql/ /usr/local/pgsql/ diff --git a/compute/patches/duckdb_v113.patch b/compute/patches/duckdb_v113.patch new file mode 100644 index 0000000000..b7b43b88bf --- /dev/null +++ b/compute/patches/duckdb_v113.patch @@ -0,0 +1,25 @@ +diff --git a/libduckdb.map b/libduckdb.map +new file mode 100644 +index 0000000000..3b56f00cd7 +--- /dev/null ++++ b/libduckdb.map +@@ -0,0 +1,6 @@ ++DUCKDB_1.1.3 { ++ global: ++ *duckdb*; ++ local: ++ *; ++}; +diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt +index 3e757a4bcc..88ab4005b9 100644 +--- a/src/CMakeLists.txt ++++ b/src/CMakeLists.txt +@@ -135,6 +135,8 @@ else() + target_link_libraries(duckdb ${DUCKDB_LINK_LIBS}) + link_threads(duckdb) + link_extension_libraries(duckdb) ++ target_link_options(duckdb PRIVATE ++ -Wl,--version-script=${CMAKE_SOURCE_DIR}/libduckdb.map) + + add_library(duckdb_static STATIC ${ALL_OBJECT_FILES}) + target_link_libraries(duckdb_static ${DUCKDB_LINK_LIBS}) diff --git a/compute/patches/duckdb_v120.patch b/compute/patches/duckdb_v120.patch new file mode 100644 index 0000000000..cf317736a5 --- /dev/null +++ b/compute/patches/duckdb_v120.patch @@ -0,0 +1,67 @@ +diff --git a/libduckdb_pg_duckdb.map b/libduckdb_pg_duckdb.map +new file mode 100644 +index 0000000000..0872978b48 +--- /dev/null ++++ b/libduckdb_pg_duckdb.map +@@ -0,0 +1,6 @@ ++DUCKDB_1.2.0 { ++ global: ++ *duckdb*; ++ local: ++ *; ++}; +diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt +index 58adef3fc0..2c522f91be 100644 +--- a/src/CMakeLists.txt ++++ b/src/CMakeLists.txt +@@ -59,7 +59,7 @@ endfunction() + + if(AMALGAMATION_BUILD) + +- add_library(duckdb SHARED "${PROJECT_SOURCE_DIR}/src/amalgamation/duckdb.cpp") ++ add_library(duckdb_pg_duckdb SHARED "${PROJECT_SOURCE_DIR}/src/amalgamation/duckdb.cpp") + target_link_libraries(duckdb ${DUCKDB_SYSTEM_LIBS}) + link_threads(duckdb) + link_extension_libraries(duckdb) +@@ -109,7 +109,7 @@ else() + duckdb_yyjson + duckdb_zstd) + +- add_library(duckdb SHARED ${ALL_OBJECT_FILES}) ++ add_library(duckdb_pg_duckdb SHARED ${ALL_OBJECT_FILES}) + + if(WIN32 AND NOT MINGW) + ensure_variable_is_number(DUCKDB_MAJOR_VERSION RC_MAJOR_VERSION) +@@ -131,9 +131,11 @@ else() + target_sources(duckdb PRIVATE version.rc) + endif() + +- target_link_libraries(duckdb ${DUCKDB_LINK_LIBS}) +- link_threads(duckdb) +- link_extension_libraries(duckdb) ++ target_link_libraries(duckdb_pg_duckdb ${DUCKDB_LINK_LIBS}) ++ link_threads(duckdb_pg_duckdb) ++ link_extension_libraries(duckdb_pg_duckdb) ++ target_link_options(duckdb_pg_duckdb PRIVATE ++ -Wl,--version-script=${CMAKE_SOURCE_DIR}/libduckdb_pg_duckdb.map) + + add_library(duckdb_static STATIC ${ALL_OBJECT_FILES}) + target_link_libraries(duckdb_static ${DUCKDB_LINK_LIBS}) +@@ -141,7 +143,7 @@ else() + link_extension_libraries(duckdb_static) + + target_include_directories( +- duckdb PUBLIC $ ++ duckdb_pg_duckdb PUBLIC $ + $) + + target_include_directories( +@@ -161,7 +163,7 @@ else() + endif() + + install( +- TARGETS duckdb duckdb_static ++ TARGETS duckdb_pg_duckdb duckdb_static + EXPORT "${DUCKDB_EXPORT_SET}" + LIBRARY DESTINATION "${INSTALL_LIB_DIR}" + ARCHIVE DESTINATION "${INSTALL_LIB_DIR}" diff --git a/compute/patches/pg_duckdb_v031.patch b/compute/patches/pg_duckdb_v031.patch index 
a7e188d69e..edc7fbf69d 100644 --- a/compute/patches/pg_duckdb_v031.patch +++ b/compute/patches/pg_duckdb_v031.patch @@ -1,3 +1,25 @@ +diff --git a/Makefile b/Makefile +index 3235cc8..6b892bc 100644 +--- a/Makefile ++++ b/Makefile +@@ -32,7 +32,7 @@ else + DUCKDB_BUILD_TYPE = release + endif + +-DUCKDB_LIB = libduckdb$(DLSUFFIX) ++DUCKDB_LIB = libduckdb_pg_duckdb$(DLSUFFIX) + FULL_DUCKDB_LIB = third_party/duckdb/build/$(DUCKDB_BUILD_TYPE)/src/$(DUCKDB_LIB) + + ERROR_ON_WARNING ?= +@@ -54,7 +54,7 @@ override PG_CXXFLAGS += -std=c++17 ${DUCKDB_BUILD_CXX_FLAGS} ${COMPILER_FLAGS} - + # changes to the vendored code in one place. + override PG_CFLAGS += -Wno-declaration-after-statement + +-SHLIB_LINK += -Wl,-rpath,$(PG_LIB)/ -lpq -Lthird_party/duckdb/build/$(DUCKDB_BUILD_TYPE)/src -L$(PG_LIB) -lduckdb -lstdc++ -llz4 ++SHLIB_LINK += -Wl,-rpath,$(PG_LIB)/ -lpq -Lthird_party/duckdb/build/$(DUCKDB_BUILD_TYPE)/src -L$(PG_LIB) -lduckdb_pg_duckdb -lstdc++ -llz4 + + include Makefile.global + diff --git a/sql/pg_duckdb--0.2.0--0.3.0.sql b/sql/pg_duckdb--0.2.0--0.3.0.sql index d777d76..af60106 100644 --- a/sql/pg_duckdb--0.2.0--0.3.0.sql From 565a9e62a1e865a0de1966b1afa146b958d39396 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 24 Feb 2025 22:16:37 +0200 Subject: [PATCH 568/583] compute: Disconnect if no response to a pageserver request is received (#10882) We've seen some cases in production where a compute doesn't get a response to a pageserver request for several minutes, or even more. We haven't found the root cause for that yet, but whatever the reason is, it seems overly optimistic to think that if the pageserver hasn't responded for 2 minutes, we'd get a response if we just wait patiently a little longer. More likely, the pageserver is dead or there's some kind of a network glitch so that the TCP connection is dead, or at least stuck for a long time. Either way, it's better to disconnect and reconnect. I set the default timeout to 2 minutes, which should be enough for any GetPage request under normal circumstances, even if the pageserver has to download several layer files from remote storage. Make the disconnect timeout configurable. Also make the "log interval", after which we print a message to the log configurable, so that if you change the disconnect timeout, you can set the log timeout correspondingly. The default log interval is still 10 s. The new GUCs are called "neon.pageserver_response_log_timeout" and "neon.pageserver_response_disconnect_timeout". Includes a basic test for the log and disconnect timeouts. 
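As an illustration of the timeout arithmetic described above, a simplified sketch follows (the names here are simplified; the real logic lives in call_PQgetCopyData() in pgxn/neon/libpagestore.c). Each wait sleeps only until the nearer of the two deadlines, so a single WaitEventSetWait() call serves both thresholds:

#include <math.h>

/* Illustrative sketch only. All values are in milliseconds. */
static long
next_wait_timeout(double ms_since_last_log, double ms_since_start,
                  double log_timeout_ms, double disconnect_timeout_ms)
{
    double  log_budget = fmax(0.0, log_timeout_ms - ms_since_last_log);
    double  disc_budget = fmax(0.0, disconnect_timeout_ms - ms_since_start);

    /* Sleep only until the nearer of the two deadlines. */
    return (long) ceil(fmin(log_budget, disc_budget));
}

With the default settings (10 s and 2 minutes), a backend stuck waiting on a GetPage response therefore logs a "still waiting" line roughly every 10 seconds and disconnects (and then reconnects) after 2 minutes.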
Implements issue #10857 --- pageserver/src/page_service.rs | 3 + pgxn/neon/libpagestore.c | 215 +++++++++++++++------ test_runner/regress/test_bad_connection.py | 191 ++++++++++++++++++ 3 files changed, 347 insertions(+), 62 deletions(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index b9b8e32753..cab3d76bf8 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -45,6 +45,7 @@ use utils::sync::gate::{Gate, GateGuard}; use utils::sync::spsc_fold; use utils::{ auth::{Claims, Scope, SwappableJwtAuth}, + failpoint_support, id::{TenantId, TimelineId}, lsn::Lsn, simple_rcu::RcuReadGuard, @@ -1298,6 +1299,8 @@ impl PageServerHandler { &response_msg.serialize(protocol_version), ))?; + failpoint_support::sleep_millis_async!("before-pagestream-msg-flush", cancel); + // what we want to do let socket_fd = pgb_writer.socket_fd; let flush_fut = pgb_writer.flush(); diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index f5801b379b..f71f11ff93 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -14,6 +14,8 @@ */ #include "postgres.h" +#include + #include "access/xlog.h" #include "common/hashfn.h" #include "fmgr.h" @@ -61,6 +63,9 @@ int neon_protocol_version = 2; static int max_reconnect_attempts = 60; static int stripe_size; +static int pageserver_response_log_timeout = 10000; +static int pageserver_response_disconnect_timeout = 120000; /* 2 minutes */ + typedef struct { char connstring[MAX_SHARDS][MAX_PAGESERVER_CONNSTRING_SIZE]; @@ -129,6 +134,11 @@ typedef struct uint64 nrequests_sent; uint64 nresponses_received; + /* State for the receive timeout mechanism in call_PQgetCopyData() */ + instr_time receive_start_time; /* when we started waiting */ + instr_time receive_last_log_time; /* when we last printed a log message for the wait */ + bool receive_logged; /* has the wait been logged */ + /*--- * WaitEventSet containing: * - WL_SOCKET_READABLE on 'conn' @@ -661,6 +671,9 @@ pageserver_connect(shardno_t shard_no, int elevel) shard->state = PS_Connected; shard->nrequests_sent = 0; shard->nresponses_received = 0; + INSTR_TIME_SET_ZERO(shard->receive_start_time); + INSTR_TIME_SET_ZERO(shard->receive_last_log_time); + shard->receive_logged = false; } /* FALLTHROUGH */ case PS_Connected: @@ -680,6 +693,33 @@ pageserver_connect(shardno_t shard_no, int elevel) Assert(false); } +static void +get_socket_stats(int socketfd, int *sndbuf, int *recvbuf) +{ + *sndbuf = -1; + *recvbuf = -1; + +#ifdef __linux__ + /* + * get kernel's send and recv queue size via ioctl + * https://elixir.bootlin.com/linux/v6.1.128/source/include/uapi/linux/sockios.h#L25-L27 + */ + if (socketfd != -1) + { + int ioctl_err; + + ioctl_err = ioctl(socketfd, SIOCOUTQ, sndbuf); + if (ioctl_err!= 0) { + *sndbuf = -errno; + } + ioctl_err = ioctl(socketfd, FIONREAD, recvbuf); + if (ioctl_err != 0) { + *recvbuf = -errno; + } + } +#endif +} + /* * A wrapper around PQgetCopyData that checks for interrupts while sleeping. */ @@ -690,26 +730,8 @@ call_PQgetCopyData(shardno_t shard_no, char **buffer) PageServer *shard = &page_servers[shard_no]; PGconn *pageserver_conn = shard->conn; instr_time now, - start_ts, since_start, - last_log_ts, since_last_log; - bool logged = false; - - /* - * As a debugging aid, if we don't get a response for a long time, print a - * log message. - * - * 10 s is a very generous threshold, normally we expect a response in a - * few milliseconds. 
We have metrics to track latencies in normal ranges, - * but in the cases that take exceptionally long, it's useful to log the - * exact timestamps. - */ -#define LOG_INTERVAL_MS INT64CONST(10 * 1000) - - INSTR_TIME_SET_CURRENT(now); - start_ts = last_log_ts = now; - INSTR_TIME_SET_ZERO(since_last_log); retry: ret = PQgetCopyData(pageserver_conn, buffer, 1 /* async */ ); @@ -718,11 +740,36 @@ retry: { WaitEvent occurred_event; int noccurred; + double log_timeout, + disconnect_timeout; long timeout; - timeout = Max(0, LOG_INTERVAL_MS - INSTR_TIME_GET_MILLISEC(since_last_log)); + /* + * Calculate time elapsed since the start, and since the last progress + * log message. On first call, remember the start time. + */ + INSTR_TIME_SET_CURRENT(now); + if (INSTR_TIME_IS_ZERO(shard->receive_start_time)) + { + shard->receive_start_time = now; + INSTR_TIME_SET_ZERO(since_start); + shard->receive_last_log_time = now; + INSTR_TIME_SET_ZERO(since_last_log); + shard->receive_logged = false; + } + else + { + since_start = now; + INSTR_TIME_SUBTRACT(since_start, shard->receive_start_time); + since_last_log = now; + INSTR_TIME_SUBTRACT(since_last_log, shard->receive_last_log_time); + } + + /* Sleep until the log or disconnect timeout is reached. */ + log_timeout = Max(0, (double) pageserver_response_log_timeout - INSTR_TIME_GET_MILLISEC(since_last_log)); + disconnect_timeout = Max(0, (double) pageserver_response_disconnect_timeout - INSTR_TIME_GET_MILLISEC(since_start)); + timeout = (long) ceil(Min(log_timeout, disconnect_timeout)); - /* Sleep until there's something to do */ noccurred = WaitEventSetWait(shard->wes_read, timeout, &occurred_event, 1, WAIT_EVENT_NEON_PS_READ); ResetLatch(MyLatch); @@ -740,49 +787,61 @@ retry: pfree(msg); return -1; } + goto retry; + } + + /* Timeout was reached, or we were interrupted for some other reason */ + INSTR_TIME_SET_CURRENT(now); + since_last_log = now; + INSTR_TIME_SUBTRACT(since_last_log, shard->receive_last_log_time); + since_start = now; + INSTR_TIME_SUBTRACT(since_start, shard->receive_start_time); + + /* + * As a debugging aid, if we don't get a response to a pageserver request + * for a long time, print a log message. + * + * The default neon.pageserver_response_log_timeout value, 10 s, is + * very generous. Normally we expect a response in a few + * milliseconds. We have metrics to track latencies in normal ranges, + * but in the cases that take exceptionally long, it's useful to log + * the exact timestamps. + */ + if (INSTR_TIME_GET_MILLISEC(since_last_log) >= pageserver_response_log_timeout) + { + int sndbuf; + int recvbuf; + + get_socket_stats(PQsocket(pageserver_conn), &sndbuf, &recvbuf); + + neon_shard_log(shard_no, LOG, + "no response received from pageserver for %0.3f s, still waiting (sent " UINT64_FORMAT " requests, received " UINT64_FORMAT " responses) (socket sndbuf=%d recvbuf=%d)", + INSTR_TIME_GET_DOUBLE(since_start), + shard->nrequests_sent, shard->nresponses_received, sndbuf, recvbuf); + shard->receive_last_log_time = now; + shard->receive_logged = true; } /* - * Print a message to the log if a long time has passed with no - * response. + * If an even longer time has passed without receiving a response from + * the pageserver, disconnect. That triggers a reconnection attempt + * in the caller. + * + * If this happens, the pageserver is likely dead and isn't coming + * back, or there's some kind of a network glitch and the connection + * is permanently gone. 
Without this, if the pageserver or the network + * connection is dead, it could take a very long time (15 minutes or + * more) until the TCP keepalive timeout notices that. Even if we + * would in fact get a response if we just waited a little longer, + * there's a good chance that we'll get the response sooner by + * reconnecting. */ - INSTR_TIME_SET_CURRENT(now); - since_last_log = now; - INSTR_TIME_SUBTRACT(since_last_log, last_log_ts); - if (INSTR_TIME_GET_MILLISEC(since_last_log) >= LOG_INTERVAL_MS) + if (INSTR_TIME_GET_MILLISEC(since_start) >= pageserver_response_disconnect_timeout) { - int sndbuf = -1; - int recvbuf = -1; -#ifdef __linux__ - int socketfd; -#endif - - since_start = now; - INSTR_TIME_SUBTRACT(since_start, start_ts); - -#ifdef __linux__ - /* - * get kernel's send and recv queue size via ioctl - * https://elixir.bootlin.com/linux/v6.1.128/source/include/uapi/linux/sockios.h#L25-L27 - */ - socketfd = PQsocket(pageserver_conn); - if (socketfd != -1) { - int ioctl_err; - ioctl_err = ioctl(socketfd, SIOCOUTQ, &sndbuf); - if (ioctl_err!= 0) { - sndbuf = -errno; - } - ioctl_err = ioctl(socketfd, FIONREAD, &recvbuf); - if (ioctl_err != 0) { - recvbuf = -errno; - } - } -#endif - neon_shard_log(shard_no, LOG, "no response received from pageserver for %0.3f s, still waiting (sent " UINT64_FORMAT " requests, received " UINT64_FORMAT " responses) (socket sndbuf=%d recvbuf=%d)", - INSTR_TIME_GET_DOUBLE(since_start), - shard->nrequests_sent, shard->nresponses_received, sndbuf, recvbuf); - last_log_ts = now; - logged = true; + neon_shard_log(shard_no, LOG, "no response from pageserver for %0.3f s, disconnecting", + INSTR_TIME_GET_DOUBLE(since_start)); + pageserver_disconnect(shard_no); + return -1; } goto retry; @@ -792,14 +851,18 @@ retry: * If we logged earlier that the response is taking a long time, log * another message when the response is finally received. 
*/ - if (logged) + if (shard->receive_logged) { INSTR_TIME_SET_CURRENT(now); since_start = now; - INSTR_TIME_SUBTRACT(since_start, start_ts); - neon_shard_log(shard_no, LOG, "received response from pageserver after %0.3f s", + INSTR_TIME_SUBTRACT(since_start, shard->receive_start_time); + neon_shard_log(shard_no, LOG, + "received response from pageserver after %0.3f s", INSTR_TIME_GET_DOUBLE(since_start)); } + INSTR_TIME_SET_ZERO(shard->receive_start_time); + INSTR_TIME_SET_ZERO(shard->receive_last_log_time); + shard->receive_logged = false; return ret; } @@ -973,9 +1036,17 @@ pageserver_receive(shardno_t shard_no) pfree(msg); } } + else if (rc == -1 && shard->state == PS_Disconnected) + { + /* If the state is 'Disconnected', the disconnection message was already logged */ + resp = NULL; + } else if (rc == -1) { - neon_shard_log(shard_no, LOG, "pageserver_receive disconnect: psql end of copy data: %s", pchomp(PQerrorMessage(pageserver_conn))); + char *msg = pchomp(PQerrorMessage(pageserver_conn)); + + neon_shard_log(shard_no, LOG, "pageserver_receive disconnect: psql end of copy data: %s", msg); + pfree(msg); pageserver_disconnect(shard_no); resp = NULL; } @@ -1261,6 +1332,26 @@ pg_init_libpagestore(void) 0, /* no flags required */ NULL, NULL, NULL); + DefineCustomIntVariable("neon.pageserver_response_log_timeout", + "pageserver response log timeout", + "If the pageserver doesn't respond to a request within this timeout," + "a message is printed to the log.", + &pageserver_response_log_timeout, + 10000, 100, INT_MAX, + PGC_SUSET, + GUC_UNIT_MS, + NULL, NULL, NULL); + + DefineCustomIntVariable("neon.pageserver_response_disconnect_timeout", + "pageserver response diconnect timeout", + "If the pageserver doesn't respond to a request within this timeout," + "disconnect and reconnect.", + &pageserver_response_disconnect_timeout, + 120000, 100, INT_MAX, + PGC_SUSET, + GUC_UNIT_MS, + NULL, NULL, NULL); + relsize_hash_init(); if (page_server != NULL) diff --git a/test_runner/regress/test_bad_connection.py b/test_runner/regress/test_bad_connection.py index c0c9537421..bfc5cb174e 100644 --- a/test_runner/regress/test_bad_connection.py +++ b/test_runner/regress/test_bad_connection.py @@ -7,6 +7,7 @@ import psycopg2.errors import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.utils import USE_LFC @pytest.mark.timeout(600) @@ -80,3 +81,193 @@ def test_compute_pageserver_connection_stress(neon_env_builder: NeonEnvBuilder): # do a graceful shutdown which would had caught the allowed_errors before # https://github.com/neondatabase/neon/pull/8632 env.pageserver.stop() + + +def test_compute_pageserver_hung_connections(neon_env_builder: NeonEnvBuilder): + """ + Test timeouts in waiting for response to pageserver request + """ + env = neon_env_builder.init_start() + env.pageserver.allowed_errors.append(".*slow GetPage.*") + pageserver_http = env.pageserver.http_client() + endpoint = env.endpoints.create_start( + "main", + tenant_id=env.initial_tenant, + config_lines=["autovacuum = off"], + ) + pg_conn = endpoint.connect() + cur = pg_conn.cursor() + + # Create table, and insert some rows. Make it big enough that it doesn't fit in + # shared_buffers, otherwise the SELECT after restart will just return answer + # from shared_buffers without hitting the page server, which defeats the point + # of this test. 
+ cur.execute("CREATE TABLE foo (t text)") + cur.execute( + """ + INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 100000) g + """ + ) + + # Verify that the table is larger than shared_buffers + cur.execute( + """ + select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size + from pg_settings where name = 'shared_buffers' + """ + ) + row = cur.fetchone() + assert row is not None + log.debug(f"shared_buffers is {row[0]}, table size {row[1]}") + assert int(row[0]) < int(row[1]) + + # Print the backend PID so that it can be compared with the logs easily + cur.execute("SELECT pg_backend_pid()") + row = cur.fetchone() + assert row is not None + log.info(f"running test workload in backend PID {row[0]}") + + def run_workload(duration: float): + end_time = time.time() + duration + times_executed = 0 + while time.time() < end_time: + if random.random() < 0.5: + cur.execute("INSERT INTO foo VALUES ('stas'), ('heikki')") + else: + cur.execute("SELECT t FROM foo ORDER BY RANDOM() LIMIT 10") + cur.fetchall() + times_executed += 1 + log.info(f"Workload executed {times_executed} times") + assert times_executed > 0 + + ## Test short connection hiccups + ## + ## This is to exercise the logging timeout. + log.info("running workload with log timeout") + cur.execute("SET neon.pageserver_response_log_timeout = '500ms'") + pageserver_http.configure_failpoints(("before-pagestream-msg-flush", "10%3*return(3000)")) + run_workload(20) + + # check that the message was logged + assert endpoint.log_contains("no response received from pageserver for .* s, still waiting") + assert endpoint.log_contains("received response from pageserver after .* s") + + ## Test connections that are hung for longer + ## + ## This exercises the disconnect timeout. We'll disconnect and + ## reconnect after 500 ms. + log.info("running workload with disconnect timeout") + cur.execute("SET neon.pageserver_response_log_timeout = '250ms'") + cur.execute("SET neon.pageserver_response_disconnect_timeout = '500ms'") + pageserver_http.configure_failpoints(("before-pagestream-msg-flush", "10%3*return(3000)")) + run_workload(15) + + assert endpoint.log_contains("no response from pageserver for .* s, disconnecting") + + # do a graceful shutdown which would had caught the allowed_errors before + # https://github.com/neondatabase/neon/pull/8632 + env.pageserver.stop() + + +def test_compute_pageserver_statement_timeout(neon_env_builder: NeonEnvBuilder): + """ + Test statement_timeout while waiting for response to pageserver request + """ + env = neon_env_builder.init_start() + env.pageserver.allowed_errors.append(".*slow GetPage.*") + pageserver_http = env.pageserver.http_client() + + # Make sure the shared_buffers and LFC are tiny, to ensure the queries + # hit the storage. Disable autovacuum to make the test more deterministic. + config_lines = [ + "shared_buffers='512kB'", + "autovacuum = off", + ] + if USE_LFC: + config_lines = ["neon.max_file_cache_size = 1MB", "neon.file_cache_size_limit = 1MB"] + endpoint = env.endpoints.create_start( + "main", + tenant_id=env.initial_tenant, + config_lines=config_lines, + ) + pg_conn = endpoint.connect() + cur = pg_conn.cursor() + + # Disable parallel query. Parallel workers open their own pageserver connections, + # which messes up the test logic. + cur.execute("SET max_parallel_workers_per_gather=0") + cur.execute("SET effective_io_concurrency=0") + + # Create table, and insert some rows. 
Make it big enough that it doesn't fit in + # shared_buffers, otherwise the SELECT after restart will just return answer + # from shared_buffers without hitting the page server, which defeats the point + # of this test. + cur.execute("CREATE TABLE foo (t text)") + cur.execute( + """ + INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 100000) g + """ + ) + + # Verify that the table is larger than shared_buffers + cur.execute( + """ + select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size + from pg_settings where name = 'shared_buffers' + """ + ) + row = cur.fetchone() + assert row is not None + log.debug(f"shared_buffers is {row[0]}, table size {row[1]}") + assert int(row[0]) < int(row[1]) + + ## Run a query until the compute->pageserver connection hits the failpoint and + ## get stuck. This tests that the statement_timeout is obeyed while waiting on a + ## GetPage request. + log.info("running workload with statement_timeout") + cur.execute("SET neon.pageserver_response_log_timeout = '2000ms'") + cur.execute("SET neon.pageserver_response_disconnect_timeout = '30000ms'") + cur.execute("SET statement_timeout='10s'") + pageserver_http.configure_failpoints(("before-pagestream-msg-flush", "10%return(60000)")) + + start_time = time.time() + with pytest.raises(psycopg2.errors.QueryCanceled): + cur.execute("SELECT count(*) FROM foo") + cur.fetchall() + log.info("Statement timeout reached") + end_time = time.time() + # Verify that the statement_timeout canceled the query before + # neon.pageserver_response_disconnect_timeout expired + assert end_time - start_time < 40 + times_canceled = 1 + + # Should not have disconnected yet + assert not endpoint.log_contains("no response from pageserver for .* s, disconnecting") + + # Clear the failpoint. This doesn't affect the connection that already hit it. It + # will keep waiting. But subsequent connections will work normally. + pageserver_http.configure_failpoints(("before-pagestream-msg-flush", "off")) + + # If we keep retrying, we should eventually succeed. (This tests that the + # neon.pageserver_response_disconnect_timeout is not reset on query + # cancellation.) + while times_canceled < 10: + try: + cur.execute("SELECT count(*) FROM foo") + cur.fetchall() + log.info("Statement succeeded") + break + except psycopg2.errors.QueryCanceled: + log.info("Statement timed out, retrying") + times_canceled += 1 + assert times_canceled > 1 and times_canceled < 10 + + assert endpoint.log_contains("no response from pageserver for .* s, disconnecting") + + # do a graceful shutdown which would had caught the allowed_errors before + # https://github.com/neondatabase/neon/pull/8632 + env.pageserver.stop() From 6621be6b7b5c860739d611096c248f6ec44d6fb4 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 24 Feb 2025 23:01:14 +0100 Subject: [PATCH 569/583] pageserver: tweak slow GetPage logging (#10956) ## Problem We recently added slow GetPage request logging. However, this unintentionally included the flush time when logging (which we already have separate logging for). It also logs at WARN level, which is a bit aggressive since we see this fire quite frequently. Follows https://github.com/neondatabase/neon/pull/10906. ## Summary of changes * Only log the request execution time, not the flush time. * Extract a `pagestream_dispatch_batched_message()` helper. * Rename `warn_slow()` to `log_slow()` and downgrade to INFO. 
--- libs/utils/benches/README.md | 4 +- libs/utils/benches/benchmarks.rs | 12 +- libs/utils/src/logging.rs | 12 +- pageserver/src/page_service.rs | 298 ++++++++++++++++--------------- 4 files changed, 170 insertions(+), 156 deletions(-) diff --git a/libs/utils/benches/README.md b/libs/utils/benches/README.md index e23ec268c2..5afbe3cf2b 100644 --- a/libs/utils/benches/README.md +++ b/libs/utils/benches/README.md @@ -10,14 +10,14 @@ cargo bench --package utils cargo bench --package utils --bench benchmarks # Specific benchmark. -cargo bench --package utils --bench benchmarks warn_slow/enabled=true +cargo bench --package utils --bench benchmarks log_slow/enabled=true # List available benchmarks. cargo bench --package utils --benches -- --list # Generate flamegraph profiles using pprof-rs, profiling for 10 seconds. # Output in target/criterion/*/profile/flamegraph.svg. -cargo bench --package utils --bench benchmarks warn_slow/enabled=true --profile-time 10 +cargo bench --package utils --bench benchmarks log_slow/enabled=true --profile-time 10 ``` Additional charts and statistics are available in `target/criterion/report/index.html`. diff --git a/libs/utils/benches/benchmarks.rs b/libs/utils/benches/benchmarks.rs index cff3792f3a..348e27ac47 100644 --- a/libs/utils/benches/benchmarks.rs +++ b/libs/utils/benches/benchmarks.rs @@ -3,14 +3,14 @@ use std::time::Duration; use criterion::{criterion_group, criterion_main, Bencher, Criterion}; use pprof::criterion::{Output, PProfProfiler}; use utils::id; -use utils::logging::warn_slow; +use utils::logging::log_slow; // Register benchmarks with Criterion. criterion_group!( name = benches; config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None))); targets = bench_id_stringify, - bench_warn_slow, + bench_log_slow, ); criterion_main!(benches); @@ -29,9 +29,9 @@ pub fn bench_id_stringify(c: &mut Criterion) { }); } -pub fn bench_warn_slow(c: &mut Criterion) { +pub fn bench_log_slow(c: &mut Criterion) { for enabled in [false, true] { - c.bench_function(&format!("warn_slow/enabled={enabled}"), |b| { + c.bench_function(&format!("log_slow/enabled={enabled}"), |b| { run_bench(b, enabled).unwrap() }); } @@ -45,11 +45,11 @@ pub fn bench_warn_slow(c: &mut Criterion) { .enable_all() .build()?; - // Test both with and without warn_slow, since we're essentially measuring Tokio scheduling + // Test both with and without log_slow, since we're essentially measuring Tokio scheduling // performance too. Use a simple noop future that yields once, to avoid any scheduler fast // paths for a ready future. if enabled { - b.iter(|| runtime.block_on(warn_slow("ready", THRESHOLD, tokio::task::yield_now()))); + b.iter(|| runtime.block_on(log_slow("ready", THRESHOLD, tokio::task::yield_now()))); } else { b.iter(|| runtime.block_on(tokio::task::yield_now())); } diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs index 95c69ac8ba..2c36942f43 100644 --- a/libs/utils/src/logging.rs +++ b/libs/utils/src/logging.rs @@ -7,7 +7,7 @@ use metrics::{IntCounter, IntCounterVec}; use once_cell::sync::Lazy; use strum_macros::{EnumString, VariantNames}; use tokio::time::Instant; -use tracing::warn; +use tracing::info; /// Logs a critical error, similarly to `tracing::error!`. This will: /// @@ -322,11 +322,13 @@ impl std::fmt::Debug for SecretString { } } -/// Logs a periodic warning if a future is slow to complete. +/// Logs a periodic message if a future is slow to complete. 
/// /// This is performance-sensitive as it's used on the GetPage read path. +/// +/// TODO: consider upgrading this to a warning, but currently it fires too often. #[inline] -pub async fn warn_slow(name: &str, threshold: Duration, f: impl Future) -> O { +pub async fn log_slow(name: &str, threshold: Duration, f: impl Future) -> O { // TODO: we unfortunately have to pin the future on the heap, since GetPage futures are huge and // won't fit on the stack. let mut f = Box::pin(f); @@ -345,13 +347,13 @@ pub async fn warn_slow(name: &str, threshold: Duration, f: impl Future= threshold { - warn!("slow {name} completed after {:.3}s", elapsed.as_secs_f64()); + info!("slow {name} completed after {:.3}s", elapsed.as_secs_f64()); } return output; } let elapsed = started.elapsed().as_secs_f64(); - warn!("slow {name} still running after {elapsed:.3}s",); + info!("slow {name} still running after {elapsed:.3}s",); attempt += 1; } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index cab3d76bf8..668f0eee36 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -40,7 +40,7 @@ use tokio::io::{AsyncWriteExt, BufWriter}; use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use tracing::*; -use utils::logging::warn_slow; +use utils::logging::log_slow; use utils::sync::gate::{Gate, GateGuard}; use utils::sync::spsc_fold; use utils::{ @@ -84,8 +84,8 @@ use std::os::fd::AsRawFd; /// NB: this is a different value than [`crate::http::routes::ACTIVE_TENANT_TIMEOUT`]. const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000); -/// Threshold at which to log a warning about slow GetPage requests. -const WARN_SLOW_GETPAGE_THRESHOLD: Duration = Duration::from_secs(30); +/// Threshold at which to log slow GetPage requests. +const LOG_SLOW_GETPAGE_THRESHOLD: Duration = Duration::from_secs(30); /////////////////////////////////////////////////////////////////////////////// @@ -1087,11 +1087,147 @@ impl PageServerHandler { batch }; - // invoke handler function - let (mut handler_results, span): ( + // Dispatch the batch to the appropriate request handler. + let (mut handler_results, span) = log_slow( + batch.as_static_str(), + LOG_SLOW_GETPAGE_THRESHOLD, + self.pagestream_dispatch_batched_message(batch, io_concurrency, ctx), + ) + .await?; + + // We purposefully don't count flush time into the smgr operation timer. + // + // The reason is that current compute client will not perform protocol processing + // if the postgres backend process is doing things other than `->smgr_read()`. + // This is especially the case for prefetch. + // + // If the compute doesn't read from the connection, eventually TCP will backpressure + // all the way into our flush call below. + // + // The timer's underlying metric is used for a storage-internal latency SLO and + // we don't want to include latency in it that we can't control. + // And as pointed out above, in this case, we don't control the time that flush will take. + // + // We put each response in the batch onto the wire in a separate pgb_writer.flush() + // call, which (all unmeasured) adds syscall overhead but reduces time to first byte + // and avoids building up a "giant" contiguous userspace buffer to hold the entire response. + // TODO: vectored socket IO would be great, but pgb_writer doesn't support that. 
+ let flush_timers = { + let flushing_start_time = Instant::now(); + let mut flush_timers = Vec::with_capacity(handler_results.len()); + for handler_result in &mut handler_results { + let flush_timer = match handler_result { + Ok((_, timer)) => Some( + timer + .observe_execution_end(flushing_start_time) + .expect("we are the first caller"), + ), + Err(_) => { + // TODO: measure errors + None + } + }; + flush_timers.push(flush_timer); + } + assert_eq!(flush_timers.len(), handler_results.len()); + flush_timers + }; + + // Map handler result to protocol behavior. + // Some handler errors cause exit from pagestream protocol. + // Other handler errors are sent back as an error message and we stay in pagestream protocol. + for (handler_result, flushing_timer) in handler_results.into_iter().zip(flush_timers) { + let response_msg = match handler_result { + Err(e) => match &e.err { + PageStreamError::Shutdown => { + // If we fail to fulfil a request during shutdown, which may be _because_ of + // shutdown, then do not send the error to the client. Instead just drop the + // connection. + span.in_scope(|| info!("dropping connection due to shutdown")); + return Err(QueryError::Shutdown); + } + PageStreamError::Reconnect(reason) => { + span.in_scope(|| info!("handler requested reconnect: {reason}")); + return Err(QueryError::Reconnect); + } + PageStreamError::Read(_) + | PageStreamError::LsnTimeout(_) + | PageStreamError::NotFound(_) + | PageStreamError::BadRequest(_) => { + // print the all details to the log with {:#}, but for the client the + // error message is enough. Do not log if shutting down, as the anyhow::Error + // here includes cancellation which is not an error. + let full = utils::error::report_compact_sources(&e.err); + span.in_scope(|| { + error!("error reading relation or page version: {full:#}") + }); + + PagestreamBeMessage::Error(PagestreamErrorResponse { + req: e.req, + message: e.err.to_string(), + }) + } + }, + Ok((response_msg, _op_timer_already_observed)) => response_msg, + }; + + // + // marshal & transmit response message + // + + pgb_writer.write_message_noflush(&BeMessage::CopyData( + &response_msg.serialize(protocol_version), + ))?; + + failpoint_support::sleep_millis_async!("before-pagestream-msg-flush", cancel); + + // what we want to do + let socket_fd = pgb_writer.socket_fd; + let flush_fut = pgb_writer.flush(); + // metric for how long flushing takes + let flush_fut = match flushing_timer { + Some(flushing_timer) => futures::future::Either::Left(flushing_timer.measure( + Instant::now(), + flush_fut, + socket_fd, + )), + None => futures::future::Either::Right(flush_fut), + }; + // do it while respecting cancellation + let _: () = async move { + tokio::select! { + biased; + _ = cancel.cancelled() => { + // We were requested to shut down. + info!("shutdown request received in page handler"); + return Err(QueryError::Shutdown) + } + res = flush_fut => { + res?; + } + } + Ok(()) + } + .await?; + } + Ok(()) + } + + /// Helper which dispatches a batched message to the appropriate handler. + /// Returns a vec of results, along with the extracted trace span. + async fn pagestream_dispatch_batched_message( + &mut self, + batch: BatchedFeMessage, + io_concurrency: IoConcurrency, + ctx: &RequestContext, + ) -> Result< + ( Vec>, - _, - ) = match batch { + Span, + ), + QueryError, + > { + Ok(match batch { BatchedFeMessage::Exists { span, timer, @@ -1213,124 +1349,7 @@ impl PageServerHandler { // call the handler. 
(vec![Err(error)], span) } - }; - - // We purposefully don't count flush time into the smgr operation timer. - // - // The reason is that current compute client will not perform protocol processing - // if the postgres backend process is doing things other than `->smgr_read()`. - // This is especially the case for prefetch. - // - // If the compute doesn't read from the connection, eventually TCP will backpressure - // all the way into our flush call below. - // - // The timer's underlying metric is used for a storage-internal latency SLO and - // we don't want to include latency in it that we can't control. - // And as pointed out above, in this case, we don't control the time that flush will take. - // - // We put each response in the batch onto the wire in a separate pgb_writer.flush() - // call, which (all unmeasured) adds syscall overhead but reduces time to first byte - // and avoids building up a "giant" contiguous userspace buffer to hold the entire response. - // TODO: vectored socket IO would be great, but pgb_writer doesn't support that. - let flush_timers = { - let flushing_start_time = Instant::now(); - let mut flush_timers = Vec::with_capacity(handler_results.len()); - for handler_result in &mut handler_results { - let flush_timer = match handler_result { - Ok((_, timer)) => Some( - timer - .observe_execution_end(flushing_start_time) - .expect("we are the first caller"), - ), - Err(_) => { - // TODO: measure errors - None - } - }; - flush_timers.push(flush_timer); - } - assert_eq!(flush_timers.len(), handler_results.len()); - flush_timers - }; - - // Map handler result to protocol behavior. - // Some handler errors cause exit from pagestream protocol. - // Other handler errors are sent back as an error message and we stay in pagestream protocol. - for (handler_result, flushing_timer) in handler_results.into_iter().zip(flush_timers) { - let response_msg = match handler_result { - Err(e) => match &e.err { - PageStreamError::Shutdown => { - // If we fail to fulfil a request during shutdown, which may be _because_ of - // shutdown, then do not send the error to the client. Instead just drop the - // connection. - span.in_scope(|| info!("dropping connection due to shutdown")); - return Err(QueryError::Shutdown); - } - PageStreamError::Reconnect(reason) => { - span.in_scope(|| info!("handler requested reconnect: {reason}")); - return Err(QueryError::Reconnect); - } - PageStreamError::Read(_) - | PageStreamError::LsnTimeout(_) - | PageStreamError::NotFound(_) - | PageStreamError::BadRequest(_) => { - // print the all details to the log with {:#}, but for the client the - // error message is enough. Do not log if shutting down, as the anyhow::Error - // here includes cancellation which is not an error. 
- let full = utils::error::report_compact_sources(&e.err); - span.in_scope(|| { - error!("error reading relation or page version: {full:#}") - }); - - PagestreamBeMessage::Error(PagestreamErrorResponse { - req: e.req, - message: e.err.to_string(), - }) - } - }, - Ok((response_msg, _op_timer_already_observed)) => response_msg, - }; - - // - // marshal & transmit response message - // - - pgb_writer.write_message_noflush(&BeMessage::CopyData( - &response_msg.serialize(protocol_version), - ))?; - - failpoint_support::sleep_millis_async!("before-pagestream-msg-flush", cancel); - - // what we want to do - let socket_fd = pgb_writer.socket_fd; - let flush_fut = pgb_writer.flush(); - // metric for how long flushing takes - let flush_fut = match flushing_timer { - Some(flushing_timer) => futures::future::Either::Left(flushing_timer.measure( - Instant::now(), - flush_fut, - socket_fd, - )), - None => futures::future::Either::Right(flush_fut), - }; - // do it while respecting cancellation - let _: () = async move { - tokio::select! { - biased; - _ = cancel.cancelled() => { - // We were requested to shut down. - info!("shutdown request received in page handler"); - return Err(QueryError::Shutdown) - } - res = flush_fut => { - res?; - } - } - Ok(()) - } - .await?; - } - Ok(()) + }) } /// Pagestream sub-protocol handler. @@ -1476,19 +1495,16 @@ impl PageServerHandler { } }; - let result = warn_slow( - msg.as_static_str(), - WARN_SLOW_GETPAGE_THRESHOLD, - self.pagesteam_handle_batched_message( + let result = self + .pagesteam_handle_batched_message( pgb_writer, msg, io_concurrency.clone(), &cancel, protocol_version, ctx, - ), - ) - .await; + ) + .await; match result { Ok(()) => {} Err(e) => break e, @@ -1652,17 +1668,13 @@ impl PageServerHandler { return Err(e); } }; - warn_slow( - batch.as_static_str(), - WARN_SLOW_GETPAGE_THRESHOLD, - self.pagesteam_handle_batched_message( - pgb_writer, - batch, - io_concurrency.clone(), - &cancel, - protocol_version, - &ctx, - ), + self.pagesteam_handle_batched_message( + pgb_writer, + batch, + io_concurrency.clone(), + &cancel, + protocol_version, + &ctx, ) .await?; } From 5d17640944b529834158c689cf0e821871dbf344 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 25 Feb 2025 09:33:08 +0000 Subject: [PATCH 570/583] storcon: send heartbeats concurrently (#10954) ## Problem While looking at logs I noticed that heartbeats are sent sequentially. The loop polling the UnorderedSet is at the wrong level of identation. Instead of doing it after we have the full set, we did after each entry. ## Summary of Changes Poll the UnorderedSet properly. --- storage_controller/src/heartbeater.rs | 52 +++++++++++++-------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/storage_controller/src/heartbeater.rs b/storage_controller/src/heartbeater.rs index 52b6110667..88ee7887d3 100644 --- a/storage_controller/src/heartbeater.rs +++ b/storage_controller/src/heartbeater.rs @@ -223,21 +223,21 @@ impl HeartBeat for HeartbeaterTask Some((*node_id, status)) } }); + } - loop { - let maybe_status = tokio::select! { - next = heartbeat_futs.next() => { - match next { - Some(result) => result, - None => { break; } - } - }, - _ = self.cancel.cancelled() => { return Err(HeartbeaterError::Cancel); } - }; + loop { + let maybe_status = tokio::select! 
{ + next = heartbeat_futs.next() => { + match next { + Some(result) => result, + None => { break; } + } + }, + _ = self.cancel.cancelled() => { return Err(HeartbeaterError::Cancel); } + }; - if let Some((node_id, status)) = maybe_status { - new_state.insert(node_id, status); - } + if let Some((node_id, status)) = maybe_status { + new_state.insert(node_id, status); } } @@ -363,21 +363,21 @@ impl HeartBeat for HeartbeaterTask { - match next { - Some(result) => result, - None => { break; } - } - }, - _ = self.cancel.cancelled() => { return Err(HeartbeaterError::Cancel); } - }; + loop { + let maybe_status = tokio::select! { + next = heartbeat_futs.next() => { + match next { + Some(result) => result, + None => { break; } + } + }, + _ = self.cancel.cancelled() => { return Err(HeartbeaterError::Cancel); } + }; - if let Some((node_id, status)) = maybe_status { - new_state.insert(node_id, status); - } + if let Some((node_id, status)) = maybe_status { + new_state.insert(node_id, status); } } From 0d9a45a4753b124e47aeabf24986b7b9e1e2f4b7 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 25 Feb 2025 10:21:35 +0000 Subject: [PATCH 571/583] safekeeper: invalidate start of interpreted batch on reader resets (#10951) ## Problem The interpreted WAL reader tracks the start of the current logical batch. This needs to be invalidated when the reader is reset. This bug caused a couple of WAL gap alerts in staging. ## Summary of changes * Refactor to make it possible to write a reproducer * Add repro unit test * Fix by resetting the start with the reader Related https://github.com/neondatabase/cloud/issues/23935 --- safekeeper/src/send_interpreted_wal.rs | 247 ++++++++++++++++++++++--- safekeeper/src/send_wal.rs | 5 +- 2 files changed, 221 insertions(+), 31 deletions(-) diff --git a/safekeeper/src/send_interpreted_wal.rs b/safekeeper/src/send_interpreted_wal.rs index fb06339604..0662bb9518 100644 --- a/safekeeper/src/send_interpreted_wal.rs +++ b/safekeeper/src/send_interpreted_wal.rs @@ -100,7 +100,12 @@ struct ShardSenderState { /// State of [`InterpretedWalReader`] visible outside of the task running it. #[derive(Debug)] pub(crate) enum InterpretedWalReaderState { - Running { current_position: Lsn }, + Running { + current_position: Lsn, + /// Tracks the start of the PG WAL LSN from which the current batch of + /// interpreted records originated. + current_batch_wal_start: Option, + }, Done, } @@ -122,14 +127,21 @@ pub enum InterpretedWalReaderError { } enum CurrentPositionUpdate { - Reset(Lsn), + Reset { from: Lsn, to: Lsn }, NotReset(Lsn), } impl CurrentPositionUpdate { fn current_position(&self) -> Lsn { match self { - CurrentPositionUpdate::Reset(lsn) => *lsn, + CurrentPositionUpdate::Reset { from: _, to } => *to, + CurrentPositionUpdate::NotReset(lsn) => *lsn, + } + } + + fn previous_position(&self) -> Lsn { + match self { + CurrentPositionUpdate::Reset { from, to: _ } => *from, CurrentPositionUpdate::NotReset(lsn) => *lsn, } } @@ -145,16 +157,33 @@ impl InterpretedWalReaderState { } } + #[cfg(test)] + fn current_batch_wal_start(&self) -> Option { + match self { + InterpretedWalReaderState::Running { + current_batch_wal_start, + .. + } => *current_batch_wal_start, + InterpretedWalReaderState::Done => None, + } + } + // Reset the current position of the WAL reader if the requested starting position // of the new shard is smaller than the current value. fn maybe_reset(&mut self, new_shard_start_pos: Lsn) -> CurrentPositionUpdate { match self { InterpretedWalReaderState::Running { - current_position, .. 
+ current_position, + current_batch_wal_start, } => { if new_shard_start_pos < *current_position { + let from = *current_position; *current_position = new_shard_start_pos; - CurrentPositionUpdate::Reset(*current_position) + *current_batch_wal_start = None; + CurrentPositionUpdate::Reset { + from, + to: *current_position, + } } else { CurrentPositionUpdate::NotReset(*current_position) } @@ -164,6 +193,47 @@ impl InterpretedWalReaderState { } } } + + fn update_current_batch_wal_start(&mut self, lsn: Lsn) { + match self { + InterpretedWalReaderState::Running { + current_batch_wal_start, + .. + } => { + if current_batch_wal_start.is_none() { + *current_batch_wal_start = Some(lsn); + } + } + InterpretedWalReaderState::Done => { + panic!("update_current_batch_wal_start called on finished reader") + } + } + } + + fn take_current_batch_wal_start(&mut self) -> Lsn { + match self { + InterpretedWalReaderState::Running { + current_batch_wal_start, + .. + } => current_batch_wal_start.take().unwrap(), + InterpretedWalReaderState::Done => { + panic!("take_current_batch_wal_start called on finished reader") + } + } + } + + fn update_current_position(&mut self, lsn: Lsn) { + match self { + InterpretedWalReaderState::Running { + current_position, .. + } => { + *current_position = lsn; + } + InterpretedWalReaderState::Done => { + panic!("update_current_position called on finished reader") + } + } + } } pub(crate) struct AttachShardNotification { @@ -184,6 +254,7 @@ impl InterpretedWalReader { ) -> InterpretedWalReaderHandle { let state = Arc::new(std::sync::RwLock::new(InterpretedWalReaderState::Running { current_position: start_pos, + current_batch_wal_start: None, })); let (shard_notification_tx, shard_notification_rx) = tokio::sync::mpsc::unbounded_channel(); @@ -237,9 +308,13 @@ impl InterpretedWalReader { tx: tokio::sync::mpsc::Sender, shard: ShardIdentity, pg_version: u32, + shard_notification_rx: Option< + tokio::sync::mpsc::UnboundedReceiver, + >, ) -> InterpretedWalReader { let state = Arc::new(std::sync::RwLock::new(InterpretedWalReaderState::Running { current_position: start_pos, + current_batch_wal_start: None, })); InterpretedWalReader { @@ -252,7 +327,7 @@ impl InterpretedWalReader { next_record_lsn: start_pos, }], )]), - shard_notification_rx: None, + shard_notification_rx, state: state.clone(), pg_version, } @@ -295,10 +370,6 @@ impl InterpretedWalReader { let mut wal_decoder = WalStreamDecoder::new(start_pos, self.pg_version); - // Tracks the start of the PG WAL LSN from which the current batch of - // interpreted records originated. - let mut current_batch_wal_start_lsn: Option = None; - loop { tokio::select! { // Main branch for reading WAL and forwarding it @@ -319,11 +390,7 @@ impl InterpretedWalReader { } }; - // We will already have a value if the previous chunks of WAL - // did not decode into anything useful. - if current_batch_wal_start_lsn.is_none() { - current_batch_wal_start_lsn = Some(wal_start_lsn); - } + self.state.write().unwrap().update_current_batch_wal_start(wal_start_lsn); wal_decoder.feed_bytes(&wal); @@ -380,16 +447,11 @@ impl InterpretedWalReader { // Update the current position such that new receivers can decide // whether to attach to us or spawn a new WAL reader. - match &mut *self.state.write().unwrap() { - InterpretedWalReaderState::Running { current_position, .. 
} => { - *current_position = max_next_record_lsn; - }, - InterpretedWalReaderState::Done => { - unreachable!() - } - } - - let batch_wal_start_lsn = current_batch_wal_start_lsn.take().unwrap(); + let batch_wal_start_lsn = { + let mut guard = self.state.write().unwrap(); + guard.update_current_position(max_next_record_lsn); + guard.take_current_batch_wal_start() + }; // Send interpreted records downstream. Anything that has already been seen // by a shard is filtered out. @@ -480,7 +542,7 @@ impl InterpretedWalReader { // anything outside the select statement. let position_reset = self.state.write().unwrap().maybe_reset(start_pos); match position_reset { - CurrentPositionUpdate::Reset(to) => { + CurrentPositionUpdate::Reset { from: _, to } => { self.wal_stream.reset(to).await; wal_decoder = WalStreamDecoder::new(to, self.pg_version); }, @@ -488,14 +550,22 @@ impl InterpretedWalReader { }; tracing::info!( - "Added shard sender {} with start_pos={} current_pos={}", - ShardSenderId::new(shard_id, new_sender_id), start_pos, position_reset.current_position() + "Added shard sender {} with start_pos={} previous_pos={} current_pos={}", + ShardSenderId::new(shard_id, new_sender_id), + start_pos, + position_reset.previous_position(), + position_reset.current_position(), ); } } } } } + + #[cfg(test)] + fn state(&self) -> Arc> { + self.state.clone() + } } impl InterpretedWalReaderHandle { @@ -633,7 +703,7 @@ mod tests { }; use crate::{ - send_interpreted_wal::{Batch, InterpretedWalReader}, + send_interpreted_wal::{AttachShardNotification, Batch, InterpretedWalReader}, test_utils::Env, wal_reader_stream::StreamingWalReader, }; @@ -913,4 +983,123 @@ mod tests { assert_eq!(sender.received_next_record_lsns, expected); } } + + #[tokio::test] + async fn test_batch_start_tracking_on_reset() { + // When the WAL stream is reset to an older LSN, + // the current batch start LSN should be invalidated. + // This test constructs such a scenario: + // 1. Shard 0 is reading somewhere ahead + // 2. Reader reads some WAL, but does not decode a full record (partial read) + // 3. Shard 1 attaches to the reader and resets it to an older LSN + // 4. 
Shard 1 should get the correct batch WAL start LSN + let _ = env_logger::builder().is_test(true).try_init(); + + const SIZE: usize = 64 * 1024; + const MSG_COUNT: usize = 10; + const PG_VERSION: u32 = 17; + const SHARD_COUNT: u8 = 2; + const WAL_READER_BATCH_SIZE: usize = 8192; + + let start_lsn = Lsn::from_str("0/149FD18").unwrap(); + let shard_0_start_lsn = Lsn::from_str("0/14AFE10").unwrap(); + let env = Env::new(true).unwrap(); + let tli = env + .make_timeline(NodeId(1), TenantTimelineId::generate(), start_lsn) + .await + .unwrap(); + + let resident_tli = tli.wal_residence_guard().await.unwrap(); + let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT, None) + .await + .unwrap(); + let end_pos = end_watch.get(); + + let streaming_wal_reader = StreamingWalReader::new( + resident_tli, + None, + shard_0_start_lsn, + end_pos, + end_watch, + WAL_READER_BATCH_SIZE, + ); + + let shard_0 = ShardIdentity::new( + ShardNumber(0), + ShardCount(SHARD_COUNT), + ShardStripeSize::default(), + ) + .unwrap(); + + let shard_1 = ShardIdentity::new( + ShardNumber(1), + ShardCount(SHARD_COUNT), + ShardStripeSize::default(), + ) + .unwrap(); + + let mut shards = HashMap::new(); + + for shard_number in 0..SHARD_COUNT { + let shard_id = ShardIdentity::new( + ShardNumber(shard_number), + ShardCount(SHARD_COUNT), + ShardStripeSize::default(), + ) + .unwrap(); + let (tx, rx) = tokio::sync::mpsc::channel::(MSG_COUNT * 2); + shards.insert(shard_id, (Some(tx), Some(rx))); + } + + let shard_0_tx = shards.get_mut(&shard_0).unwrap().0.take().unwrap(); + + let (shard_notification_tx, shard_notification_rx) = tokio::sync::mpsc::unbounded_channel(); + + let reader = InterpretedWalReader::new( + streaming_wal_reader, + shard_0_start_lsn, + shard_0_tx, + shard_0, + PG_VERSION, + Some(shard_notification_rx), + ); + + let reader_state = reader.state(); + let mut reader_fut = std::pin::pin!(reader.run(start_lsn, &None)); + loop { + let poll = futures::poll!(reader_fut.as_mut()); + assert!(poll.is_pending()); + + let guard = reader_state.read().unwrap(); + if guard.current_batch_wal_start().is_some() { + break; + } + } + + shard_notification_tx + .send(AttachShardNotification { + shard_id: shard_1, + sender: shards.get_mut(&shard_1).unwrap().0.take().unwrap(), + start_pos: start_lsn, + }) + .unwrap(); + + let mut shard_1_rx = shards.get_mut(&shard_1).unwrap().1.take().unwrap(); + loop { + let poll = futures::poll!(reader_fut.as_mut()); + assert!(poll.is_pending()); + + let try_recv_res = shard_1_rx.try_recv(); + match try_recv_res { + Ok(batch) => { + assert_eq!(batch.records.raw_wal_start_lsn.unwrap(), start_lsn); + break; + } + Err(tokio::sync::mpsc::error::TryRecvError::Empty) => {} + Err(tokio::sync::mpsc::error::TryRecvError::Disconnected) => { + unreachable!(); + } + } + } + } } diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 4a4a74a0fd..72b1fd9fc3 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -624,8 +624,9 @@ impl SafekeeperPostgresHandler { MAX_SEND_SIZE, ); - let reader = - InterpretedWalReader::new(wal_reader, start_pos, tx, shard, pg_version); + let reader = InterpretedWalReader::new( + wal_reader, start_pos, tx, shard, pg_version, None, + ); let sender = InterpretedWalSender { format, From 758f5972807083987ca0c7977a1ef237bf642d73 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 25 Feb 2025 14:56:05 +0300 Subject: [PATCH 572/583] compute <-> sk protocol v3 (#10647) ## Problem As part of https://github.com/neondatabase/neon/issues/8614 we need to 
pass membership configurations between compute and safekeepers.

## Summary of changes

Add version 3 of the protocol carrying membership configurations.
The greeting message on both sides carries the full conf; the other
messages carry only the generation number.

Use the protocol bump to include other accumulated changes:
- stop packing whole structs on the wire as is;
- make the tag u8 instead of u64;
- send all ints in network order;
- drop proposer_uuid, we can pass it in START_WAL_PUSH and it wasn't
very useful anyway.

Per-message changes, apart from mconf:
- ProposerGreeting: tenant / timeline id is now sent as a hex cstring.
Remove proto version, it is passed outside in START_WAL_PUSH. Remove
postgres timeline, it is unused. Reorder fields a bit.
- AcceptorGreeting: reorder fields.
- VoteResponse: timeline_start_lsn is removed. It can be taken from the
first member of the term history, and later we won't need it at all once
all timelines are explicitly created. The vote itself is u8 instead of u64.
- ProposerElected: timeline_start_lsn is removed for the same reasons.
- AppendRequest: epoch_start_lsn removed, it is known from the term
history in ProposerElected.

Both compute and sk are able to talk v2 and v3 to make rollbacks (in case
we need them) easier; the neon.safekeeper_proto_version GUC sets the
client version. v2 code can be dropped later.

So far an empty conf is passed everywhere; future PRs will handle it.

To test, add a param to some tests choosing the proto version; we want to
test both 2 and 3 until we fully migrate.

ref https://github.com/neondatabase/neon/issues/10326

---------

Co-authored-by: Arthur Petukhovsky
---
 libs/safekeeper_api/src/membership.rs | 18 +-
 libs/walproposer/src/walproposer.rs | 68 +-
 pgxn/neon/neon_utils.c | 20 +
 pgxn/neon/neon_utils.h | 2 +
 pgxn/neon/walproposer.c | 671 ++++++++++++-----
 pgxn/neon/walproposer.h | 133 +++-
 pgxn/neon/walproposer_compat.c | 46 +-
 pgxn/neon/walproposer_pg.c | 56 +-
 safekeeper/benches/receive_wal.rs | 10 +-
 safekeeper/src/json_ctrl.rs | 7 +-
 safekeeper/src/receive_wal.rs | 9 +-
 safekeeper/src/recovery.rs | 11 +-
 safekeeper/src/safekeeper.rs | 708 +++++++++++++-----
 safekeeper/src/test_utils.rs | 6 +-
 .../tests/walproposer_sim/safekeeper.rs | 8 +-
 test_runner/regress/test_normal_work.py | 20 +-
 .../regress/test_wal_acceptor_async.py | 14 +-
 17 files changed, 1355 insertions(+), 452 deletions(-)

diff --git a/libs/safekeeper_api/src/membership.rs b/libs/safekeeper_api/src/membership.rs
index 8b14a4f290..2f20ec5f94 100644
--- a/libs/safekeeper_api/src/membership.rs
+++ b/libs/safekeeper_api/src/membership.rs
@@ -68,14 +68,12 @@ impl Display for SafekeeperId {
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[serde(transparent)]
pub struct MemberSet {
- pub members: Vec,
+ pub m: Vec,
}
impl MemberSet {
pub fn empty() -> Self {
- MemberSet {
- members: Vec::new(),
- }
+ MemberSet { m: Vec::new() }
}
pub fn new(members: Vec) -> anyhow::Result {
@@ -83,11 +81,11 @@ impl MemberSet {
if hs.len() != members.len() {
bail!("duplicate safekeeper id in the set {:?}", members);
}
- Ok(MemberSet { members })
+ Ok(MemberSet { m: members })
}
pub fn contains(&self, sk: &SafekeeperId) -> bool {
- self.members.iter().any(|m| m.id == sk.id)
+ self.m.iter().any(|m| m.id == sk.id)
}
pub fn add(&mut self, sk: SafekeeperId) -> anyhow::Result<()> {
@@ -97,7 +95,7 @@ impl MemberSet {
sk.id, self
));
}
- self.members.push(sk);
+ self.m.push(sk);
Ok(())
}
}
@@ -105,11 +103,7 @@ impl MemberSet {
impl Display for MemberSet {
/// Display as a comma separated list of members.
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let sks_str = self - .members - .iter() - .map(|m| m.to_string()) - .collect::>(); + let sks_str = self.m.iter().map(|sk| sk.to_string()).collect::>(); write!(f, "({})", sks_str.join(", ")) } } diff --git a/libs/walproposer/src/walproposer.rs b/libs/walproposer/src/walproposer.rs index ba75171db2..60b606c64a 100644 --- a/libs/walproposer/src/walproposer.rs +++ b/libs/walproposer/src/walproposer.rs @@ -215,6 +215,7 @@ impl Wrapper { syncSafekeepers: config.sync_safekeepers, systemId: 0, pgTimeline: 1, + proto_version: 3, callback_data, }; let c_config = Box::into_raw(Box::new(c_config)); @@ -276,6 +277,7 @@ mod tests { use core::panic; use std::{ cell::Cell, + ffi::CString, sync::{atomic::AtomicUsize, mpsc::sync_channel}, }; @@ -496,57 +498,64 @@ mod tests { // Messages definitions are at walproposer.h // xxx: it would be better to extract them from safekeeper crate and // use serialization/deserialization here. - let greeting_tag = (b'g' as u64).to_ne_bytes(); - let proto_version = 2_u32.to_ne_bytes(); - let pg_version: [u8; 4] = PG_VERSION_NUM.to_ne_bytes(); - let proposer_id = [0; 16]; - let system_id = 0_u64.to_ne_bytes(); - let tenant_id = ttid.tenant_id.as_arr(); - let timeline_id = ttid.timeline_id.as_arr(); - let pg_tli = 1_u32.to_ne_bytes(); - let wal_seg_size = 16777216_u32.to_ne_bytes(); + let greeting_tag = (b'g').to_be_bytes(); + let tenant_id = CString::new(ttid.tenant_id.to_string()) + .unwrap() + .into_bytes_with_nul(); + let timeline_id = CString::new(ttid.timeline_id.to_string()) + .unwrap() + .into_bytes_with_nul(); + let mconf_gen = 0_u32.to_be_bytes(); + let mconf_members_len = 0_u32.to_be_bytes(); + let mconf_members_new_len = 0_u32.to_be_bytes(); + let pg_version: [u8; 4] = PG_VERSION_NUM.to_be_bytes(); + let system_id = 0_u64.to_be_bytes(); + let wal_seg_size = 16777216_u32.to_be_bytes(); + let proposer_greeting = [ greeting_tag.as_slice(), - proto_version.as_slice(), - pg_version.as_slice(), - proposer_id.as_slice(), - system_id.as_slice(), tenant_id.as_slice(), timeline_id.as_slice(), - pg_tli.as_slice(), + mconf_gen.as_slice(), + mconf_members_len.as_slice(), + mconf_members_new_len.as_slice(), + pg_version.as_slice(), + system_id.as_slice(), wal_seg_size.as_slice(), ] .concat(); - let voting_tag = (b'v' as u64).to_ne_bytes(); - let vote_request_term = 3_u64.to_ne_bytes(); - let proposer_id = [0; 16]; + let voting_tag = (b'v').to_be_bytes(); + let vote_request_term = 3_u64.to_be_bytes(); let vote_request = [ voting_tag.as_slice(), + mconf_gen.as_slice(), vote_request_term.as_slice(), - proposer_id.as_slice(), ] .concat(); - let acceptor_greeting_term = 2_u64.to_ne_bytes(); - let acceptor_greeting_node_id = 1_u64.to_ne_bytes(); + let acceptor_greeting_term = 2_u64.to_be_bytes(); + let acceptor_greeting_node_id = 1_u64.to_be_bytes(); let acceptor_greeting = [ greeting_tag.as_slice(), - acceptor_greeting_term.as_slice(), acceptor_greeting_node_id.as_slice(), + mconf_gen.as_slice(), + mconf_members_len.as_slice(), + mconf_members_new_len.as_slice(), + acceptor_greeting_term.as_slice(), ] .concat(); - let vote_response_term = 3_u64.to_ne_bytes(); - let vote_given = 1_u64.to_ne_bytes(); - let flush_lsn = 0x539_u64.to_ne_bytes(); - let truncate_lsn = 0x539_u64.to_ne_bytes(); - let th_len = 1_u32.to_ne_bytes(); - let th_term = 2_u64.to_ne_bytes(); - let th_lsn = 0x539_u64.to_ne_bytes(); - let timeline_start_lsn = 0x539_u64.to_ne_bytes(); + let vote_response_term = 3_u64.to_be_bytes(); + let vote_given 
= 1_u8.to_be_bytes(); + let flush_lsn = 0x539_u64.to_be_bytes(); + let truncate_lsn = 0x539_u64.to_be_bytes(); + let th_len = 1_u32.to_be_bytes(); + let th_term = 2_u64.to_be_bytes(); + let th_lsn = 0x539_u64.to_be_bytes(); let vote_response = [ voting_tag.as_slice(), + mconf_gen.as_slice(), vote_response_term.as_slice(), vote_given.as_slice(), flush_lsn.as_slice(), @@ -554,7 +563,6 @@ mod tests { th_len.as_slice(), th_term.as_slice(), th_lsn.as_slice(), - timeline_start_lsn.as_slice(), ] .concat(); diff --git a/pgxn/neon/neon_utils.c b/pgxn/neon/neon_utils.c index 1fb4ed9522..1fad44bd58 100644 --- a/pgxn/neon/neon_utils.c +++ b/pgxn/neon/neon_utils.c @@ -51,6 +51,26 @@ HexDecodeString(uint8 *result, char *input, int nbytes) return true; } +/* -------------------------------- + * pq_getmsgint16 - get a binary 2-byte int from a message buffer + * -------------------------------- + */ +uint16 +pq_getmsgint16(StringInfo msg) +{ + return pq_getmsgint(msg, 2); +} + +/* -------------------------------- + * pq_getmsgint32 - get a binary 4-byte int from a message buffer + * -------------------------------- + */ +uint32 +pq_getmsgint32(StringInfo msg) +{ + return pq_getmsgint(msg, 4); +} + /* -------------------------------- * pq_getmsgint32_le - get a binary 4-byte int from a message buffer in native (LE) order * -------------------------------- diff --git a/pgxn/neon/neon_utils.h b/pgxn/neon/neon_utils.h index 89683714f1..7480ac28cc 100644 --- a/pgxn/neon/neon_utils.h +++ b/pgxn/neon/neon_utils.h @@ -8,6 +8,8 @@ #endif bool HexDecodeString(uint8 *result, char *input, int nbytes); +uint16 pq_getmsgint16(StringInfo msg); +uint32 pq_getmsgint32(StringInfo msg); uint32 pq_getmsgint32_le(StringInfo msg); uint64 pq_getmsgint64_le(StringInfo msg); void pq_sendint32_le(StringInfo buf, uint32 i); diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index 7472fd6afc..d7604e30d7 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -70,6 +70,7 @@ static bool SendAppendRequests(Safekeeper *sk); static bool RecvAppendResponses(Safekeeper *sk); static XLogRecPtr CalculateMinFlushLsn(WalProposer *wp); static XLogRecPtr GetAcknowledgedByQuorumWALPosition(WalProposer *wp); +static void PAMessageSerialize(WalProposer *wp, ProposerAcceptorMessage *msg, StringInfo buf, int proto_version); static void HandleSafekeeperResponse(WalProposer *wp, Safekeeper *sk); static bool AsyncRead(Safekeeper *sk, char **buf, int *buf_size); static bool AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg); @@ -81,6 +82,8 @@ static char *FormatSafekeeperState(Safekeeper *sk); static void AssertEventsOkForState(uint32 events, Safekeeper *sk); static char *FormatEvents(WalProposer *wp, uint32 events); static void UpdateDonorShmem(WalProposer *wp); +static char *MembershipConfigurationToString(MembershipConfiguration *mconf); +static void MembershipConfigurationFree(MembershipConfiguration *mconf); WalProposer * WalProposerCreate(WalProposerConfig *config, walproposer_api api) @@ -137,25 +140,21 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) } wp->quorum = wp->n_safekeepers / 2 + 1; + if (wp->config->proto_version != 2 && wp->config->proto_version != 3) + wp_log(FATAL, "unsupported safekeeper protocol version %d", wp->config->proto_version); + wp_log(LOG, "using safekeeper protocol version %d", wp->config->proto_version); + /* Fill the greeting package */ - wp->greetRequest.tag = 'g'; - wp->greetRequest.protocolVersion = SK_PROTOCOL_VERSION; - wp->greetRequest.pgVersion = 
PG_VERSION_NUM; - wp->api.strong_random(wp, &wp->greetRequest.proposerId, sizeof(wp->greetRequest.proposerId)); - wp->greetRequest.systemId = wp->config->systemId; - if (!wp->config->neon_timeline) - wp_log(FATAL, "neon.timeline_id is not provided"); - if (*wp->config->neon_timeline != '\0' && - !HexDecodeString(wp->greetRequest.timeline_id, wp->config->neon_timeline, 16)) - wp_log(FATAL, "could not parse neon.timeline_id, %s", wp->config->neon_timeline); + wp->greetRequest.pam.tag = 'g'; if (!wp->config->neon_tenant) wp_log(FATAL, "neon.tenant_id is not provided"); - if (*wp->config->neon_tenant != '\0' && - !HexDecodeString(wp->greetRequest.tenant_id, wp->config->neon_tenant, 16)) - wp_log(FATAL, "could not parse neon.tenant_id, %s", wp->config->neon_tenant); - - wp->greetRequest.timeline = wp->config->pgTimeline; - wp->greetRequest.walSegSize = wp->config->wal_segment_size; + wp->greetRequest.tenant_id = wp->config->neon_tenant; + if (!wp->config->neon_timeline) + wp_log(FATAL, "neon.timeline_id is not provided"); + wp->greetRequest.timeline_id = wp->config->neon_timeline; + wp->greetRequest.pg_version = PG_VERSION_NUM; + wp->greetRequest.system_id = wp->config->systemId; + wp->greetRequest.wal_seg_size = wp->config->wal_segment_size; wp->api.init_event_set(wp); @@ -165,12 +164,14 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) void WalProposerFree(WalProposer *wp) { + MembershipConfigurationFree(&wp->mconf); for (int i = 0; i < wp->n_safekeepers; i++) { Safekeeper *sk = &wp->safekeeper[i]; Assert(sk->outbuf.data != NULL); pfree(sk->outbuf.data); + MembershipConfigurationFree(&sk->greetResponse.mconf); if (sk->voteResponse.termHistory.entries) pfree(sk->voteResponse.termHistory.entries); sk->voteResponse.termHistory.entries = NULL; @@ -308,6 +309,7 @@ ShutdownConnection(Safekeeper *sk) sk->state = SS_OFFLINE; sk->streamingAt = InvalidXLogRecPtr; + MembershipConfigurationFree(&sk->greetResponse.mconf); if (sk->voteResponse.termHistory.entries) pfree(sk->voteResponse.termHistory.entries); sk->voteResponse.termHistory.entries = NULL; @@ -598,11 +600,14 @@ static void SendStartWALPush(Safekeeper *sk) { WalProposer *wp = sk->wp; +#define CMD_LEN 512 + char cmd[CMD_LEN]; - if (!wp->api.conn_send_query(sk, "START_WAL_PUSH")) + snprintf(cmd, CMD_LEN, "START_WAL_PUSH (proto_version '%d')", wp->config->proto_version); + if (!wp->api.conn_send_query(sk, cmd)) { - wp_log(WARNING, "failed to send 'START_WAL_PUSH' query to safekeeper %s:%s: %s", - sk->host, sk->port, wp->api.conn_error_message(sk)); + wp_log(WARNING, "failed to send '%s' query to safekeeper %s:%s: %s", + cmd, sk->host, sk->port, wp->api.conn_error_message(sk)); ShutdownConnection(sk); return; } @@ -658,23 +663,33 @@ RecvStartWALPushResult(Safekeeper *sk) /* * Start handshake: first of all send information about the - * safekeeper. After sending, we wait on SS_HANDSHAKE_RECV for + * walproposer. After sending, we wait on SS_HANDSHAKE_RECV for * a response to finish the handshake. */ static void SendProposerGreeting(Safekeeper *sk) { + WalProposer *wp = sk->wp; + char *mconf_toml = MembershipConfigurationToString(&wp->greetRequest.mconf); + + wp_log(LOG, "sending ProposerGreeting to safekeeper %s:%s with mconf = %s", sk->host, sk->port, mconf_toml); + pfree(mconf_toml); + + PAMessageSerialize(wp, (ProposerAcceptorMessage *) &wp->greetRequest, + &sk->outbuf, wp->config->proto_version); + /* * On failure, logging & resetting the connection is handled. We just need * to handle the control flow. 
*/ - BlockingWrite(sk, &sk->wp->greetRequest, sizeof(sk->wp->greetRequest), SS_HANDSHAKE_RECV); + BlockingWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_HANDSHAKE_RECV); } static void RecvAcceptorGreeting(Safekeeper *sk) { WalProposer *wp = sk->wp; + char *mconf_toml; /* * If our reading doesn't immediately succeed, any necessary error @@ -685,7 +700,10 @@ RecvAcceptorGreeting(Safekeeper *sk) if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->greetResponse)) return; - wp_log(LOG, "received AcceptorGreeting from safekeeper %s:%s, term=" INT64_FORMAT, sk->host, sk->port, sk->greetResponse.term); + mconf_toml = MembershipConfigurationToString(&sk->greetResponse.mconf); + wp_log(LOG, "received AcceptorGreeting from safekeeper %s:%s, node_id = %lu, mconf = %s, term=" UINT64_FORMAT, + sk->host, sk->port, sk->greetResponse.nodeId, mconf_toml, sk->greetResponse.term); + pfree(mconf_toml); /* Protocol is all good, move to voting. */ sk->state = SS_VOTING; @@ -707,12 +725,9 @@ RecvAcceptorGreeting(Safekeeper *sk) wp->propTerm++; wp_log(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, wp->quorum, wp->propTerm); - wp->voteRequest = (VoteRequest) - { - .tag = 'v', - .term = wp->propTerm - }; - memcpy(wp->voteRequest.proposerId.data, wp->greetRequest.proposerId.data, UUID_LEN); + wp->voteRequest.pam.tag = 'v'; + wp->voteRequest.generation = wp->mconf.generation; + wp->voteRequest.term = wp->propTerm; } } else if (sk->greetResponse.term > wp->propTerm) @@ -759,12 +774,14 @@ SendVoteRequest(Safekeeper *sk) { WalProposer *wp = sk->wp; - /* We have quorum for voting, send our vote request */ - wp_log(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, sk->host, sk->port, wp->voteRequest.term); - /* On failure, logging & resetting is handled */ - if (!BlockingWrite(sk, &wp->voteRequest, sizeof(wp->voteRequest), SS_WAIT_VERDICT)) - return; + PAMessageSerialize(wp, (ProposerAcceptorMessage *) &wp->voteRequest, + &sk->outbuf, wp->config->proto_version); + /* We have quorum for voting, send our vote request */ + wp_log(LOG, "requesting vote from %s:%s for generation %u term " UINT64_FORMAT, sk->host, sk->port, + wp->voteRequest.generation, wp->voteRequest.term); + /* On failure, logging & resetting is handled */ + BlockingWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_WAIT_VERDICT); /* If successful, wait for read-ready with SS_WAIT_VERDICT */ } @@ -778,11 +795,12 @@ RecvVoteResponse(Safekeeper *sk) return; wp_log(LOG, - "got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X, timelineStartLsn=%X/%X", - sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory), + "got VoteResponse from acceptor %s:%s, generation=%u, term=%lu, voteGiven=%u, last_log_term=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X", + sk->host, sk->port, sk->voteResponse.generation, sk->voteResponse.term, + sk->voteResponse.voteGiven, + GetHighestTerm(&sk->voteResponse.termHistory), LSN_FORMAT_ARGS(sk->voteResponse.flushLsn), - LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn), - LSN_FORMAT_ARGS(sk->voteResponse.timelineStartLsn)); + LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn)); /* * In case of acceptor rejecting our vote, bail out, but only if either it @@ -847,9 +865,9 @@ HandleElectedProposer(WalProposer *wp) * otherwise we must be sync-safekeepers and we have nothing to do then. 
* * Proceeding is not only pointless but harmful, because we'd give - * safekeepers term history starting with 0/0. These hacks will go away once - * we disable implicit timeline creation on safekeepers and create it with - * non zero LSN from the start. + * safekeepers term history starting with 0/0. These hacks will go away + * once we disable implicit timeline creation on safekeepers and create it + * with non zero LSN from the start. */ if (wp->propEpochStartLsn == InvalidXLogRecPtr) { @@ -942,7 +960,6 @@ DetermineEpochStartLsn(WalProposer *wp) wp->propEpochStartLsn = InvalidXLogRecPtr; wp->donorEpoch = 0; wp->truncateLsn = InvalidXLogRecPtr; - wp->timelineStartLsn = InvalidXLogRecPtr; for (int i = 0; i < wp->n_safekeepers; i++) { @@ -959,20 +976,6 @@ DetermineEpochStartLsn(WalProposer *wp) wp->donor = i; } wp->truncateLsn = Max(wp->safekeeper[i].voteResponse.truncateLsn, wp->truncateLsn); - - if (wp->safekeeper[i].voteResponse.timelineStartLsn != InvalidXLogRecPtr) - { - /* timelineStartLsn should be the same everywhere or unknown */ - if (wp->timelineStartLsn != InvalidXLogRecPtr && - wp->timelineStartLsn != wp->safekeeper[i].voteResponse.timelineStartLsn) - { - wp_log(WARNING, - "inconsistent timelineStartLsn: current %X/%X, received %X/%X", - LSN_FORMAT_ARGS(wp->timelineStartLsn), - LSN_FORMAT_ARGS(wp->safekeeper[i].voteResponse.timelineStartLsn)); - } - wp->timelineStartLsn = wp->safekeeper[i].voteResponse.timelineStartLsn; - } } } @@ -995,22 +998,11 @@ DetermineEpochStartLsn(WalProposer *wp) if (wp->propEpochStartLsn == InvalidXLogRecPtr && !wp->config->syncSafekeepers) { wp->propEpochStartLsn = wp->truncateLsn = wp->api.get_redo_start_lsn(wp); - if (wp->timelineStartLsn == InvalidXLogRecPtr) - { - wp->timelineStartLsn = wp->api.get_redo_start_lsn(wp); - } wp_log(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(wp->propEpochStartLsn)); } pg_atomic_write_u64(&wp->api.get_shmem_state(wp)->propEpochStartLsn, wp->propEpochStartLsn); - /* - * Safekeepers are setting truncateLsn after timelineStartLsn is known, so - * it should never be zero at this point, if we know timelineStartLsn. - * - * timelineStartLsn can be zero only on the first syncSafekeepers run. - */ - Assert((wp->truncateLsn != InvalidXLogRecPtr) || - (wp->config->syncSafekeepers && wp->truncateLsn == wp->timelineStartLsn)); + Assert(wp->truncateLsn != InvalidXLogRecPtr || wp->config->syncSafekeepers); /* * We will be generating WAL since propEpochStartLsn, so we should set @@ -1053,10 +1045,11 @@ DetermineEpochStartLsn(WalProposer *wp) if (SkipXLogPageHeader(wp, wp->propEpochStartLsn) != wp->api.get_redo_start_lsn(wp)) { /* - * However, allow to proceed if last_log_term on the node which gave - * the highest vote (i.e. point where we are going to start writing) - * actually had been won by me; plain restart of walproposer not - * intervened by concurrent compute which wrote WAL is ok. + * However, allow to proceed if last_log_term on the node which + * gave the highest vote (i.e. point where we are going to start + * writing) actually had been won by me; plain restart of + * walproposer not intervened by concurrent compute which wrote + * WAL is ok. * * This avoids compute crash after manual term_bump. 
*/ @@ -1126,14 +1119,8 @@ SendProposerElected(Safekeeper *sk) { /* safekeeper is empty or no common point, start from the beginning */ sk->startStreamingAt = wp->propTermHistory.entries[0].lsn; - wp_log(LOG, "no common point with sk %s:%s, streaming since first term at %X/%X, timelineStartLsn=%X/%X, termHistory.n_entries=%u", - sk->host, sk->port, LSN_FORMAT_ARGS(sk->startStreamingAt), LSN_FORMAT_ARGS(wp->timelineStartLsn), wp->propTermHistory.n_entries); - - /* - * wp->timelineStartLsn == InvalidXLogRecPtr can be only when timeline - * is created manually (test_s3_wal_replay) - */ - Assert(sk->startStreamingAt == wp->timelineStartLsn || wp->timelineStartLsn == InvalidXLogRecPtr); + wp_log(LOG, "no common point with sk %s:%s, streaming since first term at %X/%X, termHistory.n_entries=%u", + sk->host, sk->port, LSN_FORMAT_ARGS(sk->startStreamingAt), wp->propTermHistory.n_entries); } else { @@ -1158,29 +1145,19 @@ SendProposerElected(Safekeeper *sk) Assert(sk->startStreamingAt <= wp->availableLsn); - msg.tag = 'e'; + msg.apm.tag = 'e'; + msg.generation = wp->mconf.generation; msg.term = wp->propTerm; msg.startStreamingAt = sk->startStreamingAt; msg.termHistory = &wp->propTermHistory; - msg.timelineStartLsn = wp->timelineStartLsn; lastCommonTerm = idx >= 0 ? wp->propTermHistory.entries[idx].term : 0; wp_log(LOG, - "sending elected msg to node " UINT64_FORMAT " term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s, timelineStartLsn=%X/%X", - sk->greetResponse.nodeId, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port, LSN_FORMAT_ARGS(msg.timelineStartLsn)); - - resetStringInfo(&sk->outbuf); - pq_sendint64_le(&sk->outbuf, msg.tag); - pq_sendint64_le(&sk->outbuf, msg.term); - pq_sendint64_le(&sk->outbuf, msg.startStreamingAt); - pq_sendint32_le(&sk->outbuf, msg.termHistory->n_entries); - for (int i = 0; i < msg.termHistory->n_entries; i++) - { - pq_sendint64_le(&sk->outbuf, msg.termHistory->entries[i].term); - pq_sendint64_le(&sk->outbuf, msg.termHistory->entries[i].lsn); - } - pq_sendint64_le(&sk->outbuf, msg.timelineStartLsn); + "sending elected msg to node " UINT64_FORMAT " generation=%u term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s", + sk->greetResponse.nodeId, msg.generation, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), + lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port); + PAMessageSerialize(wp, (ProposerAcceptorMessage *) &msg, &sk->outbuf, wp->config->proto_version); if (!AsyncWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_SEND_ELECTED_FLUSH)) return; @@ -1246,14 +1223,13 @@ static void PrepareAppendRequest(WalProposer *wp, AppendRequestHeader *req, XLogRecPtr beginLsn, XLogRecPtr endLsn) { Assert(endLsn >= beginLsn); - req->tag = 'a'; + req->apm.tag = 'a'; + req->generation = wp->mconf.generation; req->term = wp->propTerm; - req->epochStartLsn = wp->propEpochStartLsn; req->beginLsn = beginLsn; req->endLsn = endLsn; req->commitLsn = wp->commitLsn; req->truncateLsn = wp->truncateLsn; - req->proposerId = wp->greetRequest.proposerId; } /* @@ -1354,7 +1330,8 @@ SendAppendRequests(Safekeeper *sk) resetStringInfo(&sk->outbuf); /* write AppendRequest header */ - appendBinaryStringInfo(&sk->outbuf, (char *) req, sizeof(AppendRequestHeader)); + PAMessageSerialize(wp, (ProposerAcceptorMessage *) req, &sk->outbuf, wp->config->proto_version); + /* prepare for reading WAL into the outbuf */ 
enlargeStringInfo(&sk->outbuf, req->endLsn - req->beginLsn); sk->active_state = SS_ACTIVE_READ_WAL; } @@ -1367,14 +1344,17 @@ SendAppendRequests(Safekeeper *sk) req = &sk->appendRequest; req_len = req->endLsn - req->beginLsn; - /* We send zero sized AppenRequests as heartbeats; don't wal_read for these. */ + /* + * We send zero sized AppenRequests as heartbeats; don't wal_read + * for these. + */ if (req_len > 0) { switch (wp->api.wal_read(sk, - &sk->outbuf.data[sk->outbuf.len], - req->beginLsn, - req_len, - &errmsg)) + &sk->outbuf.data[sk->outbuf.len], + req->beginLsn, + req_len, + &errmsg)) { case NEON_WALREAD_SUCCESS: break; @@ -1382,7 +1362,7 @@ SendAppendRequests(Safekeeper *sk) return true; case NEON_WALREAD_ERROR: wp_log(WARNING, "WAL reading for node %s:%s failed: %s", - sk->host, sk->port, errmsg); + sk->host, sk->port, errmsg); ShutdownConnection(sk); return false; default: @@ -1470,11 +1450,11 @@ RecvAppendResponses(Safekeeper *sk) * Term has changed to higher one, probably another compute is * running. If this is the case we could PANIC as well because * likely it inserted some data and our basebackup is unsuitable - * anymore. However, we also bump term manually (term_bump endpoint) - * on safekeepers for migration purposes, in this case we do want - * compute to stay alive. So restart walproposer with FATAL instead - * of panicking; if basebackup is spoiled next election will notice - * this. + * anymore. However, we also bump term manually (term_bump + * endpoint) on safekeepers for migration purposes, in this case + * we do want compute to stay alive. So restart walproposer with + * FATAL instead of panicking; if basebackup is spoiled next + * election will notice this. */ wp_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT ", meaning another compute is running at the same time, and it conflicts with us", sk->host, sk->port, @@ -1509,7 +1489,7 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese for (i = 0; i < nkeys; i++) { - const char *key = pq_getmsgstring(reply_message); + const char *key = pq_getmsgrawstring(reply_message); unsigned int value_len = pq_getmsgint(reply_message, sizeof(int32)); if (strcmp(key, "current_timeline_size") == 0) @@ -1750,6 +1730,208 @@ HandleSafekeeperResponse(WalProposer *wp, Safekeeper *fromsk) } } +/* Serialize MembershipConfiguration into buf. */ +static void +MembershipConfigurationSerialize(MembershipConfiguration *mconf, StringInfo buf) +{ + uint32 i; + + pq_sendint32(buf, mconf->generation); + + pq_sendint32(buf, mconf->members.len); + for (i = 0; i < mconf->members.len; i++) + { + pq_sendint64(buf, mconf->members.m[i].node_id); + pq_send_ascii_string(buf, mconf->members.m[i].host); + pq_sendint16(buf, mconf->members.m[i].port); + } + + /* + * There is no special mark for absent new_members; zero members in + * invalid, so zero len means absent. 
+ */ + pq_sendint32(buf, mconf->new_members.len); + for (i = 0; i < mconf->new_members.len; i++) + { + pq_sendint64(buf, mconf->new_members.m[i].node_id); + pq_send_ascii_string(buf, mconf->new_members.m[i].host); + pq_sendint16(buf, mconf->new_members.m[i].port); + } +} + +/* Serialize proposer -> acceptor message into buf using specified version */ +static void +PAMessageSerialize(WalProposer *wp, ProposerAcceptorMessage *msg, StringInfo buf, int proto_version) +{ + /* both version are supported currently until we fully migrate to 3 */ + Assert(proto_version == 3 || proto_version == 2); + + resetStringInfo(buf); + + if (proto_version == 3) + { + /* + * v2 sends structs for some messages as is, so commonly send tag only + * for v3 + */ + pq_sendint8(buf, msg->tag); + + switch (msg->tag) + { + case 'g': + { + ProposerGreeting *m = (ProposerGreeting *) msg; + + pq_send_ascii_string(buf, m->tenant_id); + pq_send_ascii_string(buf, m->timeline_id); + MembershipConfigurationSerialize(&m->mconf, buf); + pq_sendint32(buf, m->pg_version); + pq_sendint64(buf, m->system_id); + pq_sendint32(buf, m->wal_seg_size); + break; + } + case 'v': + { + VoteRequest *m = (VoteRequest *) msg; + + pq_sendint32(buf, m->generation); + pq_sendint64(buf, m->term); + break; + + } + case 'e': + { + ProposerElected *m = (ProposerElected *) msg; + + pq_sendint32(buf, m->generation); + pq_sendint64(buf, m->term); + pq_sendint64(buf, m->startStreamingAt); + pq_sendint32(buf, m->termHistory->n_entries); + for (uint32 i = 0; i < m->termHistory->n_entries; i++) + { + pq_sendint64(buf, m->termHistory->entries[i].term); + pq_sendint64(buf, m->termHistory->entries[i].lsn); + } + break; + } + case 'a': + { + /* + * Note: this serializes only AppendRequestHeader, caller + * is expected to append WAL data later. + */ + AppendRequestHeader *m = (AppendRequestHeader *) msg; + + pq_sendint32(buf, m->generation); + pq_sendint64(buf, m->term); + pq_sendint64(buf, m->beginLsn); + pq_sendint64(buf, m->endLsn); + pq_sendint64(buf, m->commitLsn); + pq_sendint64(buf, m->truncateLsn); + break; + } + default: + wp_log(FATAL, "unexpected message type %c to serialize", msg->tag); + } + return; + } + + if (proto_version == 2) + { + switch (msg->tag) + { + case 'g': + { + /* v2 sent struct as is */ + ProposerGreeting *m = (ProposerGreeting *) msg; + ProposerGreetingV2 greetRequestV2; + + /* Fill also v2 struct. 
*/ + greetRequestV2.tag = 'g'; + greetRequestV2.protocolVersion = proto_version; + greetRequestV2.pgVersion = m->pg_version; + + /* + * v3 removed this field because it's easier to pass as + * libq or START_WAL_PUSH options + */ + memset(&greetRequestV2.proposerId, 0, sizeof(greetRequestV2.proposerId)); + greetRequestV2.systemId = wp->config->systemId; + if (*m->timeline_id != '\0' && + !HexDecodeString(greetRequestV2.timeline_id, m->timeline_id, 16)) + wp_log(FATAL, "could not parse neon.timeline_id, %s", m->timeline_id); + if (*m->tenant_id != '\0' && + !HexDecodeString(greetRequestV2.tenant_id, m->tenant_id, 16)) + wp_log(FATAL, "could not parse neon.tenant_id, %s", m->tenant_id); + + greetRequestV2.timeline = wp->config->pgTimeline; + greetRequestV2.walSegSize = wp->config->wal_segment_size; + + pq_sendbytes(buf, (char *) &greetRequestV2, sizeof(greetRequestV2)); + break; + } + case 'v': + { + /* v2 sent struct as is */ + VoteRequest *m = (VoteRequest *) msg; + VoteRequestV2 voteRequestV2; + + voteRequestV2.tag = m->pam.tag; + voteRequestV2.term = m->term; + /* removed field */ + memset(&voteRequestV2.proposerId, 0, sizeof(voteRequestV2.proposerId)); + pq_sendbytes(buf, (char *) &voteRequestV2, sizeof(voteRequestV2)); + break; + } + case 'e': + { + ProposerElected *m = (ProposerElected *) msg; + + pq_sendint64_le(buf, m->apm.tag); + pq_sendint64_le(buf, m->term); + pq_sendint64_le(buf, m->startStreamingAt); + pq_sendint32_le(buf, m->termHistory->n_entries); + for (int i = 0; i < m->termHistory->n_entries; i++) + { + pq_sendint64_le(buf, m->termHistory->entries[i].term); + pq_sendint64_le(buf, m->termHistory->entries[i].lsn); + } + pq_sendint64_le(buf, 0); /* removed timeline_start_lsn */ + break; + } + case 'a': + + /* + * Note: this serializes only AppendRequestHeader, caller is + * expected to append WAL data later. + */ + { + /* v2 sent struct as is */ + AppendRequestHeader *m = (AppendRequestHeader *) msg; + AppendRequestHeaderV2 appendRequestHeaderV2; + + appendRequestHeaderV2.tag = m->apm.tag; + appendRequestHeaderV2.term = m->term; + appendRequestHeaderV2.epochStartLsn = 0; /* removed field */ + appendRequestHeaderV2.beginLsn = m->beginLsn; + appendRequestHeaderV2.endLsn = m->endLsn; + appendRequestHeaderV2.commitLsn = m->commitLsn; + appendRequestHeaderV2.truncateLsn = m->truncateLsn; + /* removed field */ + memset(&appendRequestHeaderV2.proposerId, 0, sizeof(appendRequestHeaderV2.proposerId)); + + pq_sendbytes(buf, (char *) &appendRequestHeaderV2, sizeof(appendRequestHeaderV2)); + break; + } + + default: + wp_log(FATAL, "unexpected message type %c to serialize", msg->tag); + } + return; + } + wp_log(FATAL, "unexpected proto_version %d", proto_version); +} + /* * Try to read CopyData message from i'th safekeeper, resetting connection on * failure. @@ -1779,6 +1961,37 @@ AsyncRead(Safekeeper *sk, char **buf, int *buf_size) return false; } +/* Deserialize membership configuration from buf to mconf. 
*/ +static void +MembershipConfigurationDeserialize(MembershipConfiguration *mconf, StringInfo buf) +{ + uint32 i; + + mconf->generation = pq_getmsgint32(buf); + mconf->members.len = pq_getmsgint32(buf); + mconf->members.m = palloc0(sizeof(SafekeeperId) * mconf->members.len); + for (i = 0; i < mconf->members.len; i++) + { + const char *buf_host; + + mconf->members.m[i].node_id = pq_getmsgint64(buf); + buf_host = pq_getmsgrawstring(buf); + strlcpy(mconf->members.m[i].host, buf_host, sizeof(mconf->members.m[i].host)); + mconf->members.m[i].port = pq_getmsgint16(buf); + } + mconf->new_members.len = pq_getmsgint32(buf); + mconf->new_members.m = palloc0(sizeof(SafekeeperId) * mconf->new_members.len); + for (i = 0; i < mconf->new_members.len; i++) + { + const char *buf_host; + + mconf->new_members.m[i].node_id = pq_getmsgint64(buf); + buf_host = pq_getmsgrawstring(buf); + strlcpy(mconf->new_members.m[i].host, buf_host, sizeof(mconf->new_members.m[i].host)); + mconf->new_members.m[i].port = pq_getmsgint16(buf); + } +} + /* * Read next message with known type into provided struct, by reading a CopyData * block from the safekeeper's postgres connection, returning whether the read @@ -1787,6 +2000,8 @@ AsyncRead(Safekeeper *sk, char **buf, int *buf_size) * If the read needs more polling, we return 'false' and keep the state * unmodified, waiting until it becomes read-ready to try again. If it fully * failed, a warning is emitted and the connection is reset. + * + * Note: it pallocs if needed, i.e. for AcceptorGreeting and VoteResponse fields. */ static bool AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg) @@ -1795,82 +2010,154 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg) char *buf; int buf_size; - uint64 tag; + uint8 tag; StringInfoData s; if (!(AsyncRead(sk, &buf, &buf_size))) return false; + sk->latestMsgReceivedAt = wp->api.get_current_timestamp(wp); /* parse it */ s.data = buf; s.len = buf_size; + s.maxlen = buf_size; s.cursor = 0; - tag = pq_getmsgint64_le(&s); - if (tag != anymsg->tag) + if (wp->config->proto_version == 3) { - wp_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host, - sk->port, FormatSafekeeperState(sk)); - ResetConnection(sk); - return false; - } - sk->latestMsgReceivedAt = wp->api.get_current_timestamp(wp); - switch (tag) - { - case 'g': - { - AcceptorGreeting *msg = (AcceptorGreeting *) anymsg; - - msg->term = pq_getmsgint64_le(&s); - msg->nodeId = pq_getmsgint64_le(&s); - pq_getmsgend(&s); - return true; - } - - case 'v': - { - VoteResponse *msg = (VoteResponse *) anymsg; - - msg->term = pq_getmsgint64_le(&s); - msg->voteGiven = pq_getmsgint64_le(&s); - msg->flushLsn = pq_getmsgint64_le(&s); - msg->truncateLsn = pq_getmsgint64_le(&s); - msg->termHistory.n_entries = pq_getmsgint32_le(&s); - msg->termHistory.entries = palloc(sizeof(TermSwitchEntry) * msg->termHistory.n_entries); - for (int i = 0; i < msg->termHistory.n_entries; i++) + tag = pq_getmsgbyte(&s); + if (tag != anymsg->tag) + { + wp_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host, + sk->port, FormatSafekeeperState(sk)); + ResetConnection(sk); + return false; + } + switch (tag) + { + case 'g': { - msg->termHistory.entries[i].term = pq_getmsgint64_le(&s); - msg->termHistory.entries[i].lsn = pq_getmsgint64_le(&s); + AcceptorGreeting *msg = (AcceptorGreeting *) anymsg; + + msg->nodeId = pq_getmsgint64(&s); + MembershipConfigurationDeserialize(&msg->mconf, &s); + msg->term = pq_getmsgint64(&s); + 
pq_getmsgend(&s); + return true; } - msg->timelineStartLsn = pq_getmsgint64_le(&s); - pq_getmsgend(&s); - return true; - } + case 'v': + { + VoteResponse *msg = (VoteResponse *) anymsg; - case 'a': - { - AppendResponse *msg = (AppendResponse *) anymsg; + msg->generation = pq_getmsgint32(&s); + msg->term = pq_getmsgint64(&s); + msg->voteGiven = pq_getmsgbyte(&s); + msg->flushLsn = pq_getmsgint64(&s); + msg->truncateLsn = pq_getmsgint64(&s); + msg->termHistory.n_entries = pq_getmsgint32(&s); + msg->termHistory.entries = palloc(sizeof(TermSwitchEntry) * msg->termHistory.n_entries); + for (uint32 i = 0; i < msg->termHistory.n_entries; i++) + { + msg->termHistory.entries[i].term = pq_getmsgint64(&s); + msg->termHistory.entries[i].lsn = pq_getmsgint64(&s); + } + pq_getmsgend(&s); + return true; + } + case 'a': + { + AppendResponse *msg = (AppendResponse *) anymsg; - msg->term = pq_getmsgint64_le(&s); - msg->flushLsn = pq_getmsgint64_le(&s); - msg->commitLsn = pq_getmsgint64_le(&s); - msg->hs.ts = pq_getmsgint64_le(&s); - msg->hs.xmin.value = pq_getmsgint64_le(&s); - msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s); - if (s.len > s.cursor) - ParsePageserverFeedbackMessage(wp, &s, &msg->ps_feedback); - else - msg->ps_feedback.present = false; - pq_getmsgend(&s); - return true; - } - - default: - { - Assert(false); - return false; - } + msg->generation = pq_getmsgint32(&s); + msg->term = pq_getmsgint64(&s); + msg->flushLsn = pq_getmsgint64(&s); + msg->commitLsn = pq_getmsgint64(&s); + msg->hs.ts = pq_getmsgint64(&s); + msg->hs.xmin.value = pq_getmsgint64(&s); + msg->hs.catalog_xmin.value = pq_getmsgint64(&s); + if (s.len > s.cursor) + ParsePageserverFeedbackMessage(wp, &s, &msg->ps_feedback); + else + msg->ps_feedback.present = false; + pq_getmsgend(&s); + return true; + } + default: + { + wp_log(FATAL, "unexpected message tag %c to read", (char) tag); + return false; + } + } } + else if (wp->config->proto_version == 2) + { + tag = pq_getmsgint64_le(&s); + if (tag != anymsg->tag) + { + wp_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host, + sk->port, FormatSafekeeperState(sk)); + ResetConnection(sk); + return false; + } + switch (tag) + { + case 'g': + { + AcceptorGreeting *msg = (AcceptorGreeting *) anymsg; + + msg->term = pq_getmsgint64_le(&s); + msg->nodeId = pq_getmsgint64_le(&s); + pq_getmsgend(&s); + return true; + } + + case 'v': + { + VoteResponse *msg = (VoteResponse *) anymsg; + + msg->term = pq_getmsgint64_le(&s); + msg->voteGiven = pq_getmsgint64_le(&s); + msg->flushLsn = pq_getmsgint64_le(&s); + msg->truncateLsn = pq_getmsgint64_le(&s); + msg->termHistory.n_entries = pq_getmsgint32_le(&s); + msg->termHistory.entries = palloc(sizeof(TermSwitchEntry) * msg->termHistory.n_entries); + for (int i = 0; i < msg->termHistory.n_entries; i++) + { + msg->termHistory.entries[i].term = pq_getmsgint64_le(&s); + msg->termHistory.entries[i].lsn = pq_getmsgint64_le(&s); + } + pq_getmsgint64_le(&s); /* timelineStartLsn */ + pq_getmsgend(&s); + return true; + } + + case 'a': + { + AppendResponse *msg = (AppendResponse *) anymsg; + + msg->term = pq_getmsgint64_le(&s); + msg->flushLsn = pq_getmsgint64_le(&s); + msg->commitLsn = pq_getmsgint64_le(&s); + msg->hs.ts = pq_getmsgint64_le(&s); + msg->hs.xmin.value = pq_getmsgint64_le(&s); + msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s); + if (s.len > s.cursor) + ParsePageserverFeedbackMessage(wp, &s, &msg->ps_feedback); + else + msg->ps_feedback.present = false; + pq_getmsgend(&s); + return true; + } + + default: + 
{ + wp_log(FATAL, "unexpected message tag %c to read", (char) tag); + return false; + } + } + } + wp_log(FATAL, "unsupported proto_version %d", wp->config->proto_version); + return false; /* keep the compiler quiet */ } /* @@ -2246,3 +2533,45 @@ FormatEvents(WalProposer *wp, uint32 events) return (char *) &return_str; } + +/* Dump mconf as toml for observability / debugging. Result is palloc'ed. */ +static char * +MembershipConfigurationToString(MembershipConfiguration *mconf) +{ + StringInfoData s; + uint32 i; + + initStringInfo(&s); + appendStringInfo(&s, "{gen = %u", mconf->generation); + appendStringInfoString(&s, ", members = ["); + for (i = 0; i < mconf->members.len; i++) + { + if (i > 0) + appendStringInfoString(&s, ", "); + appendStringInfo(&s, "{node_id = %lu", mconf->members.m[i].node_id); + appendStringInfo(&s, ", host = %s", mconf->members.m[i].host); + appendStringInfo(&s, ", port = %u }", mconf->members.m[i].port); + } + appendStringInfo(&s, "], new_members = ["); + for (i = 0; i < mconf->new_members.len; i++) + { + if (i > 0) + appendStringInfoString(&s, ", "); + appendStringInfo(&s, "{node_id = %lu", mconf->new_members.m[i].node_id); + appendStringInfo(&s, ", host = %s", mconf->new_members.m[i].host); + appendStringInfo(&s, ", port = %u }", mconf->new_members.m[i].port); + } + appendStringInfoString(&s, "]}"); + return s.data; +} + +static void +MembershipConfigurationFree(MembershipConfiguration *mconf) +{ + if (mconf->members.m) + pfree(mconf->members.m); + mconf->members.m = NULL; + if (mconf->new_members.m) + pfree(mconf->new_members.m); + mconf->new_members.m = NULL; +} diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index d8c44f8182..eee55f924f 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -12,9 +12,6 @@ #include "neon_walreader.h" #include "pagestore_client.h" -#define SK_MAGIC 0xCafeCeefu -#define SK_PROTOCOL_VERSION 2 - #define MAX_SAFEKEEPERS 32 #define MAX_SEND_SIZE (XLOG_BLCKSZ * 16) /* max size of a single* WAL * message */ @@ -143,12 +140,71 @@ typedef uint64 term_t; /* neon storage node id */ typedef uint64 NNodeId; +/* + * Number uniquely identifying safekeeper membership configuration. + * This and following structs pair ones in membership.rs. + */ +typedef uint32 Generation; + +typedef struct SafekeeperId +{ + NNodeId node_id; + char host[MAXCONNINFO]; + uint16 port; +} SafekeeperId; + +/* Set of safekeepers. */ +typedef struct MemberSet +{ + uint32 len; /* number of members */ + SafekeeperId *m; /* ids themselves */ +} MemberSet; + +/* Timeline safekeeper membership configuration. */ +typedef struct MembershipConfiguration +{ + Generation generation; + MemberSet members; + /* Has 0 n_members in non joint conf. */ + MemberSet new_members; +} MembershipConfiguration; + /* * Proposer <-> Acceptor messaging. */ +typedef struct ProposerAcceptorMessage +{ + uint8 tag; +} ProposerAcceptorMessage; + /* Initial Proposer -> Acceptor message */ typedef struct ProposerGreeting +{ + ProposerAcceptorMessage pam; /* message tag */ + + /* + * tenant/timeline ids as C strings with standard hex notation for ease of + * printing. In principle they are not strictly needed as ttid is also + * passed as libpq options. + */ + char *tenant_id; + char *timeline_id; + /* Full conf is carried to allow safekeeper switch */ + MembershipConfiguration mconf; + + /* + * pg_version and wal_seg_size are used for timeline creation until we + * fully migrate to doing externally. systemId is only used as a sanity + * cross check. 
+ */ + uint32 pg_version; /* in PG_VERSION_NUM format */ + uint64 system_id; /* Postgres system identifier. */ + uint32 wal_seg_size; +} ProposerGreeting; + +/* protocol v2 variant, kept while wp supports it */ +typedef struct ProposerGreetingV2 { uint64 tag; /* message tag */ uint32 protocolVersion; /* proposer-safekeeper protocol version */ @@ -159,32 +215,42 @@ typedef struct ProposerGreeting uint8 tenant_id[16]; TimeLineID timeline; uint32 walSegSize; -} ProposerGreeting; +} ProposerGreetingV2; typedef struct AcceptorProposerMessage { - uint64 tag; + uint8 tag; } AcceptorProposerMessage; /* - * Acceptor -> Proposer initial response: the highest term acceptor voted for. + * Acceptor -> Proposer initial response: the highest term acceptor voted for, + * its node id and configuration. */ typedef struct AcceptorGreeting { AcceptorProposerMessage apm; - term_t term; NNodeId nodeId; + MembershipConfiguration mconf; + term_t term; } AcceptorGreeting; /* * Proposer -> Acceptor vote request. */ typedef struct VoteRequest +{ + ProposerAcceptorMessage pam; /* message tag */ + Generation generation; /* membership conf generation */ + term_t term; +} VoteRequest; + +/* protocol v2 variant, kept while wp supports it */ +typedef struct VoteRequestV2 { uint64 tag; term_t term; pg_uuid_t proposerId; /* for monitoring/debugging */ -} VoteRequest; +} VoteRequestV2; /* Element of term switching chain. */ typedef struct TermSwitchEntry @@ -203,8 +269,15 @@ typedef struct TermHistory typedef struct VoteResponse { AcceptorProposerMessage apm; + + /* + * Membership conf generation. It's not strictly required because on + * mismatch safekeeper is expected to ERROR the connection, but let's + * sanity check it. + */ + Generation generation; term_t term; - uint64 voteGiven; + uint8 voteGiven; /* * Safekeeper flush_lsn (end of WAL) + history of term switches allow @@ -214,7 +287,6 @@ typedef struct VoteResponse XLogRecPtr truncateLsn; /* minimal LSN which may be needed for* * recovery of some safekeeper */ TermHistory termHistory; - XLogRecPtr timelineStartLsn; /* timeline globally starts at this LSN */ } VoteResponse; /* @@ -223,20 +295,37 @@ typedef struct VoteResponse */ typedef struct ProposerElected { - uint64 tag; + AcceptorProposerMessage apm; + Generation generation; /* membership conf generation */ term_t term; /* proposer will send since this point */ XLogRecPtr startStreamingAt; /* history of term switches up to this proposer */ TermHistory *termHistory; - /* timeline globally starts at this LSN */ - XLogRecPtr timelineStartLsn; } ProposerElected; /* * Header of request with WAL message sent from proposer to safekeeper. 
*/ typedef struct AppendRequestHeader +{ + AcceptorProposerMessage apm; + Generation generation; /* membership conf generation */ + term_t term; /* term of the proposer */ + XLogRecPtr beginLsn; /* start position of message in WAL */ + XLogRecPtr endLsn; /* end position of message in WAL */ + XLogRecPtr commitLsn; /* LSN committed by quorum of safekeepers */ + + /* + * minimal LSN which may be needed for recovery of some safekeeper (end + * lsn + 1 of last chunk streamed to everyone) + */ + XLogRecPtr truncateLsn; + /* in the AppendRequest message, WAL data follows */ +} AppendRequestHeader; + +/* protocol v2 variant, kept while wp supports it */ +typedef struct AppendRequestHeaderV2 { uint64 tag; term_t term; /* term of the proposer */ @@ -256,7 +345,8 @@ typedef struct AppendRequestHeader */ XLogRecPtr truncateLsn; pg_uuid_t proposerId; /* for monitoring/debugging */ -} AppendRequestHeader; + /* in the AppendRequest message, WAL data follows */ +} AppendRequestHeaderV2; /* * Hot standby feedback received from replica @@ -309,6 +399,13 @@ typedef struct AppendResponse { AcceptorProposerMessage apm; + /* + * Membership conf generation. It's not strictly required because on + * mismatch safekeeper is expected to ERROR the connection, but let's + * sanity check it. + */ + Generation generation; + /* * Current term of the safekeeper; if it is higher than proposer's, the * compute is out of date. @@ -644,6 +741,8 @@ typedef struct WalProposerConfig /* Will be passed to safekeepers in greet request. */ TimeLineID pgTimeline; + int proto_version; + #ifdef WALPROPOSER_LIB void *callback_data; #endif @@ -656,11 +755,14 @@ typedef struct WalProposerConfig typedef struct WalProposer { WalProposerConfig *config; - int n_safekeepers; + /* Current walproposer membership configuration */ + MembershipConfiguration mconf; /* (n_safekeepers / 2) + 1 */ int quorum; + /* Number of occupied slots in safekeepers[] */ + int n_safekeepers; Safekeeper safekeeper[MAX_SAFEKEEPERS]; /* WAL has been generated up to this point */ @@ -670,6 +772,7 @@ typedef struct WalProposer XLogRecPtr commitLsn; ProposerGreeting greetRequest; + ProposerGreetingV2 greetRequestV2; /* Vote request for safekeeper */ VoteRequest voteRequest; diff --git a/pgxn/neon/walproposer_compat.c b/pgxn/neon/walproposer_compat.c index 35d984c52e..a986160224 100644 --- a/pgxn/neon/walproposer_compat.c +++ b/pgxn/neon/walproposer_compat.c @@ -117,14 +117,13 @@ pq_getmsgbytes(StringInfo msg, int datalen) } /* -------------------------------- - * pq_getmsgstring - get a null-terminated text string (with conversion) + * pq_getmsgrawstring - get a null-terminated text string - NO conversion * - * May return a pointer directly into the message buffer, or a pointer - * to a palloc'd conversion result. + * Returns a pointer directly into the message buffer. 
* -------------------------------- */ const char * -pq_getmsgstring(StringInfo msg) +pq_getmsgrawstring(StringInfo msg) { char *str; int slen; @@ -155,6 +154,45 @@ pq_getmsgend(StringInfo msg) ExceptionalCondition("invalid msg format", __FILE__, __LINE__); } +/* -------------------------------- + * pq_sendbytes - append raw data to a StringInfo buffer + * -------------------------------- + */ +void +pq_sendbytes(StringInfo buf, const void *data, int datalen) +{ + /* use variant that maintains a trailing null-byte, out of caution */ + appendBinaryStringInfo(buf, data, datalen); +} + +/* -------------------------------- + * pq_send_ascii_string - append a null-terminated text string (without conversion) + * + * This function intentionally bypasses encoding conversion, instead just + * silently replacing any non-7-bit-ASCII characters with question marks. + * It is used only when we are having trouble sending an error message to + * the client with normal localization and encoding conversion. The caller + * should already have taken measures to ensure the string is just ASCII; + * the extra work here is just to make certain we don't send a badly encoded + * string to the client (which might or might not be robust about that). + * + * NB: passed text string must be null-terminated, and so is the data + * sent to the frontend. + * -------------------------------- + */ +void +pq_send_ascii_string(StringInfo buf, const char *str) +{ + while (*str) + { + char ch = *str++; + + if (IS_HIGHBIT_SET(ch)) + ch = '?'; + appendStringInfoCharMacro(buf, ch); + } + appendStringInfoChar(buf, '\0'); +} /* * Produce a C-string representation of a TimestampTz. diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index 86444084ff..b21184de57 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -59,9 +59,11 @@ #define WAL_PROPOSER_SLOT_NAME "wal_proposer_slot" +/* GUCs */ char *wal_acceptors_list = ""; int wal_acceptor_reconnect_timeout = 1000; int wal_acceptor_connection_timeout = 10000; +int safekeeper_proto_version = 2; /* Set to true in the walproposer bgw. 
*/ static bool am_walproposer; @@ -126,6 +128,7 @@ init_walprop_config(bool syncSafekeepers) else walprop_config.systemId = 0; walprop_config.pgTimeline = walprop_pg_get_timeline_id(); + walprop_config.proto_version = safekeeper_proto_version; } /* @@ -219,25 +222,37 @@ nwp_register_gucs(void) PGC_SIGHUP, GUC_UNIT_MS, NULL, NULL, NULL); + + DefineCustomIntVariable( + "neon.safekeeper_proto_version", + "Version of compute <-> safekeeper protocol.", + "Used while migrating from 2 to 3.", + &safekeeper_proto_version, + 2, 0, INT_MAX, + PGC_POSTMASTER, + 0, + NULL, NULL, NULL); } static int split_safekeepers_list(char *safekeepers_list, char *safekeepers[]) { - int n_safekeepers = 0; - char *curr_sk = safekeepers_list; + int n_safekeepers = 0; + char *curr_sk = safekeepers_list; for (char *coma = safekeepers_list; coma != NULL && *coma != '\0'; curr_sk = coma) { - if (++n_safekeepers >= MAX_SAFEKEEPERS) { + if (++n_safekeepers >= MAX_SAFEKEEPERS) + { wpg_log(FATAL, "too many safekeepers"); } coma = strchr(coma, ','); - safekeepers[n_safekeepers-1] = curr_sk; + safekeepers[n_safekeepers - 1] = curr_sk; - if (coma != NULL) { + if (coma != NULL) + { *coma++ = '\0'; } } @@ -252,10 +267,10 @@ split_safekeepers_list(char *safekeepers_list, char *safekeepers[]) static bool safekeepers_cmp(char *old, char *new) { - char *safekeepers_old[MAX_SAFEKEEPERS]; - char *safekeepers_new[MAX_SAFEKEEPERS]; - int len_old = 0; - int len_new = 0; + char *safekeepers_old[MAX_SAFEKEEPERS]; + char *safekeepers_new[MAX_SAFEKEEPERS]; + int len_old = 0; + int len_new = 0; len_old = split_safekeepers_list(old, safekeepers_old); len_new = split_safekeepers_list(new, safekeepers_new); @@ -292,7 +307,8 @@ assign_neon_safekeepers(const char *newval, void *extra) if (!am_walproposer) return; - if (!newval) { + if (!newval) + { /* should never happen */ wpg_log(FATAL, "neon.safekeepers is empty"); } @@ -301,11 +317,11 @@ assign_neon_safekeepers(const char *newval, void *extra) newval_copy = pstrdup(newval); oldval = pstrdup(wal_acceptors_list); - /* + /* * TODO: restarting through FATAL is stupid and introduces 1s delay before - * next bgw start. We should refactor walproposer to allow graceful exit and - * thus remove this delay. - * XXX: If you change anything here, sync with test_safekeepers_reconfigure_reorder. + * next bgw start. We should refactor walproposer to allow graceful exit + * and thus remove this delay. XXX: If you change anything here, sync with + * test_safekeepers_reconfigure_reorder. */ if (!safekeepers_cmp(oldval, newval_copy)) { @@ -454,7 +470,8 @@ backpressure_throttling_impl(void) memcpy(new_status, old_status, len); snprintf(new_status + len, 64, "backpressure throttling: lag %lu", lag); set_ps_display(new_status); - new_status[len] = '\0'; /* truncate off " backpressure ..." to later reset the ps */ + new_status[len] = '\0'; /* truncate off " backpressure ..." 
to later + * reset the ps */ elog(DEBUG2, "backpressure throttling: lag %lu", lag); start = GetCurrentTimestamp(); @@ -621,7 +638,7 @@ walprop_pg_start_streaming(WalProposer *wp, XLogRecPtr startpos) wpg_log(LOG, "WAL proposer starts streaming at %X/%X", LSN_FORMAT_ARGS(startpos)); cmd.slotname = WAL_PROPOSER_SLOT_NAME; - cmd.timeline = wp->greetRequest.timeline; + cmd.timeline = wp->config->pgTimeline; cmd.startpoint = startpos; StartProposerReplication(wp, &cmd); } @@ -1963,10 +1980,11 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp, Safekeeper *sk) FullTransactionId xmin = hsFeedback.xmin; FullTransactionId catalog_xmin = hsFeedback.catalog_xmin; FullTransactionId next_xid = ReadNextFullTransactionId(); + /* - * Page server is updating nextXid in checkpoint each 1024 transactions, - * so feedback xmin can be actually larger then nextXid and - * function TransactionIdInRecentPast return false in this case, + * Page server is updating nextXid in checkpoint each 1024 + * transactions, so feedback xmin can be actually larger then nextXid + * and function TransactionIdInRecentPast return false in this case, * preventing update of slot's xmin. */ if (FullTransactionIdPrecedes(next_xid, xmin)) diff --git a/safekeeper/benches/receive_wal.rs b/safekeeper/benches/receive_wal.rs index 19c6662e74..1c0ae66f01 100644 --- a/safekeeper/benches/receive_wal.rs +++ b/safekeeper/benches/receive_wal.rs @@ -13,6 +13,7 @@ use safekeeper::safekeeper::{ AcceptorProposerMessage, AppendRequest, AppendRequestHeader, ProposerAcceptorMessage, }; use safekeeper::test_utils::Env; +use safekeeper_api::membership::SafekeeperGeneration as Generation; use tokio::io::AsyncWriteExt as _; use utils::id::{NodeId, TenantTimelineId}; use utils::lsn::Lsn; @@ -88,13 +89,12 @@ fn bench_process_msg(c: &mut Criterion) { let (lsn, record) = walgen.next().expect("endless WAL"); ProposerAcceptorMessage::AppendRequest(AppendRequest { h: AppendRequestHeader { + generation: Generation::new(0), term: 1, - term_start_lsn: Lsn(0), begin_lsn: lsn, end_lsn: lsn + record.len() as u64, commit_lsn: if commit { lsn } else { Lsn(0) }, // commit previous record truncate_lsn: Lsn(0), - proposer_uuid: [0; 16], }, wal_data: record, }) @@ -160,13 +160,12 @@ fn bench_wal_acceptor(c: &mut Criterion) { .take(n) .map(|(lsn, record)| AppendRequest { h: AppendRequestHeader { + generation: Generation::new(0), term: 1, - term_start_lsn: Lsn(0), begin_lsn: lsn, end_lsn: lsn + record.len() as u64, commit_lsn: Lsn(0), truncate_lsn: Lsn(0), - proposer_uuid: [0; 16], }, wal_data: record, }) @@ -262,13 +261,12 @@ fn bench_wal_acceptor_throughput(c: &mut Criterion) { runtime.block_on(async { let reqgen = walgen.take(count).map(|(lsn, record)| AppendRequest { h: AppendRequestHeader { + generation: Generation::new(0), term: 1, - term_start_lsn: Lsn(0), begin_lsn: lsn, end_lsn: lsn + record.len() as u64, commit_lsn: if commit { lsn } else { Lsn(0) }, // commit previous record truncate_lsn: Lsn(0), - proposer_uuid: [0; 16], }, wal_data: record, }); diff --git a/safekeeper/src/json_ctrl.rs b/safekeeper/src/json_ctrl.rs index 19e17c4a75..8d7c1109ad 100644 --- a/safekeeper/src/json_ctrl.rs +++ b/safekeeper/src/json_ctrl.rs @@ -8,7 +8,7 @@ use anyhow::Context; use postgres_backend::QueryError; -use safekeeper_api::membership::Configuration; +use safekeeper_api::membership::{Configuration, INVALID_GENERATION}; use safekeeper_api::{ServerInfo, Term}; use serde::{Deserialize, Serialize}; use tokio::io::{AsyncRead, AsyncWrite}; @@ -133,10 +133,10 @@ async fn 
send_proposer_elected( let history = TermHistory(history_entries); let proposer_elected_request = ProposerAcceptorMessage::Elected(ProposerElected { + generation: INVALID_GENERATION, term, start_streaming_at: lsn, term_history: history, - timeline_start_lsn: lsn, }); tli.process_msg(&proposer_elected_request).await?; @@ -170,13 +170,12 @@ pub async fn append_logical_message( let append_request = ProposerAcceptorMessage::AppendRequest(AppendRequest { h: AppendRequestHeader { + generation: INVALID_GENERATION, term: msg.term, - term_start_lsn: begin_lsn, begin_lsn, end_lsn, commit_lsn, truncate_lsn: msg.truncate_lsn, - proposer_uuid: [0u8; 16], }, wal_data, }); diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index cb42f6f414..a94e6930e1 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -281,7 +281,7 @@ impl SafekeeperPostgresHandler { tokio::select! { // todo: add read|write .context to these errors r = network_reader.run(msg_tx, msg_rx, reply_tx, timeline, next_msg) => r, - r = network_write(pgb, reply_rx, pageserver_feedback_rx) => r, + r = network_write(pgb, reply_rx, pageserver_feedback_rx, proto_version) => r, _ = timeline_cancel.cancelled() => { return Err(CopyStreamHandlerEnd::Cancelled); } @@ -342,8 +342,8 @@ impl NetworkReader<'_, IO> { let tli = match next_msg { ProposerAcceptorMessage::Greeting(ref greeting) => { info!( - "start handshake with walproposer {} sysid {} timeline {}", - self.peer_addr, greeting.system_id, greeting.tli, + "start handshake with walproposer {} sysid {}", + self.peer_addr, greeting.system_id, ); let server_info = ServerInfo { pg_version: greeting.pg_version, @@ -459,6 +459,7 @@ async fn network_write( pgb_writer: &mut PostgresBackend, mut reply_rx: Receiver, mut pageserver_feedback_rx: tokio::sync::broadcast::Receiver, + proto_version: u32, ) -> Result<(), CopyStreamHandlerEnd> { let mut buf = BytesMut::with_capacity(128); @@ -496,7 +497,7 @@ async fn network_write( }; buf.clear(); - msg.serialize(&mut buf)?; + msg.serialize(&mut buf, proto_version)?; pgb_writer.write_message(&BeMessage::CopyData(&buf)).await?; } } diff --git a/safekeeper/src/recovery.rs b/safekeeper/src/recovery.rs index 35394eb6ed..3e9080ebbe 100644 --- a/safekeeper/src/recovery.rs +++ b/safekeeper/src/recovery.rs @@ -7,6 +7,7 @@ use std::{fmt, pin::pin}; use anyhow::{bail, Context}; use futures::StreamExt; use postgres_protocol::message::backend::ReplicationMessage; +use safekeeper_api::membership::INVALID_GENERATION; use safekeeper_api::models::{PeerInfo, TimelineStatus}; use safekeeper_api::Term; use tokio::sync::mpsc::{channel, Receiver, Sender}; @@ -267,7 +268,10 @@ async fn recover( ); // Now understand our term history. - let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { term: donor.term }); + let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { + generation: INVALID_GENERATION, + term: donor.term, + }); let vote_response = match tli .process_msg(&vote_request) .await @@ -302,10 +306,10 @@ async fn recover( // truncate WAL locally let pe = ProposerAcceptorMessage::Elected(ProposerElected { + generation: INVALID_GENERATION, term: donor.term, start_streaming_at: last_common_point.lsn, term_history: donor_th, - timeline_start_lsn: Lsn::INVALID, }); // Successful ProposerElected handling always returns None. If term changed, // we'll find out that during the streaming. 
Note: it is expected to get @@ -437,13 +441,12 @@ async fn network_io( match msg { ReplicationMessage::XLogData(xlog_data) => { let ar_hdr = AppendRequestHeader { + generation: INVALID_GENERATION, term: donor.term, - term_start_lsn: Lsn::INVALID, // unused begin_lsn: Lsn(xlog_data.wal_start()), end_lsn: Lsn(xlog_data.wal_start()) + xlog_data.data().len() as u64, commit_lsn: Lsn::INVALID, // do not attempt to advance, peer communication anyway does it truncate_lsn: Lsn::INVALID, // do not attempt to advance - proposer_uuid: [0; 16], }; let ar = AppendRequest { h: ar_hdr, diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index f816f8459a..f429cafed2 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -5,6 +5,11 @@ use byteorder::{LittleEndian, ReadBytesExt}; use bytes::{Buf, BufMut, Bytes, BytesMut}; use postgres_ffi::{TimeLineID, MAX_SEND_SIZE}; +use safekeeper_api::membership; +use safekeeper_api::membership::MemberSet; +use safekeeper_api::membership::SafekeeperGeneration as Generation; +use safekeeper_api::membership::SafekeeperId; +use safekeeper_api::membership::INVALID_GENERATION; use safekeeper_api::models::HotStandbyFeedback; use safekeeper_api::Term; use serde::{Deserialize, Serialize}; @@ -12,6 +17,7 @@ use std::cmp::max; use std::cmp::min; use std::fmt; use std::io::Read; +use std::str::FromStr; use storage_broker::proto::SafekeeperTimelineInfo; use tracing::*; @@ -29,7 +35,8 @@ use utils::{ lsn::Lsn, }; -pub const SK_PROTOCOL_VERSION: u32 = 2; +pub const SK_PROTO_VERSION_2: u32 = 2; +pub const SK_PROTO_VERSION_3: u32 = 3; pub const UNKNOWN_SERVER_VERSION: u32 = 0; #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)] @@ -56,8 +63,28 @@ impl TermHistory { TermHistory(Vec::new()) } - // Parse TermHistory as n_entries followed by TermLsn pairs + // Parse TermHistory as n_entries followed by TermLsn pairs in network order. pub fn from_bytes(bytes: &mut Bytes) -> Result { + let n_entries = bytes + .get_u32_f() + .with_context(|| "TermHistory misses len")?; + let mut res = Vec::with_capacity(n_entries as usize); + for i in 0..n_entries { + let term = bytes + .get_u64_f() + .with_context(|| format!("TermHistory pos {} misses term", i))?; + let lsn = bytes + .get_u64_f() + .with_context(|| format!("TermHistory pos {} misses lsn", i))? + .into(); + res.push(TermLsn { term, lsn }) + } + Ok(TermHistory(res)) + } + + // Parse TermHistory as n_entries followed by TermLsn pairs in LE order. + // TODO remove once v2 protocol is fully dropped. + pub fn from_bytes_le(bytes: &mut Bytes) -> Result { if bytes.remaining() < 4 { bail!("TermHistory misses len"); } @@ -197,6 +224,18 @@ impl AcceptorState { /// Initial Proposer -> Acceptor message #[derive(Debug, Deserialize)] pub struct ProposerGreeting { + pub tenant_id: TenantId, + pub timeline_id: TimelineId, + pub mconf: membership::Configuration, + /// Postgres server version + pub pg_version: u32, + pub system_id: SystemId, + pub wal_seg_size: u32, +} + +/// V2 of the message; exists as a struct because we (de)serialized it as is. +#[derive(Debug, Deserialize)] +pub struct ProposerGreetingV2 { /// proposer-acceptor protocol version pub protocol_version: u32, /// Postgres server version @@ -213,27 +252,35 @@ pub struct ProposerGreeting { /// (acceptor voted for). 
#[derive(Debug, Serialize)] pub struct AcceptorGreeting { - term: u64, node_id: NodeId, + mconf: membership::Configuration, + term: u64, } /// Vote request sent from proposer to safekeepers -#[derive(Debug, Deserialize)] +#[derive(Debug)] pub struct VoteRequest { + pub generation: Generation, + pub term: Term, +} + +/// V2 of the message; exists as a struct because we (de)serialized it as is. +#[derive(Debug, Deserialize)] +pub struct VoteRequestV2 { pub term: Term, } /// Vote itself, sent from safekeeper to proposer #[derive(Debug, Serialize)] pub struct VoteResponse { + generation: Generation, // membership conf generation pub term: Term, // safekeeper's current term; if it is higher than proposer's, the compute is out of date. - vote_given: u64, // fixme u64 due to padding + vote_given: bool, // Safekeeper flush_lsn (end of WAL) + history of term switches allow // proposer to choose the most advanced one. pub flush_lsn: Lsn, truncate_lsn: Lsn, pub term_history: TermHistory, - timeline_start_lsn: Lsn, } /* @@ -242,10 +289,10 @@ pub struct VoteResponse { */ #[derive(Debug)] pub struct ProposerElected { + pub generation: Generation, // membership conf generation pub term: Term, pub start_streaming_at: Lsn, pub term_history: TermHistory, - pub timeline_start_lsn: Lsn, } /// Request with WAL message sent from proposer to safekeeper. Along the way it @@ -257,6 +304,22 @@ pub struct AppendRequest { } #[derive(Debug, Clone, Deserialize)] pub struct AppendRequestHeader { + pub generation: Generation, // membership conf generation + // safekeeper's current term; if it is higher than proposer's, the compute is out of date. + pub term: Term, + /// start position of message in WAL + pub begin_lsn: Lsn, + /// end position of message in WAL + pub end_lsn: Lsn, + /// LSN committed by quorum of safekeepers + pub commit_lsn: Lsn, + /// minimal LSN which may be needed by proposer to perform recovery of some safekeeper + pub truncate_lsn: Lsn, +} + +/// V2 of the message; exists as a struct because we (de)serialized it as is. +#[derive(Debug, Clone, Deserialize)] +pub struct AppendRequestHeaderV2 { // safekeeper's current term; if it is higher than proposer's, the compute is out of date. pub term: Term, // TODO: remove this field from the protocol, it in unused -- LSN of term @@ -277,6 +340,9 @@ pub struct AppendRequestHeader { /// Report safekeeper state to proposer #[derive(Debug, Serialize, Clone)] pub struct AppendResponse { + // Membership conf generation. Not strictly required because on mismatch + // connection is reset, but let's sanity check it. + generation: Generation, // Current term of the safekeeper; if it is higher than proposer's, the // compute is out of date. pub term: Term, @@ -293,8 +359,9 @@ pub struct AppendResponse { } impl AppendResponse { - fn term_only(term: Term) -> AppendResponse { + fn term_only(generation: Generation, term: Term) -> AppendResponse { AppendResponse { + generation, term, flush_lsn: Lsn(0), commit_lsn: Lsn(0), @@ -315,72 +382,322 @@ pub enum ProposerAcceptorMessage { FlushWAL, } -impl ProposerAcceptorMessage { - /// Parse proposer message. - pub fn parse(msg_bytes: Bytes, proto_version: u32) -> Result { - if proto_version != SK_PROTOCOL_VERSION { - bail!( - "incompatible protocol version {}, expected {}", - proto_version, - SK_PROTOCOL_VERSION - ); +/// Augment Bytes with fallible get_uN where N is number of bytes methods. +/// All reads are in network (big endian) order. 
+trait BytesF { + fn get_u8_f(&mut self) -> Result; + fn get_u16_f(&mut self) -> Result; + fn get_u32_f(&mut self) -> Result; + fn get_u64_f(&mut self) -> Result; +} + +impl BytesF for Bytes { + fn get_u8_f(&mut self) -> Result { + if self.is_empty() { + bail!("no bytes left, expected 1"); } - // xxx using Reader is inefficient but easy to work with bincode - let mut stream = msg_bytes.reader(); - // u64 is here to avoid padding; it will be removed once we stop packing C structs into the wire as is - let tag = stream.read_u64::()? as u8 as char; - match tag { - 'g' => { - let msg = ProposerGreeting::des_from(&mut stream)?; - Ok(ProposerAcceptorMessage::Greeting(msg)) - } - 'v' => { - let msg = VoteRequest::des_from(&mut stream)?; - Ok(ProposerAcceptorMessage::VoteRequest(msg)) - } - 'e' => { - let mut msg_bytes = stream.into_inner(); - if msg_bytes.remaining() < 16 { - bail!("ProposerElected message is not complete"); - } - let term = msg_bytes.get_u64_le(); - let start_streaming_at = msg_bytes.get_u64_le().into(); - let term_history = TermHistory::from_bytes(&mut msg_bytes)?; - if msg_bytes.remaining() < 8 { - bail!("ProposerElected message is not complete"); - } - let timeline_start_lsn = msg_bytes.get_u64_le().into(); - let msg = ProposerElected { - term, - start_streaming_at, - timeline_start_lsn, - term_history, + Ok(self.get_u8()) + } + fn get_u16_f(&mut self) -> Result { + if self.remaining() < 2 { + bail!("no bytes left, expected 2"); + } + Ok(self.get_u16()) + } + fn get_u32_f(&mut self) -> Result { + if self.remaining() < 4 { + bail!("only {} bytes left, expected 4", self.remaining()); + } + Ok(self.get_u32()) + } + fn get_u64_f(&mut self) -> Result { + if self.remaining() < 8 { + bail!("only {} bytes left, expected 8", self.remaining()); + } + Ok(self.get_u64()) + } +} + +impl ProposerAcceptorMessage { + /// Read cstring from Bytes. + fn get_cstr(buf: &mut Bytes) -> Result { + let pos = buf + .iter() + .position(|x| *x == 0) + .ok_or_else(|| anyhow::anyhow!("missing cstring terminator"))?; + let result = buf.split_to(pos); + buf.advance(1); // drop the null terminator + match std::str::from_utf8(&result) { + Ok(s) => Ok(s.to_string()), + Err(e) => bail!("invalid utf8 in cstring: {}", e), + } + } + + /// Read membership::Configuration from Bytes. + fn get_mconf(buf: &mut Bytes) -> Result { + let generation = Generation::new(buf.get_u32_f().with_context(|| "reading generation")?); + let members_len = buf.get_u32_f().with_context(|| "reading members_len")?; + // Main member set must have at least someone in valid configuration. + // Empty conf is allowed until we fully migrate. + if generation != INVALID_GENERATION && members_len == 0 { + bail!("empty members_len"); + } + let mut members = MemberSet::empty(); + for i in 0..members_len { + let id = buf + .get_u64_f() + .with_context(|| format!("reading member {} node_id", i))?; + let host = Self::get_cstr(buf).with_context(|| format!("reading member {} host", i))?; + let pg_port = buf + .get_u16_f() + .with_context(|| format!("reading member {} port", i))?; + let sk = SafekeeperId { + id: NodeId(id), + host, + pg_port, + }; + members.add(sk)?; + } + let new_members_len = buf.get_u32_f().with_context(|| "reading new_members_len")?; + // Non joint conf. 
+ if new_members_len == 0 { + Ok(membership::Configuration { + generation, + members, + new_members: None, + }) + } else { + let mut new_members = MemberSet::empty(); + for i in 0..new_members_len { + let id = buf + .get_u64_f() + .with_context(|| format!("reading new member {} node_id", i))?; + let host = Self::get_cstr(buf) + .with_context(|| format!("reading new member {} host", i))?; + let pg_port = buf + .get_u16_f() + .with_context(|| format!("reading new member {} port", i))?; + let sk = SafekeeperId { + id: NodeId(id), + host, + pg_port, }; - Ok(ProposerAcceptorMessage::Elected(msg)) + new_members.add(sk)?; } - 'a' => { - // read header followed by wal data - let hdr = AppendRequestHeader::des_from(&mut stream)?; - let rec_size = hdr - .end_lsn - .checked_sub(hdr.begin_lsn) - .context("begin_lsn > end_lsn in AppendRequest")? - .0 as usize; - if rec_size > MAX_SEND_SIZE { - bail!( - "AppendRequest is longer than MAX_SEND_SIZE ({})", - MAX_SEND_SIZE - ); + Ok(membership::Configuration { + generation, + members, + new_members: Some(new_members), + }) + } + } + + /// Parse proposer message. + pub fn parse(mut msg_bytes: Bytes, proto_version: u32) -> Result { + if proto_version == SK_PROTO_VERSION_3 { + if msg_bytes.is_empty() { + bail!("ProposerAcceptorMessage is not complete: missing tag"); + } + let tag = msg_bytes.get_u8_f().with_context(|| { + "ProposerAcceptorMessage is not complete: missing tag".to_string() + })? as char; + match tag { + 'g' => { + let tenant_id_str = + Self::get_cstr(&mut msg_bytes).with_context(|| "reading tenant_id")?; + let tenant_id = TenantId::from_str(&tenant_id_str)?; + let timeline_id_str = + Self::get_cstr(&mut msg_bytes).with_context(|| "reading timeline_id")?; + let timeline_id = TimelineId::from_str(&timeline_id_str)?; + let mconf = Self::get_mconf(&mut msg_bytes)?; + let pg_version = msg_bytes + .get_u32_f() + .with_context(|| "reading pg_version")?; + let system_id = msg_bytes.get_u64_f().with_context(|| "reading system_id")?; + let wal_seg_size = msg_bytes + .get_u32_f() + .with_context(|| "reading wal_seg_size")?; + let g = ProposerGreeting { + tenant_id, + timeline_id, + mconf, + pg_version, + system_id, + wal_seg_size, + }; + Ok(ProposerAcceptorMessage::Greeting(g)) } + 'v' => { + let generation = Generation::new( + msg_bytes + .get_u32_f() + .with_context(|| "reading generation")?, + ); + let term = msg_bytes.get_u64_f().with_context(|| "reading term")?; + let v = VoteRequest { generation, term }; + Ok(ProposerAcceptorMessage::VoteRequest(v)) + } + 'e' => { + let generation = Generation::new( + msg_bytes + .get_u32_f() + .with_context(|| "reading generation")?, + ); + let term = msg_bytes.get_u64_f().with_context(|| "reading term")?; + let start_streaming_at: Lsn = msg_bytes + .get_u64_f() + .with_context(|| "reading start_streaming_at")? + .into(); + let term_history = TermHistory::from_bytes(&mut msg_bytes)?; + let msg = ProposerElected { + generation, + term, + start_streaming_at, + term_history, + }; + Ok(ProposerAcceptorMessage::Elected(msg)) + } + 'a' => { + let generation = Generation::new( + msg_bytes + .get_u32_f() + .with_context(|| "reading generation")?, + ); + let term = msg_bytes.get_u64_f().with_context(|| "reading term")?; + let begin_lsn: Lsn = msg_bytes + .get_u64_f() + .with_context(|| "reading begin_lsn")? + .into(); + let end_lsn: Lsn = msg_bytes + .get_u64_f() + .with_context(|| "reading end_lsn")? + .into(); + let commit_lsn: Lsn = msg_bytes + .get_u64_f() + .with_context(|| "reading commit_lsn")? 
+ .into(); + let truncate_lsn: Lsn = msg_bytes + .get_u64_f() + .with_context(|| "reading truncate_lsn")? + .into(); + let hdr = AppendRequestHeader { + generation, + term, + begin_lsn, + end_lsn, + commit_lsn, + truncate_lsn, + }; + let rec_size = hdr + .end_lsn + .checked_sub(hdr.begin_lsn) + .context("begin_lsn > end_lsn in AppendRequest")? + .0 as usize; + if rec_size > MAX_SEND_SIZE { + bail!( + "AppendRequest is longer than MAX_SEND_SIZE ({})", + MAX_SEND_SIZE + ); + } + if msg_bytes.remaining() < rec_size { + bail!( + "reading WAL: only {} bytes left, wanted {}", + msg_bytes.remaining(), + rec_size + ); + } + let wal_data = msg_bytes.copy_to_bytes(rec_size); + let msg = AppendRequest { h: hdr, wal_data }; - let mut wal_data_vec: Vec = vec![0; rec_size]; - stream.read_exact(&mut wal_data_vec)?; - let wal_data = Bytes::from(wal_data_vec); - let msg = AppendRequest { h: hdr, wal_data }; - - Ok(ProposerAcceptorMessage::AppendRequest(msg)) + Ok(ProposerAcceptorMessage::AppendRequest(msg)) + } + _ => bail!("unknown proposer-acceptor message tag: {}", tag), } - _ => bail!("unknown proposer-acceptor message tag: {}", tag), + } else if proto_version == SK_PROTO_VERSION_2 { + // xxx using Reader is inefficient but easy to work with bincode + let mut stream = msg_bytes.reader(); + // u64 is here to avoid padding; it will be removed once we stop packing C structs into the wire as is + let tag = stream.read_u64::()? as u8 as char; + match tag { + 'g' => { + let msgv2 = ProposerGreetingV2::des_from(&mut stream)?; + let g = ProposerGreeting { + tenant_id: msgv2.tenant_id, + timeline_id: msgv2.timeline_id, + mconf: membership::Configuration { + generation: INVALID_GENERATION, + members: MemberSet::empty(), + new_members: None, + }, + pg_version: msgv2.pg_version, + system_id: msgv2.system_id, + wal_seg_size: msgv2.wal_seg_size, + }; + Ok(ProposerAcceptorMessage::Greeting(g)) + } + 'v' => { + let msg = VoteRequestV2::des_from(&mut stream)?; + let v = VoteRequest { + generation: INVALID_GENERATION, + term: msg.term, + }; + Ok(ProposerAcceptorMessage::VoteRequest(v)) + } + 'e' => { + let mut msg_bytes = stream.into_inner(); + if msg_bytes.remaining() < 16 { + bail!("ProposerElected message is not complete"); + } + let term = msg_bytes.get_u64_le(); + let start_streaming_at = msg_bytes.get_u64_le().into(); + let term_history = TermHistory::from_bytes_le(&mut msg_bytes)?; + if msg_bytes.remaining() < 8 { + bail!("ProposerElected message is not complete"); + } + let _timeline_start_lsn = msg_bytes.get_u64_le(); + let msg = ProposerElected { + generation: INVALID_GENERATION, + term, + start_streaming_at, + term_history, + }; + Ok(ProposerAcceptorMessage::Elected(msg)) + } + 'a' => { + // read header followed by wal data + let hdrv2 = AppendRequestHeaderV2::des_from(&mut stream)?; + let hdr = AppendRequestHeader { + generation: INVALID_GENERATION, + term: hdrv2.term, + begin_lsn: hdrv2.begin_lsn, + end_lsn: hdrv2.end_lsn, + commit_lsn: hdrv2.commit_lsn, + truncate_lsn: hdrv2.truncate_lsn, + }; + let rec_size = hdr + .end_lsn + .checked_sub(hdr.begin_lsn) + .context("begin_lsn > end_lsn in AppendRequest")? 
+ .0 as usize; + if rec_size > MAX_SEND_SIZE { + bail!( + "AppendRequest is longer than MAX_SEND_SIZE ({})", + MAX_SEND_SIZE + ); + } + + let mut wal_data_vec: Vec = vec![0; rec_size]; + stream.read_exact(&mut wal_data_vec)?; + let wal_data = Bytes::from(wal_data_vec); + + let msg = AppendRequest { h: hdr, wal_data }; + + Ok(ProposerAcceptorMessage::AppendRequest(msg)) + } + _ => bail!("unknown proposer-acceptor message tag: {}", tag), + } + } else { + bail!("unsupported protocol version {}", proto_version); } } @@ -394,36 +711,21 @@ impl ProposerAcceptorMessage { // We explicitly list all fields, to draw attention here when new fields are added. let mut size = BASE_SIZE; size += match self { - Self::Greeting(ProposerGreeting { - protocol_version: _, - pg_version: _, - proposer_id: _, - system_id: _, - timeline_id: _, - tenant_id: _, - tli: _, - wal_seg_size: _, - }) => 0, + Self::Greeting(_) => 0, - Self::VoteRequest(VoteRequest { term: _ }) => 0, + Self::VoteRequest(_) => 0, - Self::Elected(ProposerElected { - term: _, - start_streaming_at: _, - term_history: _, - timeline_start_lsn: _, - }) => 0, + Self::Elected(_) => 0, Self::AppendRequest(AppendRequest { h: AppendRequestHeader { + generation: _, term: _, - term_start_lsn: _, begin_lsn: _, end_lsn: _, commit_lsn: _, truncate_lsn: _, - proposer_uuid: _, }, wal_data, }) => wal_data.len(), @@ -431,13 +733,12 @@ impl ProposerAcceptorMessage { Self::NoFlushAppendRequest(AppendRequest { h: AppendRequestHeader { + generation: _, term: _, - term_start_lsn: _, begin_lsn: _, end_lsn: _, commit_lsn: _, truncate_lsn: _, - proposer_uuid: _, }, wal_data, }) => wal_data.len(), @@ -458,45 +759,118 @@ pub enum AcceptorProposerMessage { } impl AcceptorProposerMessage { - /// Serialize acceptor -> proposer message. - pub fn serialize(&self, buf: &mut BytesMut) -> Result<()> { - match self { - AcceptorProposerMessage::Greeting(msg) => { - buf.put_u64_le('g' as u64); - buf.put_u64_le(msg.term); - buf.put_u64_le(msg.node_id.0); - } - AcceptorProposerMessage::VoteResponse(msg) => { - buf.put_u64_le('v' as u64); - buf.put_u64_le(msg.term); - buf.put_u64_le(msg.vote_given); - buf.put_u64_le(msg.flush_lsn.into()); - buf.put_u64_le(msg.truncate_lsn.into()); - buf.put_u32_le(msg.term_history.0.len() as u32); - for e in &msg.term_history.0 { - buf.put_u64_le(e.term); - buf.put_u64_le(e.lsn.into()); - } - buf.put_u64_le(msg.timeline_start_lsn.into()); - } - AcceptorProposerMessage::AppendResponse(msg) => { - buf.put_u64_le('a' as u64); - buf.put_u64_le(msg.term); - buf.put_u64_le(msg.flush_lsn.into()); - buf.put_u64_le(msg.commit_lsn.into()); - buf.put_i64_le(msg.hs_feedback.ts); - buf.put_u64_le(msg.hs_feedback.xmin); - buf.put_u64_le(msg.hs_feedback.catalog_xmin); + fn put_cstr(buf: &mut BytesMut, s: &str) { + buf.put_slice(s.as_bytes()); + buf.put_u8(0); // null terminator + } - // AsyncReadMessage in walproposer.c will not try to decode pageserver_feedback - // if it is not present. - if let Some(ref msg) = msg.pageserver_feedback { - msg.serialize(buf); - } - } + /// Serialize membership::Configuration into buf. 
+ fn serialize_mconf(buf: &mut BytesMut, mconf: &membership::Configuration) { + buf.put_u32(mconf.generation.into_inner()); + buf.put_u32(mconf.members.m.len() as u32); + for sk in &mconf.members.m { + buf.put_u64(sk.id.0); + Self::put_cstr(buf, &sk.host); + buf.put_u16(sk.pg_port); } + if let Some(ref new_members) = mconf.new_members { + buf.put_u32(new_members.m.len() as u32); + for sk in &new_members.m { + buf.put_u64(sk.id.0); + Self::put_cstr(buf, &sk.host); + buf.put_u16(sk.pg_port); + } + } else { + buf.put_u32(0); + } + } - Ok(()) + /// Serialize acceptor -> proposer message. + pub fn serialize(&self, buf: &mut BytesMut, proto_version: u32) -> Result<()> { + if proto_version == SK_PROTO_VERSION_3 { + match self { + AcceptorProposerMessage::Greeting(msg) => { + buf.put_u8(b'g'); + buf.put_u64(msg.node_id.0); + Self::serialize_mconf(buf, &msg.mconf); + buf.put_u64(msg.term) + } + AcceptorProposerMessage::VoteResponse(msg) => { + buf.put_u8(b'v'); + buf.put_u32(msg.generation.into_inner()); + buf.put_u64(msg.term); + buf.put_u8(msg.vote_given as u8); + buf.put_u64(msg.flush_lsn.into()); + buf.put_u64(msg.truncate_lsn.into()); + buf.put_u32(msg.term_history.0.len() as u32); + for e in &msg.term_history.0 { + buf.put_u64(e.term); + buf.put_u64(e.lsn.into()); + } + } + AcceptorProposerMessage::AppendResponse(msg) => { + buf.put_u8(b'a'); + buf.put_u32(msg.generation.into_inner()); + buf.put_u64(msg.term); + buf.put_u64(msg.flush_lsn.into()); + buf.put_u64(msg.commit_lsn.into()); + buf.put_i64(msg.hs_feedback.ts); + buf.put_u64(msg.hs_feedback.xmin); + buf.put_u64(msg.hs_feedback.catalog_xmin); + + // AsyncReadMessage in walproposer.c will not try to decode pageserver_feedback + // if it is not present. + if let Some(ref msg) = msg.pageserver_feedback { + msg.serialize(buf); + } + } + } + Ok(()) + // TODO remove 3 after converting all msgs + } else if proto_version == SK_PROTO_VERSION_2 { + match self { + AcceptorProposerMessage::Greeting(msg) => { + buf.put_u64_le('g' as u64); + // v2 didn't have mconf and fields were reordered + buf.put_u64_le(msg.term); + buf.put_u64_le(msg.node_id.0); + } + AcceptorProposerMessage::VoteResponse(msg) => { + // v2 didn't have generation, had u64 vote_given and timeline_start_lsn + buf.put_u64_le('v' as u64); + buf.put_u64_le(msg.term); + buf.put_u64_le(msg.vote_given as u64); + buf.put_u64_le(msg.flush_lsn.into()); + buf.put_u64_le(msg.truncate_lsn.into()); + buf.put_u32_le(msg.term_history.0.len() as u32); + for e in &msg.term_history.0 { + buf.put_u64_le(e.term); + buf.put_u64_le(e.lsn.into()); + } + // removed timeline_start_lsn + buf.put_u64_le(0); + } + AcceptorProposerMessage::AppendResponse(msg) => { + // v2 didn't have generation + buf.put_u64_le('a' as u64); + buf.put_u64_le(msg.term); + buf.put_u64_le(msg.flush_lsn.into()); + buf.put_u64_le(msg.commit_lsn.into()); + buf.put_i64_le(msg.hs_feedback.ts); + buf.put_u64_le(msg.hs_feedback.xmin); + buf.put_u64_le(msg.hs_feedback.catalog_xmin); + + // AsyncReadMessage in walproposer.c will not try to decode pageserver_feedback + // if it is not present. 
+ if let Some(ref msg) = msg.pageserver_feedback { + msg.serialize(buf); + } + } + } + Ok(()) + } else { + bail!("unsupported protocol version {}", proto_version); + } } } @@ -593,14 +967,6 @@ where &mut self, msg: &ProposerGreeting, ) -> Result> { - // Check protocol compatibility - if msg.protocol_version != SK_PROTOCOL_VERSION { - bail!( - "incompatible protocol version {}, expected {}", - msg.protocol_version, - SK_PROTOCOL_VERSION - ); - } /* Postgres major version mismatch is treated as fatal error * because safekeepers parse WAL headers and the format * may change between versions. @@ -655,15 +1021,16 @@ where self.state.finish_change(&state).await?; } - info!( - "processed greeting from walproposer {}, sending term {:?}", - msg.proposer_id.map(|b| format!("{:X}", b)).join(""), - self.state.acceptor_state.term - ); - Ok(Some(AcceptorProposerMessage::Greeting(AcceptorGreeting { - term: self.state.acceptor_state.term, + let apg = AcceptorGreeting { node_id: self.node_id, - }))) + mconf: self.state.mconf.clone(), + term: self.state.acceptor_state.term, + }; + info!( + "processed greeting {:?} from walproposer, sending {:?}", + msg, apg + ); + Ok(Some(AcceptorProposerMessage::Greeting(apg))) } /// Give vote for the given term, if we haven't done that previously. @@ -684,12 +1051,12 @@ where self.wal_store.flush_wal().await?; // initialize with refusal let mut resp = VoteResponse { + generation: self.state.mconf.generation, term: self.state.acceptor_state.term, - vote_given: false as u64, + vote_given: false, flush_lsn: self.flush_lsn(), truncate_lsn: self.state.inmem.peer_horizon_lsn, term_history: self.get_term_history(), - timeline_start_lsn: self.state.timeline_start_lsn, }; if self.state.acceptor_state.term < msg.term { let mut state = self.state.start_change(); @@ -698,15 +1065,16 @@ where self.state.finish_change(&state).await?; resp.term = self.state.acceptor_state.term; - resp.vote_given = true as u64; + resp.vote_given = true; } - info!("processed VoteRequest for term {}: {:?}", msg.term, &resp); + info!("processed {:?}: sending {:?}", msg, &resp); Ok(Some(AcceptorProposerMessage::VoteResponse(resp))) } /// Form AppendResponse from current state. fn append_response(&self) -> AppendResponse { let ar = AppendResponse { + generation: self.state.mconf.generation, term: self.state.acceptor_state.term, flush_lsn: self.flush_lsn(), commit_lsn: self.state.commit_lsn, @@ -805,18 +1173,22 @@ where // Here we learn initial LSN for the first time, set fields // interested in that. - if state.timeline_start_lsn == Lsn(0) { - // Remember point where WAL begins globally. - state.timeline_start_lsn = msg.timeline_start_lsn; - info!( - "setting timeline_start_lsn to {:?}", - state.timeline_start_lsn - ); + if let Some(start_lsn) = msg.term_history.0.first() { + if state.timeline_start_lsn == Lsn(0) { + // Remember point where WAL begins globally. In the future it + // will be intialized immediately on timeline creation. + state.timeline_start_lsn = start_lsn.lsn; + info!( + "setting timeline_start_lsn to {:?}", + state.timeline_start_lsn + ); + } } + if state.peer_horizon_lsn == Lsn(0) { // Update peer_horizon_lsn as soon as we know where timeline starts. // It means that peer_horizon_lsn cannot be zero after we know timeline_start_lsn. 
- state.peer_horizon_lsn = msg.timeline_start_lsn; + state.peer_horizon_lsn = state.timeline_start_lsn; } if state.local_start_lsn == Lsn(0) { state.local_start_lsn = msg.start_streaming_at; @@ -896,7 +1268,10 @@ where // If our term is higher, immediately refuse the message. if self.state.acceptor_state.term > msg.h.term { - let resp = AppendResponse::term_only(self.state.acceptor_state.term); + let resp = AppendResponse::term_only( + self.state.mconf.generation, + self.state.acceptor_state.term, + ); return Ok(Some(AcceptorProposerMessage::AppendResponse(resp))); } @@ -924,10 +1299,8 @@ where ); } - // Now we know that we are in the same term as the proposer, - // processing the message. - - self.state.inmem.proposer_uuid = msg.h.proposer_uuid; + // Now we know that we are in the same term as the proposer, process the + // message. // do the job if !msg.wal_data.is_empty() { @@ -1097,10 +1470,13 @@ mod tests { let mut sk = SafeKeeper::new(TimelineState::new(storage), wal_store, NodeId(0)).unwrap(); // check voting for 1 is ok - let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { term: 1 }); + let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { + generation: Generation::new(0), + term: 1, + }); let mut vote_resp = sk.process_msg(&vote_request).await; match vote_resp.unwrap() { - Some(AcceptorProposerMessage::VoteResponse(resp)) => assert!(resp.vote_given != 0), + Some(AcceptorProposerMessage::VoteResponse(resp)) => assert!(resp.vote_given), r => panic!("unexpected response: {:?}", r), } @@ -1115,7 +1491,7 @@ mod tests { // and ensure voting second time for 1 is not ok vote_resp = sk.process_msg(&vote_request).await; match vote_resp.unwrap() { - Some(AcceptorProposerMessage::VoteResponse(resp)) => assert!(resp.vote_given == 0), + Some(AcceptorProposerMessage::VoteResponse(resp)) => assert!(!resp.vote_given), r => panic!("unexpected response: {:?}", r), } } @@ -1130,13 +1506,12 @@ mod tests { let mut sk = SafeKeeper::new(TimelineState::new(storage), wal_store, NodeId(0)).unwrap(); let mut ar_hdr = AppendRequestHeader { + generation: Generation::new(0), term: 2, - term_start_lsn: Lsn(3), begin_lsn: Lsn(1), end_lsn: Lsn(2), commit_lsn: Lsn(0), truncate_lsn: Lsn(0), - proposer_uuid: [0; 16], }; let mut append_request = AppendRequest { h: ar_hdr.clone(), @@ -1144,6 +1519,7 @@ mod tests { }; let pem = ProposerElected { + generation: Generation::new(0), term: 2, start_streaming_at: Lsn(1), term_history: TermHistory(vec![ @@ -1156,7 +1532,6 @@ mod tests { lsn: Lsn(3), }, ]), - timeline_start_lsn: Lsn(1), }; sk.process_msg(&ProposerAcceptorMessage::Elected(pem)) .await @@ -1191,26 +1566,25 @@ mod tests { let mut sk = SafeKeeper::new(TimelineState::new(storage), wal_store, NodeId(0)).unwrap(); let pem = ProposerElected { + generation: Generation::new(0), term: 1, start_streaming_at: Lsn(1), term_history: TermHistory(vec![TermLsn { term: 1, lsn: Lsn(1), }]), - timeline_start_lsn: Lsn(1), }; sk.process_msg(&ProposerAcceptorMessage::Elected(pem)) .await .unwrap(); let ar_hdr = AppendRequestHeader { + generation: Generation::new(0), term: 1, - term_start_lsn: Lsn(3), begin_lsn: Lsn(1), end_lsn: Lsn(2), commit_lsn: Lsn(0), truncate_lsn: Lsn(0), - proposer_uuid: [0; 16], }; let append_request = AppendRequest { h: ar_hdr.clone(), diff --git a/safekeeper/src/test_utils.rs b/safekeeper/src/test_utils.rs index 79ceddd366..32af4537d3 100644 --- a/safekeeper/src/test_utils.rs +++ b/safekeeper/src/test_utils.rs @@ -14,6 +14,7 @@ use crate::wal_backup::remote_timeline_path; use 
crate::{control_file, receive_wal, wal_storage, SafeKeeperConf}; use camino_tempfile::Utf8TempDir; use postgres_ffi::v17::wal_generator::{LogicalMessageGenerator, WalGenerator}; +use safekeeper_api::membership::SafekeeperGeneration as Generation; use tokio::fs::create_dir_all; use utils::id::{NodeId, TenantTimelineId}; use utils::lsn::Lsn; @@ -73,10 +74,10 @@ impl Env { // Emulate an initial election. safekeeper .process_msg(&ProposerAcceptorMessage::Elected(ProposerElected { + generation: Generation::new(0), term: 1, start_streaming_at: start_lsn, term_history: TermHistory(vec![(1, start_lsn).into()]), - timeline_start_lsn: start_lsn, })) .await?; @@ -146,13 +147,12 @@ impl Env { let req = AppendRequest { h: AppendRequestHeader { + generation: Generation::new(0), term: 1, - term_start_lsn: start_lsn, begin_lsn: lsn, end_lsn: lsn + record.len() as u64, commit_lsn: lsn, truncate_lsn: Lsn(0), - proposer_uuid: [0; 16], }, wal_data: record, }; diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs index 0023a4d22a..b9dfabe0d7 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper.rs @@ -15,9 +15,7 @@ use desim::{ }; use http::Uri; use safekeeper::{ - safekeeper::{ - ProposerAcceptorMessage, SafeKeeper, SK_PROTOCOL_VERSION, UNKNOWN_SERVER_VERSION, - }, + safekeeper::{ProposerAcceptorMessage, SafeKeeper, SK_PROTO_VERSION_3, UNKNOWN_SERVER_VERSION}, state::{TimelinePersistentState, TimelineState}, timeline::TimelineError, wal_storage::Storage, @@ -287,7 +285,7 @@ impl ConnState { bail!("finished processing START_REPLICATION") } - let msg = ProposerAcceptorMessage::parse(copy_data, SK_PROTOCOL_VERSION)?; + let msg = ProposerAcceptorMessage::parse(copy_data, SK_PROTO_VERSION_3)?; debug!("got msg: {:?}", msg); self.process(msg, global) } else { @@ -403,7 +401,7 @@ impl ConnState { // TODO: if this is AppendResponse, fill in proper hot standby feedback and disk consistent lsn let mut buf = BytesMut::with_capacity(128); - reply.serialize(&mut buf)?; + reply.serialize(&mut buf, SK_PROTO_VERSION_3)?; self.tcp.send(AnyMessage::Bytes(buf.into())); } diff --git a/test_runner/regress/test_normal_work.py b/test_runner/regress/test_normal_work.py index ae2d171058..c8458b963e 100644 --- a/test_runner/regress/test_normal_work.py +++ b/test_runner/regress/test_normal_work.py @@ -6,9 +6,14 @@ from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder from fixtures.pageserver.http import PageserverHttpClient -def check_tenant(env: NeonEnv, pageserver_http: PageserverHttpClient): +def check_tenant( + env: NeonEnv, pageserver_http: PageserverHttpClient, safekeeper_proto_version: int +): tenant_id, timeline_id = env.create_tenant() - endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) + config_lines = [ + f"neon.safekeeper_proto_version = {safekeeper_proto_version}", + ] + endpoint = env.endpoints.create_start("main", tenant_id=tenant_id, config_lines=config_lines) # we rely upon autocommit after each statement res_1 = endpoint.safe_psql_many( queries=[ @@ -33,7 +38,14 @@ def check_tenant(env: NeonEnv, pageserver_http: PageserverHttpClient): @pytest.mark.parametrize("num_timelines,num_safekeepers", [(3, 1)]) -def test_normal_work(neon_env_builder: NeonEnvBuilder, num_timelines: int, num_safekeepers: int): +# Test both proto versions until we fully migrate. 
+@pytest.mark.parametrize("safekeeper_proto_version", [2, 3]) +def test_normal_work( + neon_env_builder: NeonEnvBuilder, + num_timelines: int, + num_safekeepers: int, + safekeeper_proto_version: int, +): """ Basic test: * create new tenant with a timeline @@ -52,4 +64,4 @@ def test_normal_work(neon_env_builder: NeonEnvBuilder, num_timelines: int, num_s pageserver_http = env.pageserver.http_client() for _ in range(num_timelines): - check_tenant(env, pageserver_http) + check_tenant(env, pageserver_http, safekeeper_proto_version) diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index 936c774657..56539a0a08 100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -539,13 +539,16 @@ def test_recovery_uncommitted(neon_env_builder: NeonEnvBuilder): asyncio.run(run_recovery_uncommitted(env)) -async def run_wal_truncation(env: NeonEnv): +async def run_wal_truncation(env: NeonEnv, safekeeper_proto_version: int): tenant_id = env.initial_tenant timeline_id = env.initial_timeline (sk1, sk2, sk3) = env.safekeepers - ep = env.endpoints.create_start("main") + config_lines = [ + f"neon.safekeeper_proto_version = {safekeeper_proto_version}", + ] + ep = env.endpoints.create_start("main", config_lines=config_lines) ep.safe_psql("create table t (key int, value text)") ep.safe_psql("insert into t select generate_series(1, 100), 'payload'") @@ -572,6 +575,7 @@ async def run_wal_truncation(env: NeonEnv): sk2.start() ep = env.endpoints.create_start( "main", + config_lines=config_lines, ) ep.safe_psql("insert into t select generate_series(1, 200), 'payload'") @@ -590,11 +594,13 @@ async def run_wal_truncation(env: NeonEnv): # Simple deterministic test creating tail of WAL on safekeeper which is # truncated when majority without this sk elects walproposer starting earlier. -def test_wal_truncation(neon_env_builder: NeonEnvBuilder): +# Test both proto versions until we fully migrate. +@pytest.mark.parametrize("safekeeper_proto_version", [2, 3]) +def test_wal_truncation(neon_env_builder: NeonEnvBuilder, safekeeper_proto_version: int): neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() - asyncio.run(run_wal_truncation(env)) + asyncio.run(run_wal_truncation(env, safekeeper_proto_version)) async def run_segment_init_failure(env: NeonEnv): From 8f82c661d4d39c29b6b2cf0d616e6abc9aac0e25 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 25 Feb 2025 14:23:04 +0200 Subject: [PATCH 573/583] Move neon_pgstat_file_size_limit to the extension (#10959) ## Problem PG14 uses separate backend for stats collector having no access to shaerd memory. As far as AUX mechanism requires access to shared memory, persisting pgstat.stat file is not supported at pg14. And so there is no definition of `neon_pgstat_file_size_limit` variable. It makes it impossible to provide same config for all Postgres version. ## Summary of changes Move neon_pgstat_file_size_limit to Neon extension. 
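
For illustration, the approach is to register the GUC from the extension's `_PG_init()` so that `neon.pgstat_file_size_limit` is accepted on every supported Postgres version, even where persisting `pgstat.stat` is not implemented. This is only a sketch of that idea: the wrapper function name `neon_register_pgstat_file_size_limit` is invented for readability (in the real change the call sits directly in `_PG_init()`), and the authoritative code is the `pgxn/neon/neon.c` hunk below.

```c
#include "postgres.h"
#include "pgstat.h"
#include "utils/guc.h"

#if PG_VERSION_NUM < 150000
/* PG14 never persists pgstat.stat, so the GUC is accepted but unused there. */
static int	neon_pgstat_file_size_limit;
#else
/* On PG15+ the variable is provided by the patched core (see submodule PRs below). */
extern int	neon_pgstat_file_size_limit;
#endif

/* Hypothetical helper wrapping the registration shown in the diff below. */
static void
neon_register_pgstat_file_size_limit(void)
{
	DefineCustomIntVariable("neon.pgstat_file_size_limit",
							"Maximal size of pgstat.stat file saved in Neon storage",
							"Zero value disables persisting pgstat.stat file",
							&neon_pgstat_file_size_limit,
							0, 0, 1000000,	/* disabled by default */
							PGC_SIGHUP,
							GUC_UNIT_KB,
							NULL, NULL, NULL);
}
```

With the definition living in the extension, the same compute configuration can be shipped for all Postgres versions; on PG14 the setting simply has no effect.
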
Postgres submodules PR: https://github.com/neondatabase/postgres/pull/587 https://github.com/neondatabase/postgres/pull/588 https://github.com/neondatabase/postgres/pull/589 --------- Co-authored-by: Konstantin Knizhnik Co-authored-by: Tristan Partin --- pgxn/neon/neon.c | 20 ++++++++++++++++++++ test_runner/regress/test_pgstat.py | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/postgres-v17 | 2 +- vendor/revisions.json | 6 +++--- 6 files changed, 27 insertions(+), 7 deletions(-) diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index 700a942284..768d7ae9e8 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -12,6 +12,7 @@ #include "fmgr.h" #include "miscadmin.h" +#include "pgstat.h" #include "access/subtrans.h" #include "access/twophase.h" #include "access/xlog.h" @@ -410,6 +411,16 @@ ReportSearchPath(void) } } +#if PG_VERSION_NUM < 150000 +/* + * PG14 uses separate backend for stats collector having no access to shared memory. + * As far as AUX mechanism requires access to shared memory, persisting pgstat.stat file + * is not supported in PG14. And so there is no definition of neon_pgstat_file_size_limit + * variable, so we have to declare it here. + */ +static int neon_pgstat_file_size_limit; +#endif + void _PG_init(void) { @@ -467,6 +478,15 @@ _PG_init(void) 0, NULL, NULL, NULL); + DefineCustomIntVariable("neon.pgstat_file_size_limit", + "Maximal size of pgstat.stat file saved in Neon storage", + "Zero value disables persisting pgstat.stat file", + &neon_pgstat_file_size_limit, + 0, 0, 1000000, /* disabled by default */ + PGC_SIGHUP, + GUC_UNIT_KB, + NULL, NULL, NULL); + /* * Important: This must happen after other parts of the extension are * loaded, otherwise any settings to GUCs that were set before the diff --git a/test_runner/regress/test_pgstat.py b/test_runner/regress/test_pgstat.py index c31e5ef7f8..bf9b982e14 100644 --- a/test_runner/regress/test_pgstat.py +++ b/test_runner/regress/test_pgstat.py @@ -13,7 +13,7 @@ def test_pgstat(neon_simple_env: NeonEnv): n = 10000 endpoint = env.endpoints.create_start( - "main", config_lines=["neon_pgstat_file_size_limit=100kB", "autovacuum=off"] + "main", config_lines=["neon.pgstat_file_size_limit=100kB", "autovacuum=off"] ) con = endpoint.connect() diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 6ff5044377..9b118b1cff 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 6ff50443773b69749e16da6db9d4f4b19064b4b7 +Subproject commit 9b118b1cffa6e4ca0d63389b57b54d11e207e9a8 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 261ed10e9b..799e7a08dd 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 261ed10e9b8c8dda01ad7aefb18e944e30aa161d +Subproject commit 799e7a08dd171aa06a7395dd326f4243aaeb9f93 diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index 59b2fe851f..517b8dc244 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit 59b2fe851f8e0595f6c830b90ee766f4f1c17a0f +Subproject commit 517b8dc244abf3e56f0089849e464af76f70b94e diff --git a/vendor/revisions.json b/vendor/revisions.json index f85cec3a0b..8dde46a01e 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,15 +1,15 @@ { "v17": [ "17.4", - "59b2fe851f8e0595f6c830b90ee766f4f1c17a0f" + "517b8dc244abf3e56f0089849e464af76f70b94e" ], "v16": [ "16.8", - "261ed10e9b8c8dda01ad7aefb18e944e30aa161d" + "799e7a08dd171aa06a7395dd326f4243aaeb9f93" ], "v15": [ "15.12", - "6ff50443773b69749e16da6db9d4f4b19064b4b7" + 
"9b118b1cffa6e4ca0d63389b57b54d11e207e9a8" ], "v14": [ "14.17", From f4fefd9f2fbcc06a319d082b00a7868de9084dd3 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Tue, 25 Feb 2025 13:29:27 +0100 Subject: [PATCH 574/583] pre-commit: Switch to cargo fmt to handle per-crate editions (#10969) cargo knows what edition each crate uses. --- pre-commit.py | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/pre-commit.py b/pre-commit.py index c9567e0c50..09139459d5 100755 --- a/pre-commit.py +++ b/pre-commit.py @@ -29,12 +29,12 @@ def colorify( return f"{color.value}{s}{NC}" -def rustfmt(fix_inplace: bool = False, no_color: bool = False) -> str: - cmd = "rustfmt --edition=2021" +def cargo_fmt(fix_inplace: bool = False, no_color: bool = False) -> str: + cmd = "cargo fmt" if not fix_inplace: cmd += " --check" if no_color: - cmd += " --color=never" + cmd += " -- --color=never" return cmd @@ -61,14 +61,23 @@ def get_commit_files() -> list[str]: return files.decode().splitlines() -def check(name: str, suffix: str, cmd: str, changed_files: list[str], no_color: bool = False): +def check( + name: str, + suffix: str, + cmd: str, + changed_files: list[str], + no_color: bool = False, + append_files_to_cmd: bool = True, +): print(f"Checking: {name} ", end="") applicable_files = list(filter(lambda fname: fname.strip().endswith(suffix), changed_files)) if not applicable_files: print(colorify("[NOT APPLICABLE]", Color.CYAN, no_color)) return - cmd = f'{cmd} {" ".join(applicable_files)}' + if append_files_to_cmd: + cmd = f"{cmd} {' '.join(applicable_files)}" + res = subprocess.run(cmd.split(), capture_output=True) if res.returncode != 0: print(colorify("[FAILED]", Color.RED, no_color)) @@ -100,15 +109,13 @@ if __name__ == "__main__": args = parser.parse_args() files = get_commit_files() - # we use rustfmt here because cargo fmt does not accept list of files - # it internally gathers project files and feeds them to rustfmt - # so because we want to check only files included in the commit we use rustfmt directly check( - name="rustfmt", + name="cargo fmt", suffix=".rs", - cmd=rustfmt(fix_inplace=args.fix_inplace, no_color=args.no_color), + cmd=cargo_fmt(fix_inplace=args.fix_inplace, no_color=args.no_color), changed_files=files, no_color=args.no_color, + append_files_to_cmd=False, ) check( name="ruff check", From f78ac4474894664faae2597425e7dc05e7bbf562 Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Tue, 25 Feb 2025 13:44:06 +0100 Subject: [PATCH 575/583] Use the Dockerfile COPY instead of docker cp (#10943) ## Problem We use `docker cp` to copy the files required for the extension tests now. It causes problems if we run older images with the newer source tree. ## Summary of changes Copying the files was moved to the compute Dockerfile. 
--- .dockerignore | 1 + compute/compute-node.Dockerfile | 2 +- docker-compose/docker_compose_test.sh | 2 -- docker-compose/test_extensions_upgrade.sh | 1 - 4 files changed, 2 insertions(+), 4 deletions(-) diff --git a/.dockerignore b/.dockerignore index 7ead48db7c..9fafc2e4ba 100644 --- a/.dockerignore +++ b/.dockerignore @@ -14,6 +14,7 @@ !compute/ !compute_tools/ !control_plane/ +!docker-compose/ext-src !libs/ !pageserver/ !pgxn/ diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index a74291fdb4..0cdb44853f 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1818,7 +1818,7 @@ RUN make PG_VERSION="${PG_VERSION:?}" -C compute FROM pg-build AS extension-tests ARG PG_VERSION -RUN mkdir /ext-src +COPY docker-compose/ext-src/ /ext-src/ COPY --from=pg-build /postgres /postgres #COPY --from=postgis-src /ext-src/ /ext-src/ diff --git a/docker-compose/docker_compose_test.sh b/docker-compose/docker_compose_test.sh index 5b3cfc74eb..0f03d600a3 100755 --- a/docker-compose/docker_compose_test.sh +++ b/docker-compose/docker_compose_test.sh @@ -51,8 +51,6 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do done if [ $pg_version -ge 16 ]; then - docker cp ext-src $TEST_CONTAINER_NAME:/ - docker exec $TEST_CONTAINER_NAME bash -c "apt update && apt install -y libtap-parser-sourcehandler-pgtap-perl" # This is required for the pg_hint_plan test, to prevent flaky log message causing the test to fail # It cannot be moved to Dockerfile now because the database directory is created after the start of the container echo Adding dummy config diff --git a/docker-compose/test_extensions_upgrade.sh b/docker-compose/test_extensions_upgrade.sh index 06d351b496..c2168c47af 100755 --- a/docker-compose/test_extensions_upgrade.sh +++ b/docker-compose/test_extensions_upgrade.sh @@ -57,7 +57,6 @@ new_vers=$(docker compose exec neon-test-extensions psql -Aqt -d contrib_regress docker compose --profile test-extensions down TAG=${OLDTAG} docker compose --profile test-extensions up --quiet-pull --build -d --force-recreate wait_for_ready -docker compose cp ext-src neon-test-extensions:/ docker compose exec neon-test-extensions psql -c "DROP DATABASE IF EXISTS contrib_regression" docker compose exec neon-test-extensions psql -c "CREATE DATABASE contrib_regression" docker compose exec neon-test-extensions psql -c "CREATE DATABASE pgtap_regression" From 8deeddd4f0dcab486d41a8ffdd5d3e1d5da70f30 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 25 Feb 2025 15:49:41 +0100 Subject: [PATCH 576/583] pageserver: ignore `CollectKeySpaceError::Cancelled` during compaction (#10968) This pops up a few times during deployment. Not sure why it fires without `self.cancel` being cancelled, but could be e.g. ancestor timelines or sth. --- pageserver/src/tenant/timeline/compaction.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index d75591bd74..bfb610e0d9 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -779,6 +779,7 @@ impl Timeline { // Suppress errors when cancelled. Err(_) if self.cancel.is_cancelled() => {} Err(CompactionError::ShuttingDown) => {} + Err(CompactionError::CollectKeySpaceError(CollectKeySpaceError::Cancelled)) => {} // Alert on critical errors that indicate data corruption. Err( From b7fcf2c7a7fb47ebc736db7b201cde04995b21c5 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." 
<4198311+skyzh@users.noreply.github.com> Date: Tue, 25 Feb 2025 09:50:22 -0500 Subject: [PATCH 577/583] test(pageserver): add reldir v2 into tests (#10750) ## Problem We have `test_perf_many_relations` but it only runs on remote clusters, and we cannot directly modify tenant config. Therefore, I patched one of the current tests to benchmark relv2 performance. close https://github.com/neondatabase/neon/issues/9986 ## Summary of changes * Add `v1/v2` selector to `test_tx_abort_with_many_relations`. --------- Signed-off-by: Alex Chi Z --- .../performance/test_perf_many_relations.py | 51 ++++++++++ test_runner/regress/test_pg_regress.py | 93 ++++++++++++------- 2 files changed, 109 insertions(+), 35 deletions(-) diff --git a/test_runner/performance/test_perf_many_relations.py b/test_runner/performance/test_perf_many_relations.py index 0ee0efe8b9..2570c55f6c 100644 --- a/test_runner/performance/test_perf_many_relations.py +++ b/test_runner/performance/test_perf_many_relations.py @@ -2,8 +2,10 @@ import os from pathlib import Path import pytest +from fixtures.benchmark_fixture import NeonBenchmarker from fixtures.compare_fixtures import RemoteCompare from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder def get_num_relations(default: int = 1000) -> list[int]: @@ -64,3 +66,52 @@ def test_perf_many_relations(remote_compare: RemoteCompare, num_relations: int): env.pg_bin.run_capture( ["psql", env.pg.connstr(options="-cstatement_timeout=1000s "), "-c", sql] ) + + +def test_perf_simple_many_relations_reldir_v2( + neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker +): + """ + Test creating many relations in a single database. + """ + env = neon_env_builder.init_start(initial_tenant_conf={"rel_size_v2_enabled": "true"}) + ep = env.endpoints.create_start( + "main", + config_lines=[ + "shared_buffers=1000MB", + "max_locks_per_transaction=16384", + ], + ) + + n = 100000 + step = 5000 + # Create many relations + log.info(f"Creating {n} relations...") + begin = 0 + with zenbenchmark.record_duration("create_first_relation"): + ep.safe_psql("CREATE TABLE IF NOT EXISTS table_begin (id SERIAL PRIMARY KEY, data TEXT)") + with zenbenchmark.record_duration("create_many_relations"): + while True: + end = begin + step + ep.safe_psql_many( + [ + "BEGIN", + f"""DO $$ + DECLARE + i INT; + table_name TEXT; + BEGIN + FOR i IN {begin}..{end} LOOP + table_name := 'table_' || i; + EXECUTE 'CREATE TABLE IF NOT EXISTS ' || table_name || ' (id SERIAL PRIMARY KEY, data TEXT)'; + END LOOP; + END $$; + """, + "COMMIT", + ] + ) + begin = end + if begin >= n: + break + with zenbenchmark.record_duration("create_last_relation"): + ep.safe_psql(f"CREATE TABLE IF NOT EXISTS table_{begin} (id SERIAL PRIMARY KEY, data TEXT)") diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index 411888efbc..afc7ef3e01 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -332,8 +332,10 @@ def test_sql_regress( @skip_in_debug_build("only run with release build") +@pytest.mark.parametrize("reldir_type", ["v1", "v2"]) def test_tx_abort_with_many_relations( neon_env_builder: NeonEnvBuilder, + reldir_type: str, ): """ This is not a pg_regress test as such, but perhaps it should be -- this test exercises postgres @@ -342,7 +344,11 @@ def test_tx_abort_with_many_relations( Reproducer for https://github.com/neondatabase/neon/issues/9505 """ - env = neon_env_builder.init_start() + env = neon_env_builder.init_start( + 
initial_tenant_conf={ + "rel_size_v2_enabled": "true" if reldir_type == "v2" else "false", + } + ) ep = env.endpoints.create_start( "main", tenant_id=env.initial_tenant, @@ -354,48 +360,65 @@ def test_tx_abort_with_many_relations( # How many relations: this number is tuned to be long enough to take tens of seconds # if the rollback code path is buggy, tripping the test's timeout. - n = 4000 + if reldir_type == "v1": + n = 4000 + step = 4000 + else: + n = 100000 + step = 5000 def create(): # Create many relations log.info(f"Creating {n} relations...") - ep.safe_psql_many( - [ - "BEGIN", - f"""DO $$ - DECLARE - i INT; - table_name TEXT; - BEGIN - FOR i IN 1..{n} LOOP - table_name := 'table_' || i; - EXECUTE 'CREATE TABLE IF NOT EXISTS ' || table_name || ' (id SERIAL PRIMARY KEY, data TEXT)'; - END LOOP; - END $$; - """, - "COMMIT", - ] - ) + begin = 0 + while True: + end = begin + step + ep.safe_psql_many( + [ + "BEGIN", + f"""DO $$ + DECLARE + i INT; + table_name TEXT; + BEGIN + FOR i IN {begin}..{end} LOOP + table_name := 'table_' || i; + EXECUTE 'CREATE TABLE IF NOT EXISTS ' || table_name || ' (id SERIAL PRIMARY KEY, data TEXT)'; + END LOOP; + END $$; + """, + "COMMIT", + ] + ) + begin = end + if begin >= n: + break def truncate(): # Truncate relations, then roll back the transaction containing the truncations log.info(f"Truncating {n} relations...") - ep.safe_psql_many( - [ - "BEGIN", - f"""DO $$ - DECLARE - i INT; - table_name TEXT; - BEGIN - FOR i IN 1..{n} LOOP - table_name := 'table_' || i; - EXECUTE 'TRUNCATE ' || table_name ; - END LOOP; - END $$; - """, - ] - ) + begin = 0 + while True: + end = begin + step + ep.safe_psql_many( + [ + "BEGIN", + f"""DO $$ + DECLARE + i INT; + table_name TEXT; + BEGIN + FOR i IN {begin}..{end} LOOP + table_name := 'table_' || i; + EXECUTE 'TRUNCATE ' || table_name ; + END LOOP; + END $$; + """, + ] + ) + begin = end + if begin >= n: + break def rollback_and_wait(): log.info(f"Rolling back after truncating {n} relations...") From 015092d259b517f11ce98b2d19a9d3e9df3a633e Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Tue, 25 Feb 2025 09:50:39 -0500 Subject: [PATCH 578/583] feat(pageserver): add automatic trigger for gc-compaction (#10798) ## Problem part of https://github.com/neondatabase/neon/issues/9114 ## Summary of changes Add the auto trigger for gc-compaction. It computes two values: L1 size and L2 size. When L1 size >= initial trigger threshold, we will trigger an initial gc-compaction. When l1_size / l2_size >= gc_compaction_ratio_percent, we will trigger the "tiered" gc-compaction. --------- Signed-off-by: Alex Chi Z --- libs/pageserver_api/src/config.rs | 2 +- pageserver/src/http/routes.rs | 3 +- pageserver/src/tenant.rs | 33 +- .../src/tenant/remote_timeline_client.rs | 13 + .../tenant/remote_timeline_client/index.rs | 59 +++- pageserver/src/tenant/tasks.rs | 1 + pageserver/src/tenant/timeline.rs | 51 +++- pageserver/src/tenant/timeline/compaction.rs | 289 +++++++++++++++--- pageserver/src/tenant/timeline/delete.rs | 6 + pageserver/src/tenant/timeline/offload.rs | 7 + test_runner/regress/test_compaction.py | 53 ++++ 11 files changed, 446 insertions(+), 71 deletions(-) diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 1aff5a7012..5a695c04ed 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -584,7 +584,7 @@ pub mod tenant_conf_defaults { // image layers should be created. 
pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2; pub const DEFAULT_GC_COMPACTION_ENABLED: bool = false; - pub const DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB: u64 = 10240000; + pub const DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB: u64 = 5 * 1024 * 1024; // 5GB pub const DEFAULT_GC_COMPACTION_RATIO_PERCENT: u64 = 100; } diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 56a84a98a8..9f37fc32a3 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -2396,7 +2396,8 @@ async fn timeline_checkpoint_handler( CompactionError::ShuttingDown => ApiError::ShuttingDown, CompactionError::Offload(e) => ApiError::InternalServerError(anyhow::anyhow!(e)), CompactionError::CollectKeySpaceError(e) => ApiError::InternalServerError(anyhow::anyhow!(e)), - CompactionError::Other(e) => ApiError::InternalServerError(e) + CompactionError::Other(e) => ApiError::InternalServerError(e), + CompactionError::AlreadyRunning(_) => ApiError::InternalServerError(anyhow::anyhow!(e)), } )?; } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 56718f5294..46f9c9a427 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -34,6 +34,7 @@ use pageserver_api::shard::TenantShardId; use remote_storage::DownloadError; use remote_storage::GenericRemoteStorage; use remote_storage::TimeoutOrCancel; +use remote_timeline_client::index::GcCompactionState; use remote_timeline_client::manifest::{ OffloadedTimelineManifest, TenantManifest, LATEST_TENANT_MANIFEST_VERSION, }; @@ -1168,6 +1169,7 @@ impl Tenant { resources, CreateTimelineCause::Load, idempotency.clone(), + index_part.gc_compaction.clone(), )?; let disk_consistent_lsn = timeline.get_disk_consistent_lsn(); anyhow::ensure!( @@ -3125,20 +3127,19 @@ impl Tenant { // If we're done compacting, check the scheduled GC compaction queue for more work. if outcome == CompactionOutcome::Done { - let queue = self - .scheduled_compaction_tasks - .lock() - .unwrap() - .get(&timeline.timeline_id) - .cloned(); - if let Some(queue) = queue { - outcome = queue - .iteration(cancel, ctx, &self.gc_block, &timeline) - .instrument( - info_span!("gc_compact_timeline", timeline_id = %timeline.timeline_id), - ) - .await?; - } + let queue = { + let mut guard = self.scheduled_compaction_tasks.lock().unwrap(); + guard + .entry(timeline.timeline_id) + .or_insert_with(|| Arc::new(GcCompactionQueue::new())) + .clone() + }; + outcome = queue + .iteration(cancel, ctx, &self.gc_block, &timeline) + .instrument( + info_span!("gc_compact_timeline", timeline_id = %timeline.timeline_id), + ) + .await?; } // If we're done compacting, offload the timeline if requested. 
@@ -3195,6 +3196,7 @@ impl Tenant { .unwrap() .fail(&CIRCUIT_BREAKERS_BROKEN, err); } + CompactionError::AlreadyRunning(_) => {} } } @@ -4150,6 +4152,7 @@ impl Tenant { resources: TimelineResources, cause: CreateTimelineCause, create_idempotency: CreateTimelineIdempotency, + gc_compaction_state: Option, ) -> anyhow::Result> { let state = match cause { CreateTimelineCause::Load => { @@ -4181,6 +4184,7 @@ impl Tenant { state, self.attach_wal_lag_cooldown.clone(), create_idempotency, + gc_compaction_state, self.cancel.child_token(), ); @@ -5246,6 +5250,7 @@ impl Tenant { resources, CreateTimelineCause::Load, create_guard.idempotency.clone(), + None, ) .context("Failed to create timeline data structure")?; diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 713efbb9a4..e01da48052 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -184,6 +184,7 @@ use camino::Utf8Path; use chrono::{NaiveDateTime, Utc}; pub(crate) use download::download_initdb_tar_zst; +use index::GcCompactionState; use pageserver_api::models::TimelineArchivalState; use pageserver_api::shard::{ShardIndex, TenantShardId}; use regex::Regex; @@ -913,6 +914,18 @@ impl RemoteTimelineClient { Ok(()) } + /// Launch an index-file upload operation in the background, setting `import_pgdata` field. + pub(crate) fn schedule_index_upload_for_gc_compaction_state_update( + self: &Arc, + gc_compaction_state: GcCompactionState, + ) -> anyhow::Result<()> { + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = guard.initialized_mut()?; + upload_queue.dirty.gc_compaction = Some(gc_compaction_state); + self.schedule_index_upload(upload_queue); + Ok(()) + } + /// /// Launch an index-file upload operation in the background, if necessary. /// diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index b8b18005fd..727b25fbf4 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -85,9 +85,36 @@ pub struct IndexPart { #[serde(skip_serializing_if = "Option::is_none", default)] pub(crate) rel_size_migration: Option, - /// The LSN of gc-compaction horizon. Once gc-compaction is finished for all layer files below an LSN, this LSN will be updated. + /// Not used anymore -- kept here for backwards compatibility. Merged into the `gc_compaction` field. #[serde(skip_serializing_if = "Option::is_none", default)] - pub(crate) l2_lsn: Option, + l2_lsn: Option, + + /// State for the garbage-collecting compaction pass. + /// + /// Garbage-collecting compaction (gc-compaction) prunes `Value`s that are outside + /// the PITR window and not needed by child timelines. + /// + /// A commonly used synonym for this compaction pass is + /// "bottommost-compaction" because the affected LSN range + /// is the "bottom" of the (key,lsn) map. + /// + /// Gc-compaction is a quite expensive operation; that's why we use + /// trigger condition. + /// This field here holds the state pertaining to that trigger condition + /// and (in future) to the progress of the gc-compaction, so that it's + /// resumable across restarts & migrations. + /// + /// Note that the underlying algorithm is _also_ called `gc-compaction` + /// in most places & design docs; but in fact it is more flexible than + /// just the specific use case here; it needs a new name. 
+ #[serde(skip_serializing_if = "Option::is_none", default)] + pub(crate) gc_compaction: Option, +} + +#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] +pub struct GcCompactionState { + /// The upper bound of the last completed garbage-collecting compaction, aka. L2 LSN. + pub(crate) last_completed_lsn: Lsn, } #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] @@ -123,10 +150,11 @@ impl IndexPart { /// - 10: +import_pgdata /// - 11: +rel_size_migration /// - 12: +l2_lsn - const LATEST_VERSION: usize = 12; + /// - 13: +gc_compaction + const LATEST_VERSION: usize = 13; // Versions we may see when reading from a bucket. - pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]; + pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]; pub const FILE_NAME: &'static str = "index_part.json"; @@ -144,6 +172,7 @@ impl IndexPart { import_pgdata: None, rel_size_migration: None, l2_lsn: None, + gc_compaction: None, } } @@ -450,6 +479,7 @@ mod tests { import_pgdata: None, rel_size_migration: None, l2_lsn: None, + gc_compaction: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -497,6 +527,7 @@ mod tests { import_pgdata: None, rel_size_migration: None, l2_lsn: None, + gc_compaction: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -545,6 +576,7 @@ mod tests { import_pgdata: None, rel_size_migration: None, l2_lsn: None, + gc_compaction: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -596,6 +628,7 @@ mod tests { import_pgdata: None, rel_size_migration: None, l2_lsn: None, + gc_compaction: None, }; let empty_layers_parsed = IndexPart::from_json_bytes(empty_layers_json.as_bytes()).unwrap(); @@ -642,6 +675,7 @@ mod tests { import_pgdata: None, rel_size_migration: None, l2_lsn: None, + gc_compaction: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -691,6 +725,7 @@ mod tests { import_pgdata: None, rel_size_migration: None, l2_lsn: None, + gc_compaction: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -745,6 +780,7 @@ mod tests { import_pgdata: None, rel_size_migration: None, l2_lsn: None, + gc_compaction: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -804,6 +840,7 @@ mod tests { import_pgdata: None, rel_size_migration: None, l2_lsn: None, + gc_compaction: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -864,6 +901,7 @@ mod tests { import_pgdata: None, rel_size_migration: None, l2_lsn: None, + gc_compaction: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -929,6 +967,7 @@ mod tests { import_pgdata: None, rel_size_migration: None, l2_lsn: None, + gc_compaction: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -1007,6 +1046,7 @@ mod tests { }))), rel_size_migration: None, l2_lsn: None, + gc_compaction: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -1086,6 +1126,7 @@ mod tests { }))), rel_size_migration: Some(RelSizeMigration::Legacy), l2_lsn: None, + gc_compaction: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -1093,7 +1134,7 @@ mod tests { } #[test] - fn v12_l2_lsn_is_parsed() { + fn v12_v13_l2_gc_ompaction_is_parsed() { let example = r#"{ "version": 12, "layer_metadata":{ @@ -1124,7 +1165,10 @@ mod tests { } }, "rel_size_migration": "legacy", - "l2_lsn": "0/16960E8" + 
"l2_lsn": "0/16960E8", + "gc_compaction": { + "last_completed_lsn": "0/16960E8" + } }"#; let expected = IndexPart { @@ -1166,6 +1210,9 @@ mod tests { }))), rel_size_migration: Some(RelSizeMigration::Legacy), l2_lsn: Some("0/16960E8".parse::().unwrap()), + gc_compaction: Some(GcCompactionState { + last_completed_lsn: "0/16960E8".parse::().unwrap(), + }), }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 5e63f59fd8..b12655b0f3 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -295,6 +295,7 @@ fn log_compaction_error( let level = match err { ShuttingDown => return, Offload(_) => Level::ERROR, + AlreadyRunning(_) => Level::ERROR, CollectKeySpaceError(CollectKeySpaceError::Cancelled) => Level::INFO, CollectKeySpaceError(_) => Level::ERROR, _ if task_cancelled => Level::INFO, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 319c5e3d87..a80d407d54 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -19,7 +19,7 @@ use arc_swap::{ArcSwap, ArcSwapOption}; use bytes::Bytes; use camino::Utf8Path; use chrono::{DateTime, Utc}; -use compaction::CompactionOutcome; +use compaction::{CompactionOutcome, GcCompactionCombinedSettings}; use enumset::EnumSet; use fail::fail_point; use futures::FutureExt; @@ -148,6 +148,7 @@ use self::layer_manager::LayerManager; use self::logical_size::LogicalSize; use self::walreceiver::{WalReceiver, WalReceiverConf}; +use super::remote_timeline_client::index::GcCompactionState; use super::{ config::TenantConf, storage_layer::LayerVisibilityHint, upload_queue::NotInitialized, MaybeOffloaded, @@ -323,6 +324,9 @@ pub struct Timeline { ancestor_timeline: Option>, ancestor_lsn: Lsn, + // The LSN of gc-compaction that was last applied to this timeline. + gc_compaction_state: ArcSwap>, + pub(super) metrics: TimelineMetrics, // `Timeline` doesn't write these metrics itself, but it manages the lifetime. Code @@ -1889,6 +1893,7 @@ impl Timeline { // abruptly stall nor resume L0 flushes in these cases. 
Err(CompactionError::Offload(_)) => {} Err(CompactionError::ShuttingDown) => {} + Err(CompactionError::AlreadyRunning(_)) => {} }; result @@ -2531,6 +2536,31 @@ impl Timeline { ) } + fn get_gc_compaction_settings(&self) -> GcCompactionCombinedSettings { + let tenant_conf = &self.tenant_conf.load(); + let gc_compaction_enabled = tenant_conf + .tenant_conf + .gc_compaction_enabled + .unwrap_or(self.conf.default_tenant_conf.gc_compaction_enabled); + let gc_compaction_initial_threshold_kb = tenant_conf + .tenant_conf + .gc_compaction_initial_threshold_kb + .unwrap_or( + self.conf + .default_tenant_conf + .gc_compaction_initial_threshold_kb, + ); + let gc_compaction_ratio_percent = tenant_conf + .tenant_conf + .gc_compaction_ratio_percent + .unwrap_or(self.conf.default_tenant_conf.gc_compaction_ratio_percent); + GcCompactionCombinedSettings { + gc_compaction_enabled, + gc_compaction_initial_threshold_kb, + gc_compaction_ratio_percent, + } + } + fn get_image_creation_preempt_threshold(&self) -> usize { let tenant_conf = self.tenant_conf.load(); tenant_conf @@ -2609,6 +2639,7 @@ impl Timeline { state: TimelineState, attach_wal_lag_cooldown: Arc>, create_idempotency: crate::tenant::CreateTimelineIdempotency, + gc_compaction_state: Option, cancel: CancellationToken, ) -> Arc { let disk_consistent_lsn = metadata.disk_consistent_lsn(); @@ -2667,6 +2698,8 @@ impl Timeline { }), disk_consistent_lsn: AtomicLsn::new(disk_consistent_lsn.0), + gc_compaction_state: ArcSwap::new(Arc::new(gc_compaction_state)), + last_freeze_at: AtomicLsn::new(disk_consistent_lsn.0), last_freeze_ts: RwLock::new(Instant::now()), @@ -2831,6 +2864,20 @@ impl Timeline { ); } + pub(crate) fn update_gc_compaction_state( + &self, + gc_compaction_state: GcCompactionState, + ) -> anyhow::Result<()> { + self.gc_compaction_state + .store(Arc::new(Some(gc_compaction_state.clone()))); + self.remote_client + .schedule_index_upload_for_gc_compaction_state_update(gc_compaction_state) + } + + pub(crate) fn get_gc_compaction_state(&self) -> Option { + self.gc_compaction_state.load_full().as_ref().clone() + } + /// Creates and starts the wal receiver. 
/// /// This function is expected to be called at most once per Timeline's lifecycle @@ -5373,6 +5420,8 @@ pub(crate) enum CompactionError { CollectKeySpaceError(CollectKeySpaceError), #[error(transparent)] Other(anyhow::Error), + #[error("Compaction already running: {0}")] + AlreadyRunning(&'static str), } impl From for CompactionError { diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index bfb610e0d9..c6ef5165ef 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -20,11 +20,13 @@ use bytes::Bytes; use enumset::EnumSet; use fail::fail_point; use itertools::Itertools; +use once_cell::sync::Lazy; use pageserver_api::key::KEY_SIZE; use pageserver_api::keyspace::ShardedRange; use pageserver_api::models::CompactInfoResponse; use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId}; use serde::Serialize; +use tokio::sync::{OwnedSemaphorePermit, Semaphore}; use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, info_span, trace, warn, Instrument}; use utils::critical; @@ -37,6 +39,7 @@ use crate::statvfs::Statvfs; use crate::tenant::checks::check_valid_layermap; use crate::tenant::gc_block::GcBlock; use crate::tenant::layer_map::LayerMap; +use crate::tenant::remote_timeline_client::index::GcCompactionState; use crate::tenant::remote_timeline_client::WaitCompletionError; use crate::tenant::storage_layer::batch_split_writer::{ BatchWriterResult, SplitDeltaLayerWriter, SplitImageLayerWriter, @@ -77,13 +80,22 @@ impl std::fmt::Display for GcCompactionJobId { } } +pub struct GcCompactionCombinedSettings { + pub gc_compaction_enabled: bool, + pub gc_compaction_initial_threshold_kb: u64, + pub gc_compaction_ratio_percent: u64, +} + #[derive(Debug, Clone)] pub enum GcCompactionQueueItem { - Manual(CompactOptions), + MetaJob { + /// Compaction options + options: CompactOptions, + /// Whether the compaction is triggered automatically (determines whether we need to update L2 LSN) + auto: bool, + }, SubCompactionJob(CompactOptions), - #[allow(dead_code)] - UpdateL2Lsn(Lsn), - Notify(GcCompactionJobId), + Notify(GcCompactionJobId, Option), } impl GcCompactionQueueItem { @@ -93,7 +105,7 @@ impl GcCompactionQueueItem { running: bool, ) -> Option { match self { - GcCompactionQueueItem::Manual(options) => Some(CompactInfoResponse { + GcCompactionQueueItem::MetaJob { options, .. } => Some(CompactInfoResponse { compact_key_range: options.compact_key_range, compact_lsn_range: options.compact_lsn_range, sub_compaction: options.sub_compaction, @@ -107,17 +119,22 @@ impl GcCompactionQueueItem { running, job_id: id.0, }), - GcCompactionQueueItem::UpdateL2Lsn(_) => None, - GcCompactionQueueItem::Notify(_) => None, + GcCompactionQueueItem::Notify(_, _) => None, } } } +#[derive(Default)] +struct GcCompactionGuardItems { + notify: Option>, + gc_guard: Option, + permit: Option, +} + struct GcCompactionQueueInner { running: Option<(GcCompactionJobId, GcCompactionQueueItem)>, queued: VecDeque<(GcCompactionJobId, GcCompactionQueueItem)>, - notify: HashMap>, - gc_guards: HashMap, + guards: HashMap, last_id: GcCompactionJobId, } @@ -137,14 +154,18 @@ pub struct GcCompactionQueue { consumer_lock: tokio::sync::Mutex<()>, } +static CONCURRENT_GC_COMPACTION_TASKS: Lazy> = Lazy::new(|| { + // Only allow two timelines on one pageserver to run gc compaction at a time. 
+ Arc::new(Semaphore::new(2)) +}); + impl GcCompactionQueue { pub fn new() -> Self { GcCompactionQueue { inner: std::sync::Mutex::new(GcCompactionQueueInner { running: None, queued: VecDeque::new(), - notify: HashMap::new(), - gc_guards: HashMap::new(), + guards: HashMap::new(), last_id: GcCompactionJobId(0), }), consumer_lock: tokio::sync::Mutex::new(()), @@ -154,8 +175,9 @@ impl GcCompactionQueue { pub fn cancel_scheduled(&self) { let mut guard = self.inner.lock().unwrap(); guard.queued.clear(); - guard.notify.clear(); - guard.gc_guards.clear(); + // TODO: if there is a running job, we should keep the gc guard. However, currently, the cancel + // API is only used for testing purposes, so we can drop everything here. + guard.guards.clear(); } /// Schedule a manual compaction job. @@ -166,29 +188,162 @@ impl GcCompactionQueue { ) -> GcCompactionJobId { let mut guard = self.inner.lock().unwrap(); let id = guard.next_id(); - guard - .queued - .push_back((id, GcCompactionQueueItem::Manual(options))); - if let Some(notify) = notify { - guard.notify.insert(id, notify); - } + guard.queued.push_back(( + id, + GcCompactionQueueItem::MetaJob { + options, + auto: false, + }, + )); + guard.guards.entry(id).or_default().notify = notify; info!("scheduled compaction job id={}", id); id } + /// Schedule an auto compaction job. + fn schedule_auto_compaction( + &self, + options: CompactOptions, + permit: OwnedSemaphorePermit, + ) -> GcCompactionJobId { + let mut guard = self.inner.lock().unwrap(); + let id = guard.next_id(); + guard.queued.push_back(( + id, + GcCompactionQueueItem::MetaJob { + options, + auto: true, + }, + )); + guard.guards.entry(id).or_default().permit = Some(permit); + id + } + /// Trigger an auto compaction. - #[allow(dead_code)] - pub fn trigger_auto_compaction(&self, _: &Arc) {} + pub async fn trigger_auto_compaction(&self, timeline: &Arc) { + let GcCompactionCombinedSettings { + gc_compaction_enabled, + gc_compaction_initial_threshold_kb, + gc_compaction_ratio_percent, + } = timeline.get_gc_compaction_settings(); + if !gc_compaction_enabled { + return; + } + if self.remaining_jobs_num() > 0 { + // Only schedule auto compaction when the queue is empty + return; + } + if timeline.ancestor_timeline().is_some() { + // Do not trigger auto compaction for child timelines. We haven't tested + // it enough in staging yet. + return; + } + + let Ok(permit) = CONCURRENT_GC_COMPACTION_TASKS.clone().try_acquire_owned() else { + // Only allow one compaction run at a time. TODO: As we do `try_acquire_owned`, we cannot ensure + // the fairness of the lock across timelines. We should listen for both `acquire` and `l0_compaction_trigger` + // to ensure the fairness while avoid starving other tasks. 
+ return; + }; + + let gc_compaction_state = timeline.get_gc_compaction_state(); + let l2_lsn = gc_compaction_state + .map(|x| x.last_completed_lsn) + .unwrap_or(Lsn::INVALID); + + let layers = { + let guard = timeline.layers.read().await; + let layer_map = guard.layer_map().unwrap(); + layer_map.iter_historic_layers().collect_vec() + }; + let mut l2_size: u64 = 0; + let mut l1_size = 0; + let gc_cutoff = *timeline.get_applied_gc_cutoff_lsn(); + for layer in layers { + if layer.lsn_range.start <= l2_lsn { + l2_size += layer.file_size(); + } else if layer.lsn_range.start <= gc_cutoff { + l1_size += layer.file_size(); + } + } + + fn trigger_compaction( + l1_size: u64, + l2_size: u64, + gc_compaction_initial_threshold_kb: u64, + gc_compaction_ratio_percent: u64, + ) -> bool { + const AUTO_TRIGGER_LIMIT: u64 = 150 * 1024 * 1024 * 1024; // 150GB + if l1_size >= AUTO_TRIGGER_LIMIT || l2_size >= AUTO_TRIGGER_LIMIT { + // Do not auto-trigger when physical size >= 150GB + return false; + } + // initial trigger + if l2_size == 0 && l1_size >= gc_compaction_initial_threshold_kb * 1024 { + info!( + "trigger auto-compaction because l1_size={} >= gc_compaction_initial_threshold_kb={}", + l1_size, + gc_compaction_initial_threshold_kb + ); + return true; + } + // size ratio trigger + if l2_size == 0 { + return false; + } + if l1_size as f64 / l2_size as f64 >= (gc_compaction_ratio_percent as f64 / 100.0) { + info!( + "trigger auto-compaction because l1_size={} / l2_size={} > gc_compaction_ratio_percent={}", + l1_size, + l2_size, + gc_compaction_ratio_percent + ); + return true; + } + false + } + + if trigger_compaction( + l1_size, + l2_size, + gc_compaction_initial_threshold_kb, + gc_compaction_ratio_percent, + ) { + self.schedule_auto_compaction( + CompactOptions { + flags: { + let mut flags = EnumSet::new(); + flags |= CompactFlags::EnhancedGcBottomMostCompaction; + flags + }, + sub_compaction: true, + compact_key_range: None, + compact_lsn_range: None, + sub_compaction_max_job_size_mb: None, + }, + permit, + ); + info!( + "scheduled auto gc-compaction: l1_size={}, l2_size={}, l2_lsn={}, gc_cutoff={}", + l1_size, l2_size, l2_lsn, gc_cutoff + ); + } else { + info!( + "did not trigger auto gc-compaction: l1_size={}, l2_size={}, l2_lsn={}, gc_cutoff={}", + l1_size, l2_size, l2_lsn, gc_cutoff + ); + } + } /// Notify the caller the job has finished and unblock GC. fn notify_and_unblock(&self, id: GcCompactionJobId) { info!("compaction job id={} finished", id); let mut guard = self.inner.lock().unwrap(); - if let Some(blocking) = guard.gc_guards.remove(&id) { - drop(blocking) - } - if let Some(tx) = guard.notify.remove(&id) { - let _ = tx.send(()); + if let Some(items) = guard.guards.remove(&id) { + drop(items.gc_guard); + if let Some(tx) = items.notify { + let _ = tx.send(()); + } } } @@ -198,9 +353,10 @@ impl GcCompactionQueue { options: CompactOptions, timeline: &Arc, gc_block: &GcBlock, + auto: bool, ) -> Result<(), CompactionError> { info!("running scheduled enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs"); - let jobs: Vec = timeline + let jobs = timeline .gc_compaction_split_jobs( GcCompactJob::from_compact_options(options.clone()), options.sub_compaction_max_job_size_mb, @@ -223,6 +379,9 @@ impl GcCompactionQueue { let jobs_len = jobs.len(); let mut pending_tasks = Vec::new(); + // gc-compaction might pick more layers or fewer layers to compact. The L2 LSN does not need to be accurate. 
+ // And therefore, we simply assume the maximum LSN of all jobs is the expected L2 LSN. + let expected_l2_lsn = jobs.iter().map(|job| job.compact_lsn_range.end).max(); for job in jobs { // Unfortunately we need to convert the `GcCompactJob` back to `CompactionOptions` // until we do further refactors to allow directly call `compact_with_gc`. @@ -240,10 +399,16 @@ impl GcCompactionQueue { }; pending_tasks.push(GcCompactionQueueItem::SubCompactionJob(options)); } - pending_tasks.push(GcCompactionQueueItem::Notify(id)); + + if !auto { + pending_tasks.push(GcCompactionQueueItem::Notify(id, None)); + } else { + pending_tasks.push(GcCompactionQueueItem::Notify(id, expected_l2_lsn)); + } + { let mut guard = self.inner.lock().unwrap(); - guard.gc_guards.insert(id, gc_guard); + guard.guards.entry(id).or_default().gc_guard = Some(gc_guard); let mut tasks = Vec::new(); for task in pending_tasks { let id = guard.next_id(); @@ -267,29 +432,41 @@ impl GcCompactionQueue { gc_block: &GcBlock, timeline: &Arc, ) -> Result { - let _one_op_at_a_time_guard = self.consumer_lock.lock().await; - let has_pending_tasks; - let (id, item) = { - let mut guard = self.inner.lock().unwrap(); - let Some((id, item)) = guard.queued.pop_front() else { - return Ok(CompactionOutcome::Done); - }; - guard.running = Some((id, item.clone())); - has_pending_tasks = !guard.queued.is_empty(); - (id, item) + let Ok(_one_op_at_a_time_guard) = self.consumer_lock.try_lock() else { + return Err(CompactionError::AlreadyRunning("cannot run gc-compaction because another gc-compaction is running. This should not happen because we only call this function from the gc-compaction queue.")); + }; + let has_pending_tasks; + let Some((id, item)) = ({ + let mut guard = self.inner.lock().unwrap(); + if let Some((id, item)) = guard.queued.pop_front() { + guard.running = Some((id, item.clone())); + has_pending_tasks = !guard.queued.is_empty(); + Some((id, item)) + } else { + has_pending_tasks = false; + None + } + }) else { + self.trigger_auto_compaction(timeline).await; + // Always yield after triggering auto-compaction. Gc-compaction is a low-priority task and we + // have not implemented preemption mechanism yet. We always want to yield it to more important + // tasks if there is one. + return Ok(CompactionOutcome::Done); }; - match item { - GcCompactionQueueItem::Manual(options) => { + GcCompactionQueueItem::MetaJob { options, auto } => { if !options .flags .contains(CompactFlags::EnhancedGcBottomMostCompaction) { warn!("ignoring scheduled compaction task: scheduled task must be gc compaction: {:?}", options); } else if options.sub_compaction { - self.handle_sub_compaction(id, options, timeline, gc_block) + info!("running scheduled enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs"); + self.handle_sub_compaction(id, options, timeline, gc_block, auto) .await?; } else { + // Auto compaction always enables sub-compaction so we don't need to handle update_l2_lsn + // in this branch. let gc_guard = match gc_block.start().await { Ok(guard) => guard, Err(e) => { @@ -301,20 +478,37 @@ impl GcCompactionQueue { }; { let mut guard = self.inner.lock().unwrap(); - guard.gc_guards.insert(id, gc_guard); + guard.guards.entry(id).or_default().gc_guard = Some(gc_guard); } let _ = timeline.compact_with_options(cancel, options, ctx).await?; self.notify_and_unblock(id); } } GcCompactionQueueItem::SubCompactionJob(options) => { + // TODO: error handling, clear the queue if any task fails? 
let _ = timeline.compact_with_options(cancel, options, ctx).await?; } - GcCompactionQueueItem::Notify(id) => { + GcCompactionQueueItem::Notify(id, l2_lsn) => { self.notify_and_unblock(id); - } - GcCompactionQueueItem::UpdateL2Lsn(_) => { - unreachable!() + if let Some(l2_lsn) = l2_lsn { + let current_l2_lsn = timeline + .get_gc_compaction_state() + .map(|x| x.last_completed_lsn) + .unwrap_or(Lsn::INVALID); + if l2_lsn >= current_l2_lsn { + info!("l2_lsn updated to {}", l2_lsn); + timeline + .update_gc_compaction_state(GcCompactionState { + last_completed_lsn: l2_lsn, + }) + .map_err(CompactionError::Other)?; + } else { + warn!( + "l2_lsn updated to {} but it is less than the current l2_lsn {}", + l2_lsn, current_l2_lsn + ); + } + } } } { @@ -339,7 +533,6 @@ impl GcCompactionQueue { (guard.running.clone(), guard.queued.clone()) } - #[allow(dead_code)] pub fn remaining_jobs_num(&self) -> usize { let guard = self.inner.lock().unwrap(); guard.queued.len() + if guard.running.is_some() { 1 } else { 0 } diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 841b2fa1c7..f4ae1ea166 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -137,6 +137,11 @@ async fn remove_maybe_offloaded_timeline_from_tenant( timelines.remove(&timeline.timeline_id).expect( "timeline that we were deleting was concurrently removed from 'timelines' map", ); + tenant + .scheduled_compaction_tasks + .lock() + .unwrap() + .remove(&timeline.timeline_id); } TimelineOrOffloaded::Offloaded(timeline) => { let offloaded_timeline = timelines_offloaded @@ -300,6 +305,7 @@ impl DeleteTimelineFlow { // Thus we need to skip the validation here. CreateTimelineCause::Delete, crate::tenant::CreateTimelineIdempotency::FailWithConflict, // doesn't matter what we put here + None, // doesn't matter what we put here ) .context("create_timeline_struct")?; diff --git a/pageserver/src/tenant/timeline/offload.rs b/pageserver/src/tenant/timeline/offload.rs index 93e5a1100d..424a75005d 100644 --- a/pageserver/src/tenant/timeline/offload.rs +++ b/pageserver/src/tenant/timeline/offload.rs @@ -143,5 +143,12 @@ fn remove_timeline_from_tenant( .remove(&timeline.timeline_id) .expect("timeline that we were deleting was concurrently removed from 'timelines' map"); + // Clear the compaction queue for this timeline + tenant + .scheduled_compaction_tasks + .lock() + .unwrap() + .remove(&timeline.timeline_id); + Arc::strong_count(&timeline) } diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index c091cd0869..ce8ed3c7c5 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -466,6 +466,59 @@ def test_pageserver_gc_compaction_interrupt(neon_env_builder: NeonEnvBuilder): ps_http.timeline_gc(tenant_id, timeline_id, None) +@skip_in_debug_build("only run with release build") +def test_pageserver_gc_compaction_trigger(neon_env_builder: NeonEnvBuilder): + SMOKE_CONF = { + # Run both gc and gc-compaction. 
+ "gc_period": "5s", + "compaction_period": "5s", + # No PiTR interval and small GC horizon + "pitr_interval": "0s", + "gc_horizon": f"{1024 * 16}", + "lsn_lease_length": "0s", + "gc_compaction_enabled": "true", + "gc_compaction_initial_threshold_kb": "16", + "gc_compaction_ratio_percent": "50", + # Do not generate image layers with create_image_layers + "image_layer_creation_check_threshold": "100", + } + + env = neon_env_builder.init_start(initial_tenant_conf=SMOKE_CONF) + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + row_count = 10000 + churn_rounds = 20 + + ps_http = env.pageserver.http_client() + + workload = Workload(env, tenant_id, timeline_id) + workload.init(env.pageserver.id) + + log.info("Writing initial data ...") + workload.write_rows(row_count, env.pageserver.id) + + ps_http.timeline_gc( + tenant_id, timeline_id, None + ) # Force refresh gc info to have gc_cutoff generated + + def compaction_finished(): + queue_depth = len(ps_http.timeline_compact_info(tenant_id, timeline_id)) + assert queue_depth == 0 + + for i in range(1, churn_rounds + 1): + log.info(f"Running churn round {i}/{churn_rounds} ...") + workload.churn_rows(row_count, env.pageserver.id, upload=True) + wait_until(compaction_finished, timeout=60) + workload.validate(env.pageserver.id) + + # ensure gc_compaction is scheduled and it's actually running (instead of skipping due to no layers picked) + env.pageserver.assert_log_contains("gc_compact_timeline.*picked .* layers for compaction") + + log.info("Validating at workload end ...") + workload.validate(env.pageserver.id) + + # Stripe sizes in number of pages. TINY_STRIPES = 16 LARGE_STRIPES = 32768 From 1fb2faab5bc87c41bef086975dbc145ec77dbc1f Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Tue, 25 Feb 2025 17:00:43 +0100 Subject: [PATCH 579/583] Rename the patch files for the semver test (#10966) ## Problem The patch for `semver` extensions relies on `PG_VERSION` environment variable. The files were named without the letter `v` so script cannot find them. ## Summary of changes The patch files were renamed. --- .../{test-upgrade-16.patch => test-upgrade-v16.patch} | 0 .../{test-upgrade-17.patch => test-upgrade-v17.patch} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename docker-compose/ext-src/pg_semver-src/{test-upgrade-16.patch => test-upgrade-v16.patch} (100%) rename docker-compose/ext-src/pg_semver-src/{test-upgrade-17.patch => test-upgrade-v17.patch} (100%) diff --git a/docker-compose/ext-src/pg_semver-src/test-upgrade-16.patch b/docker-compose/ext-src/pg_semver-src/test-upgrade-v16.patch similarity index 100% rename from docker-compose/ext-src/pg_semver-src/test-upgrade-16.patch rename to docker-compose/ext-src/pg_semver-src/test-upgrade-v16.patch diff --git a/docker-compose/ext-src/pg_semver-src/test-upgrade-17.patch b/docker-compose/ext-src/pg_semver-src/test-upgrade-v17.patch similarity index 100% rename from docker-compose/ext-src/pg_semver-src/test-upgrade-17.patch rename to docker-compose/ext-src/pg_semver-src/test-upgrade-v17.patch From c69ebb4486d6678500ebc13282f48b0525547beb Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." 
<4198311+skyzh@users.noreply.github.com> Date: Tue, 25 Feb 2025 12:37:23 -0500 Subject: [PATCH 580/583] fix(ci): extend timeout to 75min (#10963) 60min is not enough for debug builds Signed-off-by: Alex Chi Z --- .github/workflows/_build-and-test-locally.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 3740e6dc9c..30fde127b0 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -337,7 +337,7 @@ jobs: - name: Pytest regression tests continue-on-error: ${{ matrix.lfc_state == 'with-lfc' && inputs.build-type == 'debug' }} uses: ./.github/actions/run-python-test-set - timeout-minutes: ${{ inputs.sanitizers != 'enabled' && 60 || 180 }} + timeout-minutes: ${{ inputs.sanitizers != 'enabled' && 75 || 180 }} with: build_type: ${{ inputs.build-type }} test_selection: regress From d05606252d4d899ad9d779527d41e4f855c2e3b0 Mon Sep 17 00:00:00 2001 From: Suhas Thalanki <54014218+thesuhas@users.noreply.github.com> Date: Tue, 25 Feb 2025 14:26:14 -0500 Subject: [PATCH 581/583] fix: only showing LSN for static computes in `neon endpoint list` (#10931) ## Problem `neon endpoint list` shows a different LSN than what the state of the replica is. This is mainly down to what we define as LSN in this output. If we define it as the LSN that a compute was started with, it only makes sense to show it for static computes. ## Summary of changes Removed the output of `last_record_lsn` for primary/hot standby computes. Closes: https://github.com/neondatabase/neon/issues/5825 --------- Co-authored-by: Tristan Partin --- control_plane/src/bin/neon_local.rs | 29 +++-------------------------- 1 file changed, 3 insertions(+), 26 deletions(-) diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 02d793400a..7d908ccae9 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -887,20 +887,6 @@ fn print_timeline( Ok(()) } -/// Returns a map of timeline IDs to timeline_id@lsn strings. -/// Connects to the pageserver to query this information. -async fn get_timeline_infos( - env: &local_env::LocalEnv, - tenant_shard_id: &TenantShardId, -) -> Result> { - Ok(get_default_pageserver(env) - .timeline_list(tenant_shard_id) - .await? - .into_iter() - .map(|timeline_info| (timeline_info.timeline_id, timeline_info)) - .collect()) -} - /// Helper function to get tenant id from an optional --tenant_id option or from the config file fn get_tenant_id( tenant_id_arg: Option, @@ -1251,12 +1237,6 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res // TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the storage controller // where shard 0 is attached, and query there. let tenant_shard_id = get_tenant_shard_id(args.tenant_shard_id, env)?; - let timeline_infos = get_timeline_infos(env, &tenant_shard_id) - .await - .unwrap_or_else(|e| { - eprintln!("Failed to load timeline info: {}", e); - HashMap::new() - }); let timeline_name_mappings = env.timeline_name_mappings(); @@ -1285,12 +1265,9 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res lsn.to_string() } _ => { - // -> primary endpoint or hot replica - // Use the LSN at the end of the timeline. 
-                timeline_infos
-                    .get(&endpoint.timeline_id)
-                    .map(|bi| bi.last_record_lsn.to_string())
-                    .unwrap_or_else(|| "?".to_string())
+                // As the LSN here refers to the one that the compute is started with,
+                // we display nothing as it is a primary/hot standby compute.
+                "---".to_string()
             }
         };

From dc975d554ab9d05ab1c6c1fdc8baeee9079efa22 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik
Date: Tue, 25 Feb 2025 21:51:38 +0200
Subject: [PATCH 582/583] Increment getpage histogram in prefetch_lookup (#10965)

## Problem

PR https://github.com/neondatabase/neon/pull/10442 added the prefetch_lookup
function. It changed the handling of getpage requests in compute.

Before:
1. Lookup in LFC (return if found)
2. Register prefetch buffer
3. Wait for prefetch result (increment getpage_hist)

Now:
1. Lookup in the prefetch ring (return if the prefetch request is already completed)
2. Lookup in LFC (return if found)
3. Register prefetch buffer
4. Wait for prefetch result (increment getpage_hist)

So if the prefetch result is already available, the getpage histogram is not
incremented. This caused failures in some test_throughput benchmarks:
https://neondb.slack.com/archives/C033RQ5SPDH/p1740425527249499

## Summary of changes

Increment the getpage histogram in `prefetch_lookup`.

Co-authored-by: Konstantin Knizhnik
---
 pgxn/neon/pagestore_smgr.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c
index 4a79acd777..091ad555e0 100644
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -967,6 +967,7 @@ prefetch_lookupv(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blocknum, n
 			BITMAP_SET(mask, i);
 			hits += 1;
+			inc_getpage_wait(0);
 		}
 	}
 	pgBufferUsage.prefetch.hits += hits;

From 920040e40240774219b6607f1f8ef74478dc4b29 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?=
Date: Wed, 26 Feb 2025 00:51:37 +0100
Subject: [PATCH 583/583] Update storage components to edition 2024 (#10919)

Updates storage components to edition 2024. We like to stay on the latest
edition if possible. There are no functional changes; however, some code
changes had to be made to accommodate the edition's breaking changes (one such
change is sketched below).

The PR has two commits:

* the first commit updates storage crates to edition 2024 and appeases
  `cargo clippy` by changing code. I accidentally ran the formatter on some
  files that had other edits.
* the second commit performs a `cargo fmt`.

I would recommend a closer review of the first commit and a less close review
of the second one (as it just runs `cargo fmt`).
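As a reviewer aid, here is a minimal sketch of the kind of breaking change the
edition bump forces. It is not code from this PR, and `random_key_field` is a
made-up name: Rust 2024 reserves `gen` as a keyword, so existing calls to
`rand::Rng::gen` have to go through raw-identifier syntax, which is exactly the
`rng.gen()` to `rng.r#gen()` change visible in the `pageserver_api` key tests
further down in this patch.

```rust
// Sketch only, assuming rand 0.8: `random_key_field` is a hypothetical helper,
// not code from this PR.
use rand::Rng;

fn random_key_field(rng: &mut impl Rng) -> u32 {
    // Edition 2021 allowed plain `rng.gen()`; in edition 2024 `gen` is a
    // reserved keyword, so the method has to be called via a raw identifier.
    rng.r#gen()
}
```

Most of the rest of the diff is import reordering and merging, which, as the
message above notes, comes from running the formatter rather than from the
language edition itself.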
part of https://github.com/neondatabase/neon/issues/10918 --- libs/pageserver_api/Cargo.toml | 2 +- libs/pageserver_api/src/config.rs | 21 +- libs/pageserver_api/src/controller_api.rs | 12 +- libs/pageserver_api/src/key.rs | 30 +- libs/pageserver_api/src/keyspace.rs | 19 +- libs/pageserver_api/src/models.rs | 48 ++- libs/pageserver_api/src/models/utilization.rs | 14 +- libs/pageserver_api/src/record.rs | 2 +- libs/pageserver_api/src/reltag.rs | 6 +- libs/pageserver_api/src/shard.rs | 10 +- libs/pageserver_api/src/upcall_api.rs | 10 +- libs/pageserver_api/src/value.rs | 7 +- libs/remote_storage/Cargo.toml | 2 +- libs/remote_storage/src/azure_blob.rs | 34 +- libs/remote_storage/src/config.rs | 6 +- libs/remote_storage/src/lib.rs | 58 +-- libs/remote_storage/src/local_fs.rs | 54 +-- libs/remote_storage/src/metrics.rs | 4 +- libs/remote_storage/src/s3_bucket.rs | 79 ++-- libs/remote_storage/src/simulate_failures.rs | 11 +- libs/remote_storage/src/support.rs | 13 +- libs/remote_storage/tests/common/tests.rs | 20 +- libs/remote_storage/tests/test_real_azure.rs | 6 +- libs/remote_storage/tests/test_real_s3.rs | 18 +- libs/safekeeper_api/Cargo.toml | 2 +- libs/safekeeper_api/src/membership.rs | 6 +- libs/safekeeper_api/src/models.rs | 15 +- pageserver/Cargo.toml | 2 +- pageserver/benches/bench_ingest.rs | 30 +- pageserver/benches/bench_layer_map.rs | 20 +- pageserver/benches/bench_walredo.rs | 21 +- pageserver/benches/upload_queue.rs | 6 +- pageserver/compaction/src/helpers.rs | 4 +- .../pagebench/src/util/request_stats.rs | 4 +- pageserver/src/assert_u64_eq_usize.rs | 4 +- pageserver/src/aux_file.rs | 2 +- pageserver/src/basebackup.rs | 34 +- pageserver/src/bin/pageserver.rs | 72 ++-- .../src/bin/test_helper_slow_client_reads.rs | 12 +- pageserver/src/config.rs | 35 +- pageserver/src/consumption_metrics.rs | 25 +- .../src/consumption_metrics/disk_cache.rs | 6 +- pageserver/src/consumption_metrics/metrics.rs | 15 +- .../src/consumption_metrics/metrics/tests.rs | 4 +- pageserver/src/consumption_metrics/upload.rs | 16 +- pageserver/src/controller_upcall_client.rs | 34 +- pageserver/src/deletion_queue.rs | 65 ++-- pageserver/src/deletion_queue/deleter.rs | 15 +- pageserver/src/deletion_queue/list_writer.rs | 16 +- pageserver/src/deletion_queue/validator.rs | 25 +- pageserver/src/disk_usage_eviction_task.rs | 52 +-- pageserver/src/http/routes.rs | 169 ++++----- pageserver/src/import_datadir.rs | 20 +- pageserver/src/l0_flush.rs | 3 +- pageserver/src/lib.rs | 12 +- pageserver/src/metrics.rs | 36 +- pageserver/src/page_cache.rs | 27 +- pageserver/src/page_service.rs | 135 ++++--- pageserver/src/pgdatadir_mapping.rs | 132 +++---- pageserver/src/statvfs.rs | 7 +- pageserver/src/task_mgr.rs | 5 +- pageserver/src/tenant.rs | 311 ++++++++-------- pageserver/src/tenant/blob_io.rs | 20 +- pageserver/src/tenant/block_io.rs | 8 +- pageserver/src/tenant/checks.rs | 6 +- pageserver/src/tenant/config.rs | 14 +- pageserver/src/tenant/disk_btree.rs | 32 +- pageserver/src/tenant/ephemeral_file.rs | 28 +- pageserver/src/tenant/gc_block.rs | 3 +- pageserver/src/tenant/gc_result.rs | 5 +- pageserver/src/tenant/layer_map.rs | 47 +-- .../layer_map/historic_layer_coverage.rs | 3 +- pageserver/src/tenant/metadata.rs | 8 +- pageserver/src/tenant/mgr.rs | 100 ++--- .../src/tenant/remote_timeline_client.rs | 170 ++++----- .../tenant/remote_timeline_client/download.rs | 42 +-- .../tenant/remote_timeline_client/index.rs | 12 +- .../tenant/remote_timeline_client/manifest.rs | 3 +- .../tenant/remote_timeline_client/upload.rs | 20 
+- pageserver/src/tenant/secondary.rs | 47 +-- pageserver/src/tenant/secondary/downloader.rs | 92 ++--- pageserver/src/tenant/secondary/heatmap.rs | 12 +- .../src/tenant/secondary/heatmap_uploader.rs | 57 ++- pageserver/src/tenant/secondary/scheduler.rs | 17 +- pageserver/src/tenant/size.rs | 21 +- pageserver/src/tenant/storage_layer.rs | 34 +- .../storage_layer/batch_split_writer.rs | 28 +- .../src/tenant/storage_layer/delta_layer.rs | 92 +++-- .../tenant/storage_layer/filter_iterator.rs | 31 +- .../src/tenant/storage_layer/image_layer.rs | 114 +++--- .../tenant/storage_layer/inmemory_layer.rs | 58 +-- .../inmemory_layer/vectored_dio_read.rs | 35 +- pageserver/src/tenant/storage_layer/layer.rs | 36 +- .../src/tenant/storage_layer/layer/tests.rs | 30 +- .../src/tenant/storage_layer/layer_desc.rs | 11 +- .../src/tenant/storage_layer/layer_name.rs | 4 +- .../tenant/storage_layer/merge_iterator.rs | 41 +-- pageserver/src/tenant/tasks.rs | 25 +- pageserver/src/tenant/throttle.rs | 10 +- pageserver/src/tenant/timeline.rs | 347 +++++++++--------- pageserver/src/tenant/timeline/analysis.rs | 3 +- pageserver/src/tenant/timeline/compaction.rs | 153 +++++--- pageserver/src/tenant/timeline/delete.rs | 34 +- .../src/tenant/timeline/detach_ancestor.rs | 39 +- .../src/tenant/timeline/eviction_task.rs | 37 +- pageserver/src/tenant/timeline/handle.rs | 23 +- .../timeline/heatmap_layers_downloader.rs | 3 +- .../src/tenant/timeline/import_pgdata.rs | 8 +- .../src/tenant/timeline/import_pgdata/flow.rs | 56 ++- .../import_pgdata/importbucket_client.rs | 7 +- .../import_pgdata/index_part_format.rs | 3 +- .../timeline/import_pgdata/upcall_api.rs | 5 +- pageserver/src/tenant/timeline/init.rs | 24 +- .../src/tenant/timeline/layer_manager.rs | 37 +- .../src/tenant/timeline/logical_size.rs | 5 +- pageserver/src/tenant/timeline/offload.rs | 4 +- pageserver/src/tenant/timeline/uninit.rs | 23 +- pageserver/src/tenant/timeline/walreceiver.rs | 15 +- .../walreceiver/connection_manager.rs | 87 ++--- .../walreceiver/walreceiver_connection.rs | 90 +++-- pageserver/src/tenant/upload_queue.rs | 205 ++++++++--- pageserver/src/tenant/vectored_blob_io.rs | 13 +- pageserver/src/utilization.rs | 10 +- pageserver/src/virtual_file.rs | 56 +-- pageserver/src/virtual_file/io_engine.rs | 17 +- .../io_engine/tokio_epoll_uring_ext.rs | 14 +- pageserver/src/virtual_file/open_options.rs | 4 +- .../owned_buffers_io/aligned_buffer/buffer.rs | 10 +- .../aligned_buffer/buffer_mut.rs | 28 +- .../owned_buffers_io/aligned_buffer/raw.rs | 8 +- .../owned_buffers_io/io_buf_ext.rs | 5 +- .../virtual_file/owned_buffers_io/slice.rs | 8 +- .../virtual_file/owned_buffers_io/write.rs | 16 +- .../owned_buffers_io/write/flush.rs | 8 +- pageserver/src/walingest.rs | 89 +++-- pageserver/src/walredo.rs | 30 +- pageserver/src/walredo/apply_neon.rs | 3 +- pageserver/src/walredo/process.rs | 34 +- .../src/walredo/process/no_leak_child.rs | 18 +- safekeeper/Cargo.toml | 2 +- safekeeper/benches/receive_wal.rs | 4 +- safekeeper/src/bin/safekeeper.rs | 57 ++- safekeeper/src/broker.rs | 44 +-- safekeeper/src/control_file.rs | 24 +- safekeeper/src/control_file_upgrade.rs | 31 +- safekeeper/src/copy_timeline.rs | 30 +- safekeeper/src/debug_dump.rs | 32 +- safekeeper/src/handler.rs | 33 +- safekeeper/src/http/mod.rs | 6 +- safekeeper/src/http/routes.rs | 60 ++- safekeeper/src/json_ctrl.rs | 15 +- safekeeper/src/lib.rs | 12 +- safekeeper/src/metrics.rs | 28 +- safekeeper/src/patch_control_file.rs | 3 +- safekeeper/src/pull_timeline.rs | 67 ++-- 
safekeeper/src/receive_wal.rs | 47 ++- safekeeper/src/recovery.rs | 46 ++- safekeeper/src/safekeeper.rs | 85 +++-- safekeeper/src/send_interpreted_wal.rs | 41 +-- safekeeper/src/send_wal.rs | 62 ++-- safekeeper/src/state.rs | 30 +- safekeeper/src/test_utils.rs | 17 +- safekeeper/src/timeline.rs | 46 +-- safekeeper/src/timeline_eviction.rs | 22 +- safekeeper/src/timeline_manager.rs | 49 ++- safekeeper/src/timelines_global_map.rs | 30 +- safekeeper/src/timelines_set.rs | 3 +- safekeeper/src/wal_backup.rs | 29 +- safekeeper/src/wal_backup_partial.rs | 20 +- safekeeper/src/wal_reader_stream.rs | 24 +- safekeeper/src/wal_service.rs | 18 +- safekeeper/src/wal_storage.rs | 37 +- safekeeper/tests/misc_test.rs | 6 +- safekeeper/tests/random_test.rs | 10 +- safekeeper/tests/simple_test.rs | 3 +- safekeeper/tests/walproposer_sim/log.rs | 6 +- .../tests/walproposer_sim/safekeeper.rs | 37 +- .../tests/walproposer_sim/safekeeper_disk.rs | 21 +- .../tests/walproposer_sim/simulation.rs | 31 +- .../tests/walproposer_sim/walproposer_api.rs | 34 +- .../tests/walproposer_sim/walproposer_disk.rs | 3 +- storage_broker/Cargo.toml | 2 +- storage_broker/benches/rps.rs | 8 +- storage_broker/src/bin/storage_broker.rs | 44 +-- storage_broker/src/lib.rs | 18 +- storage_broker/src/metrics.rs | 2 +- storage_controller/Cargo.toml | 2 +- .../src/background_node_operations.rs | 3 +- storage_controller/src/compute_hook.rs | 11 +- storage_controller/src/drain_utils.rs | 23 +- storage_controller/src/heartbeater.rs | 32 +- storage_controller/src/http.rs | 92 ++--- storage_controller/src/id_lock_map.rs | 7 +- storage_controller/src/leadership.rs | 12 +- storage_controller/src/main.rs | 22 +- storage_controller/src/metrics.rs | 13 +- storage_controller/src/node.rs | 22 +- storage_controller/src/pageserver_client.rs | 22 +- storage_controller/src/peer_client.rs | 9 +- storage_controller/src/persistence.rs | 41 +-- .../src/persistence/split_state.rs | 8 +- storage_controller/src/reconciler.rs | 18 +- storage_controller/src/safekeeper.rs | 15 +- storage_controller/src/safekeeper_client.rs | 9 +- storage_controller/src/scheduler.rs | 44 +-- storage_controller/src/service.rs | 319 +++++++++------- .../src/service/chaos_injector.rs | 19 +- .../src/service/context_iterator.rs | 11 +- storage_controller/src/tenant_shard.rs | 124 +++---- storage_scrubber/Cargo.toml | 2 +- storage_scrubber/src/checks.rs | 36 +- storage_scrubber/src/cloud_admin_api.rs | 4 +- storage_scrubber/src/find_large_objects.rs | 7 +- storage_scrubber/src/garbage.rs | 42 ++- storage_scrubber/src/lib.rs | 12 +- storage_scrubber/src/main.rs | 38 +- storage_scrubber/src/metadata_stream.rs | 10 +- .../src/pageserver_physical_gc.rs | 17 +- .../src/scan_pageserver_metadata.rs | 15 +- .../src/scan_safekeeper_metadata.rs | 19 +- storage_scrubber/src/tenant_snapshot.rs | 15 +- 221 files changed, 3543 insertions(+), 3611 deletions(-) diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml index 79da05da6c..87dfdfb5ec 100644 --- a/libs/pageserver_api/Cargo.toml +++ b/libs/pageserver_api/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "pageserver_api" version = "0.1.0" -edition.workspace = true +edition = "2024" license.workspace = true [features] diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 5a695c04ed..039cc1319e 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -9,19 +9,18 @@ pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN pub const 
DEFAULT_HTTP_LISTEN_PORT: u16 = 9898; pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}"); +use std::collections::HashMap; +use std::num::{NonZeroU64, NonZeroUsize}; +use std::str::FromStr; +use std::time::Duration; + use postgres_backend::AuthType; use remote_storage::RemoteStorageConfig; use serde_with::serde_as; -use std::{ - collections::HashMap, - num::{NonZeroU64, NonZeroUsize}, - str::FromStr, - time::Duration, -}; -use utils::{logging::LogFormat, postgres_client::PostgresClientProtocol}; +use utils::logging::LogFormat; +use utils::postgres_client::PostgresClientProtocol; -use crate::models::ImageCompressionAlgorithm; -use crate::models::LsnLease; +use crate::models::{ImageCompressionAlgorithm, LsnLease}; // Certain metadata (e.g. externally-addressable name, AZ) is delivered // as a separate structure. This information is not neeed by the pageserver @@ -367,10 +366,10 @@ pub struct TenantConfigToml { } pub mod defaults { - use crate::models::ImageCompressionAlgorithm; - pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT; + use crate::models::ImageCompressionAlgorithm; + pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "300 s"; pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s"; diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index f94bfab581..2cfe1a85f9 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -9,11 +9,8 @@ use std::time::{Duration, Instant}; use serde::{Deserialize, Serialize}; use utils::id::{NodeId, TenantId}; -use crate::models::PageserverUtilization; -use crate::{ - models::{ShardParameters, TenantConfig}, - shard::{ShardStripeSize, TenantShardId}, -}; +use crate::models::{PageserverUtilization, ShardParameters, TenantConfig}; +use crate::shard::{ShardStripeSize, TenantShardId}; #[derive(Serialize, Deserialize, Debug)] #[serde(deny_unknown_fields)] @@ -354,7 +351,7 @@ impl FromStr for SkSchedulingPolicy { _ => { return Err(anyhow::anyhow!( "Unknown scheduling policy '{s}', try active,pause,decomissioned" - )) + )); } }) } @@ -457,9 +454,10 @@ pub struct SafekeeperSchedulingPolicyRequest { #[cfg(test)] mod test { - use super::*; use serde_json; + use super::*; + /// Check stability of PlacementPolicy's serialization #[test] fn placement_policy_encoding() -> anyhow::Result<()> { diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index b88a2e46a1..8836e7ec87 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -1,11 +1,12 @@ -use anyhow::{bail, Result}; -use byteorder::{ByteOrder, BE}; +use std::fmt; +use std::ops::Range; + +use anyhow::{Result, bail}; +use byteorder::{BE, ByteOrder}; use bytes::Bytes; use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; -use postgres_ffi::Oid; -use postgres_ffi::RepOriginId; +use postgres_ffi::{Oid, RepOriginId}; use serde::{Deserialize, Serialize}; -use std::{fmt, ops::Range}; use utils::const_assert; use crate::reltag::{BlockNumber, RelTag, SlruKind}; @@ -954,25 +955,22 @@ impl std::str::FromStr for Key { mod tests { use std::str::FromStr; - use crate::key::is_metadata_key_slice; - use crate::key::Key; - - use rand::Rng; - use rand::SeedableRng; + use rand::{Rng, SeedableRng}; use super::AUX_KEY_PREFIX; + use crate::key::{Key, is_metadata_key_slice}; #[test] fn display_fromstr_bijection() { let mut rng = rand::rngs::StdRng::seed_from_u64(42); let key = Key { - field1: rng.gen(), - field2: rng.gen(), - 
field3: rng.gen(), - field4: rng.gen(), - field5: rng.gen(), - field6: rng.gen(), + field1: rng.r#gen(), + field2: rng.r#gen(), + field3: rng.r#gen(), + field4: rng.r#gen(), + field5: rng.r#gen(), + field6: rng.r#gen(), }; assert_eq!(key, Key::from_str(&format!("{key}")).unwrap()); diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs index c55b9e9484..e505f23e49 100644 --- a/libs/pageserver_api/src/keyspace.rs +++ b/libs/pageserver_api/src/keyspace.rs @@ -1,11 +1,10 @@ -use postgres_ffi::BLCKSZ; use std::ops::Range; -use crate::{ - key::Key, - shard::{ShardCount, ShardIdentity}, -}; use itertools::Itertools; +use postgres_ffi::BLCKSZ; + +use crate::key::Key; +use crate::shard::{ShardCount, ShardIdentity}; /// /// Represents a set of Keys, in a compact form. @@ -609,15 +608,13 @@ pub fn singleton_range(key: Key) -> Range { #[cfg(test)] mod tests { + use std::fmt::Write; + use rand::{RngCore, SeedableRng}; - use crate::{ - models::ShardParameters, - shard::{ShardCount, ShardNumber}, - }; - use super::*; - use std::fmt::Write; + use crate::models::ShardParameters; + use crate::shard::{ShardCount, ShardNumber}; // Helper function to create a key range. // diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 1164048229..ea565e7769 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -2,38 +2,30 @@ pub mod detach_ancestor; pub mod partitioning; pub mod utilization; -#[cfg(feature = "testing")] -use camino::Utf8PathBuf; -pub use utilization::PageserverUtilization; - use core::ops::Range; -use std::{ - collections::HashMap, - fmt::Display, - io::{BufRead, Read}, - num::{NonZeroU32, NonZeroU64, NonZeroUsize}, - str::FromStr, - time::{Duration, SystemTime}, -}; +use std::collections::HashMap; +use std::fmt::Display; +use std::io::{BufRead, Read}; +use std::num::{NonZeroU32, NonZeroU64, NonZeroUsize}; +use std::str::FromStr; +use std::time::{Duration, SystemTime}; use byteorder::{BigEndian, ReadBytesExt}; +use bytes::{Buf, BufMut, Bytes, BytesMut}; +#[cfg(feature = "testing")] +use camino::Utf8PathBuf; use postgres_ffi::BLCKSZ; use serde::{Deserialize, Deserializer, Serialize, Serializer}; use serde_with::serde_as; -use utils::{ - completion, - id::{NodeId, TenantId, TimelineId}, - lsn::Lsn, - postgres_client::PostgresClientProtocol, - serde_system_time, -}; +pub use utilization::PageserverUtilization; +use utils::id::{NodeId, TenantId, TimelineId}; +use utils::lsn::Lsn; +use utils::postgres_client::PostgresClientProtocol; +use utils::{completion, serde_system_time}; -use crate::{ - key::{CompactKey, Key}, - reltag::RelTag, - shard::{ShardCount, ShardStripeSize, TenantShardId}, -}; -use bytes::{Buf, BufMut, Bytes, BytesMut}; +use crate::key::{CompactKey, Key}; +use crate::reltag::RelTag; +use crate::shard::{ShardCount, ShardStripeSize, TenantShardId}; /// The state of a tenant in this pageserver. 
/// @@ -332,7 +324,8 @@ pub struct ImportPgdataIdempotencyKey(pub String); impl ImportPgdataIdempotencyKey { pub fn random() -> Self { - use rand::{distributions::Alphanumeric, Rng}; + use rand::Rng; + use rand::distributions::Alphanumeric; Self( rand::thread_rng() .sample_iter(&Alphanumeric) @@ -2288,9 +2281,10 @@ impl Default for PageTraceEvent { #[cfg(test)] mod tests { - use serde_json::json; use std::str::FromStr; + use serde_json::json; + use super::*; #[test] diff --git a/libs/pageserver_api/src/models/utilization.rs b/libs/pageserver_api/src/models/utilization.rs index 641aa51989..69c240ff3c 100644 --- a/libs/pageserver_api/src/models/utilization.rs +++ b/libs/pageserver_api/src/models/utilization.rs @@ -1,5 +1,7 @@ use std::time::SystemTime; -use utils::{serde_percent::Percent, serde_system_time}; + +use utils::serde_percent::Percent; +use utils::serde_system_time; /// Pageserver current utilization and scoring for how good candidate the pageserver would be for /// the next tenant. @@ -131,12 +133,12 @@ impl PageserverUtilization { /// Test helper pub mod test_utilization { - use super::PageserverUtilization; use std::time::SystemTime; - use utils::{ - serde_percent::Percent, - serde_system_time::{self}, - }; + + use utils::serde_percent::Percent; + use utils::serde_system_time::{self}; + + use super::PageserverUtilization; // Parameters of the imaginary node used for test utilization instances const TEST_DISK_SIZE: u64 = 1024 * 1024 * 1024 * 1024; diff --git a/libs/pageserver_api/src/record.rs b/libs/pageserver_api/src/record.rs index bb62b35d36..fda504a26e 100644 --- a/libs/pageserver_api/src/record.rs +++ b/libs/pageserver_api/src/record.rs @@ -1,7 +1,7 @@ //! This module defines the WAL record format used within the pageserver. use bytes::Bytes; -use postgres_ffi::walrecord::{describe_postgres_wal_record, MultiXactMember}; +use postgres_ffi::walrecord::{MultiXactMember, describe_postgres_wal_record}; use postgres_ffi::{MultiXactId, MultiXactOffset, TimestampTz, TransactionId}; use serde::{Deserialize, Serialize}; use utils::bin_ser::DeserializeError; diff --git a/libs/pageserver_api/src/reltag.rs b/libs/pageserver_api/src/reltag.rs index 09d1fae221..473a44dbf9 100644 --- a/libs/pageserver_api/src/reltag.rs +++ b/libs/pageserver_api/src/reltag.rs @@ -1,10 +1,10 @@ -use serde::{Deserialize, Serialize}; use std::cmp::Ordering; use std::fmt; -use postgres_ffi::pg_constants::GLOBALTABLESPACE_OID; -use postgres_ffi::relfile_utils::{forkname_to_number, forknumber_to_name, MAIN_FORKNUM}; use postgres_ffi::Oid; +use postgres_ffi::pg_constants::GLOBALTABLESPACE_OID; +use postgres_ffi::relfile_utils::{MAIN_FORKNUM, forkname_to_number, forknumber_to_name}; +use serde::{Deserialize, Serialize}; /// /// Relation data file segment id throughout the Postgres cluster. diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index e03df02afb..eca04b1f3d 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -33,12 +33,13 @@ use std::hash::{Hash, Hasher}; -use crate::{key::Key, models::ShardParameters}; +#[doc(inline)] +pub use ::utils::shard::*; use postgres_ffi::relfile_utils::INIT_FORKNUM; use serde::{Deserialize, Serialize}; -#[doc(inline)] -pub use ::utils::shard::*; +use crate::key::Key; +use crate::models::ShardParameters; /// The ShardIdentity contains enough information to map a [`Key`] to a [`ShardNumber`], /// and to check whether that [`ShardNumber`] is the same as the current shard. 
@@ -337,7 +338,8 @@ pub fn describe( mod tests { use std::str::FromStr; - use utils::{id::TenantId, Hex}; + use utils::Hex; + use utils::id::TenantId; use super::*; diff --git a/libs/pageserver_api/src/upcall_api.rs b/libs/pageserver_api/src/upcall_api.rs index 2e88836bd0..647d01c3c2 100644 --- a/libs/pageserver_api/src/upcall_api.rs +++ b/libs/pageserver_api/src/upcall_api.rs @@ -6,9 +6,9 @@ use serde::{Deserialize, Serialize}; use utils::id::NodeId; -use crate::{ - controller_api::NodeRegisterRequest, models::LocationConfigMode, shard::TenantShardId, -}; +use crate::controller_api::NodeRegisterRequest; +use crate::models::LocationConfigMode; +use crate::shard::TenantShardId; /// Upcall message sent by the pageserver to the configured `control_plane_api` on /// startup. @@ -30,7 +30,7 @@ fn default_mode() -> LocationConfigMode { pub struct ReAttachResponseTenant { pub id: TenantShardId, /// Mandatory if LocationConfigMode is None or set to an Attached* mode - pub gen: Option, + pub r#gen: Option, /// Default value only for backward compat: this field should be set #[serde(default = "default_mode")] @@ -44,7 +44,7 @@ pub struct ReAttachResponse { #[derive(Serialize, Deserialize)] pub struct ValidateRequestTenant { pub id: TenantShardId, - pub gen: u32, + pub r#gen: u32, } #[derive(Serialize, Deserialize)] diff --git a/libs/pageserver_api/src/value.rs b/libs/pageserver_api/src/value.rs index 1f8ed30a9a..883d903ff3 100644 --- a/libs/pageserver_api/src/value.rs +++ b/libs/pageserver_api/src/value.rs @@ -7,10 +7,11 @@ //! Note that the [`Value`] type is used for the permananent storage format, so any //! changes to it must be backwards compatible. -use crate::record::NeonWalRecord; use bytes::Bytes; use serde::{Deserialize, Serialize}; +use crate::record::NeonWalRecord; + #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] pub enum Value { /// An Image value contains a full copy of the value @@ -83,11 +84,11 @@ impl ValueBytes { #[cfg(test)] mod test { - use super::*; - use bytes::Bytes; use utils::bin_ser::BeSer; + use super::*; + macro_rules! 
roundtrip { ($orig:expr, $expected:expr) => {{ let orig: Value = $orig; diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index 33fa6e89f5..7bdf340f74 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "remote_storage" version = "0.1.0" -edition.workspace = true +edition = "2024" license.workspace = true [dependencies] diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index 9027a8bf55..dee61a410d 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -2,33 +2,26 @@ use std::borrow::Cow; use std::collections::HashMap; -use std::env; use std::fmt::Display; -use std::io; use std::num::NonZeroU32; use std::pin::Pin; use std::str::FromStr; use std::sync::Arc; -use std::time::Duration; -use std::time::SystemTime; +use std::time::{Duration, SystemTime}; +use std::{env, io}; -use super::REMOTE_STORAGE_PREFIX_SEPARATOR; -use anyhow::Context; -use anyhow::Result; +use anyhow::{Context, Result}; use azure_core::request_options::{IfMatchCondition, MaxResults, Metadata, Range}; -use azure_core::HttpClient; -use azure_core::TransportOptions; -use azure_core::{Continuable, RetryOptions}; +use azure_core::{Continuable, HttpClient, RetryOptions, TransportOptions}; use azure_storage::StorageCredentials; use azure_storage_blobs::blob::CopyStatus; -use azure_storage_blobs::prelude::ClientBuilder; -use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerClient}; +use azure_storage_blobs::blob::operations::GetBlobBuilder; +use azure_storage_blobs::prelude::{ClientBuilder, ContainerClient}; use bytes::Bytes; +use futures::FutureExt; use futures::future::Either; use futures::stream::Stream; -use futures::FutureExt; -use futures_util::StreamExt; -use futures_util::TryStreamExt; +use futures_util::{StreamExt, TryStreamExt}; use http_types::{StatusCode, Url}; use scopeguard::ScopeGuard; use tokio_util::sync::CancellationToken; @@ -36,12 +29,13 @@ use tracing::debug; use utils::backoff; use utils::backoff::exponential_backoff_duration_seconds; -use crate::metrics::{start_measuring_requests, AttemptOutcome, RequestKind}; -use crate::DownloadKind; +use super::REMOTE_STORAGE_PREFIX_SEPARATOR; +use crate::config::AzureConfig; +use crate::error::Cancelled; +use crate::metrics::{AttemptOutcome, RequestKind, start_measuring_requests}; use crate::{ - config::AzureConfig, error::Cancelled, ConcurrencyLimiter, Download, DownloadError, - DownloadOpts, Listing, ListingMode, ListingObject, RemotePath, RemoteStorage, StorageMetadata, - TimeTravelError, TimeoutOrCancel, + ConcurrencyLimiter, Download, DownloadError, DownloadKind, DownloadOpts, Listing, ListingMode, + ListingObject, RemotePath, RemoteStorage, StorageMetadata, TimeTravelError, TimeoutOrCancel, }; pub struct AzureBlobStorage { diff --git a/libs/remote_storage/src/config.rs b/libs/remote_storage/src/config.rs index ff34158c9c..52978be5b4 100644 --- a/libs/remote_storage/src/config.rs +++ b/libs/remote_storage/src/config.rs @@ -1,8 +1,10 @@ -use std::{fmt::Debug, num::NonZeroUsize, str::FromStr, time::Duration}; +use std::fmt::Debug; +use std::num::NonZeroUsize; +use std::str::FromStr; +use std::time::Duration; use aws_sdk_s3::types::StorageClass; use camino::Utf8PathBuf; - use serde::{Deserialize, Serialize}; use crate::{ diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 69b522d63e..6eb5570d9b 100644 --- a/libs/remote_storage/src/lib.rs +++ 
b/libs/remote_storage/src/lib.rs @@ -18,40 +18,35 @@ mod s3_bucket; mod simulate_failures; mod support; -use std::{ - collections::HashMap, - fmt::Debug, - num::NonZeroU32, - ops::Bound, - pin::{pin, Pin}, - sync::Arc, - time::SystemTime, -}; +use std::collections::HashMap; +use std::fmt::Debug; +use std::num::NonZeroU32; +use std::ops::Bound; +use std::pin::{Pin, pin}; +use std::sync::Arc; +use std::time::SystemTime; use anyhow::Context; -use camino::{Utf8Path, Utf8PathBuf}; - +/// Azure SDK's ETag type is a simple String wrapper: we use this internally instead of repeating it here. +pub use azure_core::Etag; use bytes::Bytes; -use futures::{stream::Stream, StreamExt}; +use camino::{Utf8Path, Utf8PathBuf}; +pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel}; +use futures::StreamExt; +use futures::stream::Stream; use itertools::Itertools as _; +use s3_bucket::RequestKind; use serde::{Deserialize, Serialize}; use tokio::sync::Semaphore; use tokio_util::sync::CancellationToken; use tracing::info; -pub use self::{ - azure_blob::AzureBlobStorage, local_fs::LocalFs, s3_bucket::S3Bucket, - simulate_failures::UnreliableWrapper, -}; -use s3_bucket::RequestKind; - +pub use self::azure_blob::AzureBlobStorage; +pub use self::local_fs::LocalFs; +pub use self::s3_bucket::S3Bucket; +pub use self::simulate_failures::UnreliableWrapper; pub use crate::config::{AzureConfig, RemoteStorageConfig, RemoteStorageKind, S3Config}; -/// Azure SDK's ETag type is a simple String wrapper: we use this internally instead of repeating it here. -pub use azure_core::Etag; - -pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel}; - /// Default concurrency limit for S3 operations /// /// Currently, sync happens with AWS S3, that has two limits on requests per second: @@ -640,8 +635,13 @@ impl GenericRemoteStorage { let profile = std::env::var("AWS_PROFILE").unwrap_or_else(|_| "".into()); let access_key_id = std::env::var("AWS_ACCESS_KEY_ID").unwrap_or_else(|_| "".into()); - info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}', profile: {profile}, access_key_id: {access_key_id}", - s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint); + info!( + "Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}', profile: {profile}, access_key_id: {access_key_id}", + s3_config.bucket_name, + s3_config.bucket_region, + s3_config.prefix_in_bucket, + s3_config.endpoint + ); Self::AwsS3(Arc::new(S3Bucket::new(s3_config, timeout).await?)) } RemoteStorageKind::AzureContainer(azure_config) => { @@ -649,8 +649,12 @@ impl GenericRemoteStorage { .storage_account .as_deref() .unwrap_or(""); - info!("Using azure container '{}' in account '{storage_account}' in region '{}' as a remote storage, prefix in container: '{:?}'", - azure_config.container_name, azure_config.container_region, azure_config.prefix_in_container); + info!( + "Using azure container '{}' in account '{storage_account}' in region '{}' as a remote storage, prefix in container: '{:?}'", + azure_config.container_name, + azure_config.container_region, + azure_config.prefix_in_container + ); Self::AzureBlob(Arc::new(AzureBlobStorage::new( azure_config, timeout, diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index a8b00173ba..f03d6ac8ee 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -4,31 +4,26 @@ //! 
This storage used in tests, but can also be used in cases when a certain persistent //! volume is mounted to the local FS. -use std::{ - collections::HashSet, - io::ErrorKind, - num::NonZeroU32, - time::{Duration, SystemTime, UNIX_EPOCH}, -}; +use std::collections::HashSet; +use std::io::ErrorKind; +use std::num::NonZeroU32; +use std::time::{Duration, SystemTime, UNIX_EPOCH}; -use anyhow::{bail, ensure, Context}; +use anyhow::{Context, bail, ensure}; use bytes::Bytes; use camino::{Utf8Path, Utf8PathBuf}; use futures::stream::Stream; -use tokio::{ - fs, - io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt}, -}; -use tokio_util::{io::ReaderStream, sync::CancellationToken}; +use tokio::fs; +use tokio::io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt}; +use tokio_util::io::ReaderStream; +use tokio_util::sync::CancellationToken; use utils::crashsafe::path_with_suffix_extension; -use crate::{ - Download, DownloadError, DownloadOpts, Listing, ListingMode, ListingObject, RemotePath, - TimeTravelError, TimeoutOrCancel, REMOTE_STORAGE_PREFIX_SEPARATOR, -}; - use super::{RemoteStorage, StorageMetadata}; -use crate::Etag; +use crate::{ + Download, DownloadError, DownloadOpts, Etag, Listing, ListingMode, ListingObject, + REMOTE_STORAGE_PREFIX_SEPARATOR, RemotePath, TimeTravelError, TimeoutOrCancel, +}; const LOCAL_FS_TEMP_FILE_SUFFIX: &str = "___temp"; @@ -91,7 +86,8 @@ impl LocalFs { #[cfg(test)] async fn list_all(&self) -> anyhow::Result> { - use std::{future::Future, pin::Pin}; + use std::future::Future; + use std::pin::Pin; fn get_all_files<'a, P>( directory_path: P, ) -> Pin>> + Send + Sync + 'a>> @@ -284,7 +280,9 @@ impl LocalFs { })?; if bytes_read < from_size_bytes { - bail!("Provided stream was shorter than expected: {bytes_read} vs {from_size_bytes} bytes"); + bail!( + "Provided stream was shorter than expected: {bytes_read} vs {from_size_bytes} bytes" + ); } // Check if there is any extra data after the given size. 
let mut from = buffer_to_read.into_inner(); @@ -642,10 +640,13 @@ fn mock_etag(meta: &std::fs::Metadata) -> Etag { #[cfg(test)] mod fs_tests { - use super::*; + use std::collections::HashMap; + use std::io::Write; + use std::ops::Bound; use camino_tempfile::tempdir; - use std::{collections::HashMap, io::Write, ops::Bound}; + + use super::*; async fn read_and_check_metadata( storage: &LocalFs, @@ -736,9 +737,14 @@ mod fs_tests { ); let non_existing_path = RemotePath::new(Utf8Path::new("somewhere/else"))?; - match storage.download(&non_existing_path, &DownloadOpts::default(), &cancel).await { + match storage + .download(&non_existing_path, &DownloadOpts::default(), &cancel) + .await + { Err(DownloadError::NotFound) => {} // Should get NotFound for non existing keys - other => panic!("Should get a NotFound error when downloading non-existing storage files, but got: {other:?}"), + other => panic!( + "Should get a NotFound error when downloading non-existing storage files, but got: {other:?}" + ), } Ok(()) } diff --git a/libs/remote_storage/src/metrics.rs b/libs/remote_storage/src/metrics.rs index 48c121fbc8..81e68e9a29 100644 --- a/libs/remote_storage/src/metrics.rs +++ b/libs/remote_storage/src/metrics.rs @@ -1,5 +1,5 @@ use metrics::{ - register_histogram_vec, register_int_counter, register_int_counter_vec, Histogram, IntCounter, + Histogram, IntCounter, register_histogram_vec, register_int_counter, register_int_counter_vec, }; use once_cell::sync::Lazy; @@ -16,8 +16,8 @@ pub(crate) enum RequestKind { Head = 6, } -use scopeguard::ScopeGuard; use RequestKind::*; +use scopeguard::ScopeGuard; impl RequestKind { const fn as_str(&self) -> &'static str { diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index d3f19f0b11..ba7ce9e1e7 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -4,56 +4,50 @@ //! allowing multiple api users to independently work with the same S3 bucket, if //! their bucket prefixes are both specified and different. 
-use std::{ - borrow::Cow, - collections::HashMap, - num::NonZeroU32, - pin::Pin, - sync::Arc, - task::{Context, Poll}, - time::{Duration, SystemTime}, -}; +use std::borrow::Cow; +use std::collections::HashMap; +use std::num::NonZeroU32; +use std::pin::Pin; +use std::sync::Arc; +use std::task::{Context, Poll}; +use std::time::{Duration, SystemTime}; -use anyhow::{anyhow, Context as _}; -use aws_config::{ - default_provider::credentials::DefaultCredentialsChain, - retry::{RetryConfigBuilder, RetryMode}, - BehaviorVersion, -}; -use aws_sdk_s3::{ - config::{AsyncSleep, IdentityCache, Region, SharedAsyncSleep}, - error::SdkError, - operation::{get_object::GetObjectError, head_object::HeadObjectError}, - types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion, StorageClass}, - Client, -}; +use anyhow::{Context as _, anyhow}; +use aws_config::BehaviorVersion; +use aws_config::default_provider::credentials::DefaultCredentialsChain; +use aws_config::retry::{RetryConfigBuilder, RetryMode}; +use aws_sdk_s3::Client; +use aws_sdk_s3::config::{AsyncSleep, IdentityCache, Region, SharedAsyncSleep}; +use aws_sdk_s3::error::SdkError; +use aws_sdk_s3::operation::get_object::GetObjectError; +use aws_sdk_s3::operation::head_object::HeadObjectError; +use aws_sdk_s3::types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion, StorageClass}; use aws_smithy_async::rt::sleep::TokioSleep; -use http_body_util::StreamBody; -use http_types::StatusCode; - -use aws_smithy_types::{body::SdkBody, DateTime}; -use aws_smithy_types::{byte_stream::ByteStream, date_time::ConversionError}; +use aws_smithy_types::DateTime; +use aws_smithy_types::body::SdkBody; +use aws_smithy_types::byte_stream::ByteStream; +use aws_smithy_types::date_time::ConversionError; use bytes::Bytes; use futures::stream::Stream; use futures_util::StreamExt; +use http_body_util::StreamBody; +use http_types::StatusCode; use hyper::body::Frame; use scopeguard::ScopeGuard; use tokio_util::sync::CancellationToken; use utils::backoff; use super::StorageMetadata; -use crate::{ - config::S3Config, - error::Cancelled, - metrics::{start_counting_cancelled_wait, start_measuring_requests}, - support::PermitCarrying, - ConcurrencyLimiter, Download, DownloadError, DownloadOpts, Listing, ListingMode, ListingObject, - RemotePath, RemoteStorage, TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE_S3, - REMOTE_STORAGE_PREFIX_SEPARATOR, -}; - -use crate::metrics::AttemptOutcome; +use crate::config::S3Config; +use crate::error::Cancelled; pub(super) use crate::metrics::RequestKind; +use crate::metrics::{AttemptOutcome, start_counting_cancelled_wait, start_measuring_requests}; +use crate::support::PermitCarrying; +use crate::{ + ConcurrencyLimiter, Download, DownloadError, DownloadOpts, Listing, ListingMode, ListingObject, + MAX_KEYS_PER_DELETE_S3, REMOTE_STORAGE_PREFIX_SEPARATOR, RemotePath, RemoteStorage, + TimeTravelError, TimeoutOrCancel, +}; /// AWS S3 storage. pub struct S3Bucket { @@ -958,8 +952,10 @@ impl RemoteStorage for S3Bucket { version_id, key, .. 
} = &vd; if version_id == "null" { - return Err(TimeTravelError::Other(anyhow!("Received ListVersions response for key={key} with version_id='null', \ - indicating either disabled versioning, or legacy objects with null version id values"))); + return Err(TimeTravelError::Other(anyhow!( + "Received ListVersions response for key={key} with version_id='null', \ + indicating either disabled versioning, or legacy objects with null version id values" + ))); } tracing::trace!( "Parsing version key={key} version_id={version_id} kind={:?}", @@ -1126,9 +1122,10 @@ impl VerOrDelete { #[cfg(test)] mod tests { - use camino::Utf8Path; use std::num::NonZeroUsize; + use camino::Utf8Path; + use crate::{RemotePath, S3Bucket, S3Config}; #[tokio::test] diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs index 63c24beb51..f56be873c4 100644 --- a/libs/remote_storage/src/simulate_failures.rs +++ b/libs/remote_storage/src/simulate_failures.rs @@ -1,14 +1,15 @@ //! This module provides a wrapper around a real RemoteStorage implementation that //! causes the first N attempts at each upload or download operatio to fail. For //! testing purposes. -use bytes::Bytes; -use futures::stream::Stream; -use futures::StreamExt; use std::collections::HashMap; +use std::collections::hash_map::Entry; use std::num::NonZeroU32; -use std::sync::Mutex; +use std::sync::{Arc, Mutex}; use std::time::SystemTime; -use std::{collections::hash_map::Entry, sync::Arc}; + +use bytes::Bytes; +use futures::StreamExt; +use futures::stream::Stream; use tokio_util::sync::CancellationToken; use crate::{ diff --git a/libs/remote_storage/src/support.rs b/libs/remote_storage/src/support.rs index 1ed9ed9305..07da38cf77 100644 --- a/libs/remote_storage/src/support.rs +++ b/libs/remote_storage/src/support.rs @@ -1,9 +1,7 @@ -use std::{ - future::Future, - pin::Pin, - task::{Context, Poll}, - time::Duration, -}; +use std::future::Future; +use std::pin::Pin; +use std::task::{Context, Poll}; +use std::time::Duration; use bytes::Bytes; use futures_util::Stream; @@ -114,9 +112,10 @@ pub(crate) fn cancel_or_timeout( #[cfg(test)] mod tests { + use futures::stream::StreamExt; + use super::*; use crate::DownloadError; - use futures::stream::StreamExt; #[tokio::test(start_paused = true)] async fn cancelled_download_stream() { diff --git a/libs/remote_storage/tests/common/tests.rs b/libs/remote_storage/tests/common/tests.rs index d5da1d48e9..6a78ddc01e 100644 --- a/libs/remote_storage/tests/common/tests.rs +++ b/libs/remote_storage/tests/common/tests.rs @@ -1,19 +1,20 @@ +use std::collections::HashSet; +use std::num::NonZeroU32; +use std::ops::Bound; +use std::sync::Arc; + use anyhow::Context; use camino::Utf8Path; use futures::StreamExt; use remote_storage::{DownloadError, DownloadOpts, ListingMode, ListingObject, RemotePath}; -use std::ops::Bound; -use std::sync::Arc; -use std::{collections::HashSet, num::NonZeroU32}; use test_context::test_context; use tokio_util::sync::CancellationToken; use tracing::debug; -use crate::common::{download_to_vec, upload_stream, wrap_stream}; - use super::{ MaybeEnabledStorage, MaybeEnabledStorageWithSimpleTestBlobs, MaybeEnabledStorageWithTestBlobs, }; +use crate::common::{download_to_vec, upload_stream, wrap_stream}; /// Tests that S3 client can list all prefixes, even if the response come paginated and requires multiple S3 queries. /// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified. 
@@ -62,7 +63,8 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a .into_iter() .collect::>(); assert_eq!( - root_remote_prefixes, HashSet::from([base_prefix.clone()]), + root_remote_prefixes, + HashSet::from([base_prefix.clone()]), "remote storage root prefixes list mismatches with the uploads. Returned prefixes: {root_remote_prefixes:?}" ); @@ -84,7 +86,8 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a .difference(&nested_remote_prefixes) .collect::>(); assert_eq!( - remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0, + remote_only_prefixes.len() + missing_uploaded_prefixes.len(), + 0, "remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}", ); @@ -119,7 +122,8 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a .difference(&nested_remote_prefixes_combined) .collect::>(); assert_eq!( - remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0, + remote_only_prefixes.len() + missing_uploaded_prefixes.len(), + 0, "remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}", ); diff --git a/libs/remote_storage/tests/test_real_azure.rs b/libs/remote_storage/tests/test_real_azure.rs index 15004dbf83..31c9ca3200 100644 --- a/libs/remote_storage/tests/test_real_azure.rs +++ b/libs/remote_storage/tests/test_real_azure.rs @@ -1,9 +1,9 @@ +use std::collections::HashSet; use std::env; use std::num::NonZeroUsize; use std::ops::ControlFlow; use std::sync::Arc; -use std::time::UNIX_EPOCH; -use std::{collections::HashSet, time::Duration}; +use std::time::{Duration, UNIX_EPOCH}; use anyhow::Context; use remote_storage::{ @@ -208,7 +208,7 @@ async fn create_azure_client( .as_millis(); // because nanos can be the same for two threads so can millis, add randomness - let random = rand::thread_rng().gen::(); + let random = rand::thread_rng().r#gen::(); let remote_storage_config = RemoteStorageConfig { storage: RemoteStorageKind::AzureContainer(AzureConfig { diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs index e60ec18c93..6996bb27ae 100644 --- a/libs/remote_storage/tests/test_real_s3.rs +++ b/libs/remote_storage/tests/test_real_s3.rs @@ -1,13 +1,12 @@ +use std::collections::HashSet; use std::env; use std::fmt::{Debug, Display}; use std::future::Future; use std::num::NonZeroUsize; use std::ops::ControlFlow; use std::sync::Arc; -use std::time::{Duration, UNIX_EPOCH}; -use std::{collections::HashSet, time::SystemTime}; +use std::time::{Duration, SystemTime, UNIX_EPOCH}; -use crate::common::{download_to_vec, upload_stream}; use anyhow::Context; use camino::Utf8Path; use futures_util::StreamExt; @@ -15,12 +14,13 @@ use remote_storage::{ DownloadError, DownloadOpts, GenericRemoteStorage, ListingMode, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config, }; -use test_context::test_context; -use test_context::AsyncTestContext; +use test_context::{AsyncTestContext, test_context}; use tokio::io::AsyncBufReadExt; use tokio_util::sync::CancellationToken; use tracing::info; +use crate::common::{download_to_vec, upload_stream}; + mod common; #[path = "common/tests.rs"] @@ -128,8 +128,10 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow: let t0_hwt = t0 + half_wt; let t1_hwt = t1 - half_wt; if 
!(t0_hwt..=t1_hwt).contains(&last_modified) { - panic!("last_modified={last_modified:?} is not between t0_hwt={t0_hwt:?} and t1_hwt={t1_hwt:?}. \ - This likely means a large lock discrepancy between S3 and the local clock."); + panic!( + "last_modified={last_modified:?} is not between t0_hwt={t0_hwt:?} and t1_hwt={t1_hwt:?}. \ + This likely means a large lock discrepancy between S3 and the local clock." + ); } } @@ -383,7 +385,7 @@ async fn create_s3_client( .as_millis(); // because nanos can be the same for two threads so can millis, add randomness - let random = rand::thread_rng().gen::(); + let random = rand::thread_rng().r#gen::(); let remote_storage_config = RemoteStorageConfig { storage: RemoteStorageKind::AwsS3(S3Config { diff --git a/libs/safekeeper_api/Cargo.toml b/libs/safekeeper_api/Cargo.toml index 6b72ace019..d9d080e8fe 100644 --- a/libs/safekeeper_api/Cargo.toml +++ b/libs/safekeeper_api/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "safekeeper_api" version = "0.1.0" -edition.workspace = true +edition = "2024" license.workspace = true [dependencies] diff --git a/libs/safekeeper_api/src/membership.rs b/libs/safekeeper_api/src/membership.rs index 2f20ec5f94..4ccdd491b0 100644 --- a/libs/safekeeper_api/src/membership.rs +++ b/libs/safekeeper_api/src/membership.rs @@ -2,7 +2,8 @@ //! rfcs/035-safekeeper-dynamic-membership-change.md //! for details. -use std::{collections::HashSet, fmt::Display}; +use std::collections::HashSet; +use std::fmt::Display; use anyhow; use anyhow::bail; @@ -148,9 +149,10 @@ impl Display for Configuration { #[cfg(test)] mod tests { - use super::{MemberSet, SafekeeperId}; use utils::id::NodeId; + use super::{MemberSet, SafekeeperId}; + #[test] fn test_member_set() { let mut members = MemberSet::empty(); diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs index 41ccdaa428..2f2aeaa429 100644 --- a/libs/safekeeper_api/src/models.rs +++ b/libs/safekeeper_api/src/models.rs @@ -1,18 +1,17 @@ //! Types used in safekeeper http API. Many of them are also reused internally. 
+use std::net::SocketAddr; + use pageserver_api::shard::ShardIdentity; use postgres_ffi::TimestampTz; use serde::{Deserialize, Serialize}; -use std::net::SocketAddr; use tokio::time::Instant; +use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId}; +use utils::lsn::Lsn; +use utils::pageserver_feedback::PageserverFeedback; -use utils::{ - id::{NodeId, TenantId, TenantTimelineId, TimelineId}, - lsn::Lsn, - pageserver_feedback::PageserverFeedback, -}; - -use crate::{membership::Configuration, ServerInfo, Term}; +use crate::membership::Configuration; +use crate::{ServerInfo, Term}; #[derive(Debug, Serialize)] pub struct SafekeeperStatus { diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 9d4463d595..7330856be4 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "pageserver" version = "0.1.0" -edition.workspace = true +edition = "2024" license.workspace = true [features] diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs index b67a9cc479..b1103948d6 100644 --- a/pageserver/benches/bench_ingest.rs +++ b/pageserver/benches/bench_ingest.rs @@ -1,22 +1,20 @@ -use std::{env, num::NonZeroUsize}; +use std::env; +use std::num::NonZeroUsize; use bytes::Bytes; use camino::Utf8PathBuf; -use criterion::{criterion_group, criterion_main, Criterion}; -use pageserver::{ - config::PageServerConf, - context::{DownloadBehavior, RequestContext}, - l0_flush::{L0FlushConfig, L0FlushGlobalState}, - page_cache, - task_mgr::TaskKind, - tenant::storage_layer::InMemoryLayer, - virtual_file, -}; -use pageserver_api::{key::Key, shard::TenantShardId, value::Value}; -use utils::{ - bin_ser::BeSer, - id::{TenantId, TimelineId}, -}; +use criterion::{Criterion, criterion_group, criterion_main}; +use pageserver::config::PageServerConf; +use pageserver::context::{DownloadBehavior, RequestContext}; +use pageserver::l0_flush::{L0FlushConfig, L0FlushGlobalState}; +use pageserver::task_mgr::TaskKind; +use pageserver::tenant::storage_layer::InMemoryLayer; +use pageserver::{page_cache, virtual_file}; +use pageserver_api::key::Key; +use pageserver_api::shard::TenantShardId; +use pageserver_api::value::Value; +use utils::bin_ser::BeSer; +use utils::id::{TenantId, TimelineId}; use wal_decoder::serialized_batch::SerializedValueBatch; // A very cheap hash for generating non-sequential keys. 
diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs index 5c5b52db44..e11af49449 100644 --- a/pageserver/benches/bench_layer_map.rs +++ b/pageserver/benches/bench_layer_map.rs @@ -1,23 +1,21 @@ -use criterion::measurement::WallTime; -use pageserver::keyspace::{KeyPartitioning, KeySpace}; -use pageserver::tenant::layer_map::LayerMap; -use pageserver::tenant::storage_layer::LayerName; -use pageserver::tenant::storage_layer::PersistentLayerDesc; -use pageserver_api::key::Key; -use pageserver_api::shard::TenantShardId; -use rand::prelude::{SeedableRng, SliceRandom, StdRng}; use std::cmp::{max, min}; use std::fs::File; use std::io::{BufRead, BufReader}; use std::path::PathBuf; use std::str::FromStr; use std::time::Instant; + +use criterion::measurement::WallTime; +use criterion::{BenchmarkGroup, Criterion, black_box, criterion_group, criterion_main}; +use pageserver::keyspace::{KeyPartitioning, KeySpace}; +use pageserver::tenant::layer_map::LayerMap; +use pageserver::tenant::storage_layer::{LayerName, PersistentLayerDesc}; +use pageserver_api::key::Key; +use pageserver_api::shard::TenantShardId; +use rand::prelude::{SeedableRng, SliceRandom, StdRng}; use utils::id::{TenantId, TimelineId}; - use utils::lsn::Lsn; -use criterion::{black_box, criterion_group, criterion_main, BenchmarkGroup, Criterion}; - fn fixture_path(relative: &str) -> PathBuf { PathBuf::from(env!("CARGO_MANIFEST_DIR")).join(relative) } diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs index d3551b56e1..77b3f90b3e 100644 --- a/pageserver/benches/bench_walredo.rs +++ b/pageserver/benches/bench_walredo.rs @@ -56,20 +56,23 @@ //! medium/128 time: [10.412 ms 10.574 ms 10.718 ms] //! ``` +use std::future::Future; +use std::sync::Arc; +use std::time::{Duration, Instant}; + use anyhow::Context; use bytes::{Buf, Bytes}; use criterion::{BenchmarkId, Criterion}; use once_cell::sync::Lazy; -use pageserver::{config::PageServerConf, walredo::PostgresRedoManager}; +use pageserver::config::PageServerConf; +use pageserver::walredo::PostgresRedoManager; +use pageserver_api::key::Key; use pageserver_api::record::NeonWalRecord; -use pageserver_api::{key::Key, shard::TenantShardId}; -use std::{ - future::Future, - sync::Arc, - time::{Duration, Instant}, -}; -use tokio::{sync::Barrier, task::JoinSet}; -use utils::{id::TenantId, lsn::Lsn}; +use pageserver_api::shard::TenantShardId; +use tokio::sync::Barrier; +use tokio::task::JoinSet; +use utils::id::TenantId; +use utils::lsn::Lsn; fn bench(c: &mut Criterion) { macro_rules! bench_group { diff --git a/pageserver/benches/upload_queue.rs b/pageserver/benches/upload_queue.rs index ed5daa8ae1..8de06a6c25 100644 --- a/pageserver/benches/upload_queue.rs +++ b/pageserver/benches/upload_queue.rs @@ -1,15 +1,15 @@ //! Upload queue benchmarks. 
use std::str::FromStr as _; -use std::sync::atomic::AtomicU32; use std::sync::Arc; +use std::sync::atomic::AtomicU32; -use criterion::{criterion_group, criterion_main, Bencher, Criterion}; +use criterion::{Bencher, Criterion, criterion_group, criterion_main}; +use pageserver::tenant::IndexPart; use pageserver::tenant::metadata::TimelineMetadata; use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; use pageserver::tenant::storage_layer::LayerName; use pageserver::tenant::upload_queue::{Delete, UploadOp, UploadQueue, UploadTask}; -use pageserver::tenant::IndexPart; use pprof::criterion::{Output, PProfProfiler}; use utils::generation::Generation; use utils::shard::{ShardCount, ShardIndex, ShardNumber}; diff --git a/pageserver/compaction/src/helpers.rs b/pageserver/compaction/src/helpers.rs index 6b739d85a7..7e4e3042b3 100644 --- a/pageserver/compaction/src/helpers.rs +++ b/pageserver/compaction/src/helpers.rs @@ -221,12 +221,12 @@ where // performed implicitly when `top` is dropped). if let Some(mut top) = this.heap.peek_mut() { match top.deref_mut() { - LazyLoadLayer::Unloaded(ref mut l) => { + LazyLoadLayer::Unloaded(l) => { let fut = l.load_keys(this.ctx); this.load_future.set(Some(Box::pin(fut))); continue; } - LazyLoadLayer::Loaded(ref mut entries) => { + LazyLoadLayer::Loaded(entries) => { let result = entries.pop_front().unwrap(); if entries.is_empty() { std::collections::binary_heap::PeekMut::pop(top); diff --git a/pageserver/pagebench/src/util/request_stats.rs b/pageserver/pagebench/src/util/request_stats.rs index 4aa6950782..ebe7bc031d 100644 --- a/pageserver/pagebench/src/util/request_stats.rs +++ b/pageserver/pagebench/src/util/request_stats.rs @@ -40,9 +40,7 @@ impl Stats { } } pub(crate) fn add(&mut self, other: &Self) { - let Self { - ref mut latency_histo, - } = self; + let Self { latency_histo } = self; latency_histo.add(&other.latency_histo).unwrap(); } } diff --git a/pageserver/src/assert_u64_eq_usize.rs b/pageserver/src/assert_u64_eq_usize.rs index 66ca7fd057..c4b8d9acba 100644 --- a/pageserver/src/assert_u64_eq_usize.rs +++ b/pageserver/src/assert_u64_eq_usize.rs @@ -2,7 +2,9 @@ pub(crate) const _ASSERT_U64_EQ_USIZE: () = { if std::mem::size_of::() != std::mem::size_of::() { - panic!("the traits defined in this module assume that usize and u64 can be converted to each other without loss of information"); + panic!( + "the traits defined in this module assume that usize and u64 can be converted to each other without loss of information" + ); } }; diff --git a/pageserver/src/aux_file.rs b/pageserver/src/aux_file.rs index 5cc20a70b2..b76c0e045f 100644 --- a/pageserver/src/aux_file.rs +++ b/pageserver/src/aux_file.rs @@ -2,7 +2,7 @@ use std::sync::Arc; use ::metrics::IntGauge; use bytes::{Buf, BufMut, Bytes}; -use pageserver_api::key::{Key, AUX_KEY_PREFIX, METADATA_KEY_SIZE}; +use pageserver_api::key::{AUX_KEY_PREFIX, Key, METADATA_KEY_SIZE}; use tracing::warn; // BEGIN Copyright (c) 2017 Servo Contributors diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 99b0775316..ce54bd9c1c 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -10,33 +10,31 @@ //! This module is responsible for creation of such tarball //! from data stored in object storage. //! 
-use anyhow::{anyhow, Context}; -use bytes::{BufMut, Bytes, BytesMut}; -use fail::fail_point; -use pageserver_api::key::{rel_block_to_key, Key}; -use postgres_ffi::pg_constants; use std::fmt::Write as FmtWrite; use std::time::{Instant, SystemTime}; + +use anyhow::{Context, anyhow}; +use bytes::{BufMut, Bytes, BytesMut}; +use fail::fail_point; +use pageserver_api::key::{Key, rel_block_to_key}; +use pageserver_api::reltag::{RelTag, SlruKind}; +use postgres_ffi::pg_constants::{ + DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID, PG_HBA, PGDATA_SPECIAL_FILES, +}; +use postgres_ffi::relfile_utils::{INIT_FORKNUM, MAIN_FORKNUM}; +use postgres_ffi::{ + BLCKSZ, PG_TLI, RELSEG_SIZE, WAL_SEGMENT_SIZE, XLogFileName, dispatch_pgversion, pg_constants, +}; use tokio::io; use tokio::io::AsyncWrite; -use tracing::*; - use tokio_tar::{Builder, EntryType, Header}; +use tracing::*; +use utils::lsn::Lsn; use crate::context::RequestContext; use crate::pgdatadir_mapping::Version; -use crate::tenant::storage_layer::IoConcurrency; use crate::tenant::Timeline; -use pageserver_api::reltag::{RelTag, SlruKind}; - -use postgres_ffi::dispatch_pgversion; -use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID}; -use postgres_ffi::pg_constants::{PGDATA_SPECIAL_FILES, PG_HBA}; -use postgres_ffi::relfile_utils::{INIT_FORKNUM, MAIN_FORKNUM}; -use postgres_ffi::XLogFileName; -use postgres_ffi::PG_TLI; -use postgres_ffi::{BLCKSZ, RELSEG_SIZE, WAL_SEGMENT_SIZE}; -use utils::lsn::Lsn; +use crate::tenant::storage_layer::IoConcurrency; #[derive(Debug, thiserror::Error)] pub enum BasebackupError { diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index e2b9a7f073..ab8d37df2e 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -3,49 +3,41 @@ //! Main entry point for the Page Server executable. 
use std::env; -use std::env::{var, VarError}; +use std::env::{VarError, var}; use std::io::Read; use std::str::FromStr; use std::sync::Arc; use std::time::Duration; -use anyhow::{anyhow, Context}; +use anyhow::{Context, anyhow}; use camino::Utf8Path; use clap::{Arg, ArgAction, Command}; - -use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp}; -use pageserver::config::PageserverIdentity; +use metrics::launch_timestamp::{LaunchTimestamp, set_launch_timestamp_metric}; +use metrics::set_build_info_metric; +use pageserver::config::{PageServerConf, PageserverIdentity}; use pageserver::controller_upcall_client::ControllerUpcallClient; +use pageserver::deletion_queue::DeletionQueue; use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task}; use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING}; -use pageserver::task_mgr::{COMPUTE_REQUEST_RUNTIME, WALRECEIVER_RUNTIME}; -use pageserver::tenant::{secondary, TenantSharedResources}; -use pageserver::{CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener}; +use pageserver::task_mgr::{ + BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME, +}; +use pageserver::tenant::{TenantSharedResources, mgr, secondary}; +use pageserver::{ + CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, http, page_cache, page_service, + task_mgr, virtual_file, +}; +use postgres_backend::AuthType; use remote_storage::GenericRemoteStorage; use tokio::signal::unix::SignalKind; use tokio::time::Instant; use tokio_util::sync::CancellationToken; use tracing::*; - -use metrics::set_build_info_metric; -use pageserver::{ - config::PageServerConf, - deletion_queue::DeletionQueue, - http, page_cache, page_service, task_mgr, - task_mgr::{BACKGROUND_RUNTIME, MGMT_REQUEST_RUNTIME}, - tenant::mgr, - virtual_file, -}; -use postgres_backend::AuthType; +use utils::auth::{JwtAuth, SwappableJwtAuth}; use utils::crashsafe::syncfs; -use utils::failpoint_support; use utils::logging::TracingErrorLayerEnablement; -use utils::{ - auth::{JwtAuth, SwappableJwtAuth}, - logging, project_build_tag, project_git_version, - sentry_init::init_sentry, - tcp_listener, -}; +use utils::sentry_init::init_sentry; +use utils::{failpoint_support, logging, project_build_tag, project_git_version, tcp_listener}; project_git_version!(GIT_VERSION); project_build_tag!(BUILD_TAG); @@ -57,7 +49,7 @@ static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; /// This adds roughly 3% overhead for allocations on average, which is acceptable considering /// performance-sensitive code will avoid allocations as far as possible anyway. 
#[allow(non_upper_case_globals)] -#[export_name = "malloc_conf"] +#[unsafe(export_name = "malloc_conf")] pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0"; const PID_FILE_NAME: &str = "pageserver.pid"; @@ -85,6 +77,9 @@ fn main() -> anyhow::Result<()> { return Ok(()); } + // Initialize up failpoints support + let scenario = failpoint_support::init(); + let workdir = arg_matches .get_one::("workdir") .map(Utf8Path::new) @@ -178,9 +173,6 @@ fn main() -> anyhow::Result<()> { } } - // Initialize up failpoints support - let scenario = failpoint_support::init(); - // Basic initialization of things that don't change after startup tracing::info!("Initializing virtual_file..."); virtual_file::init( @@ -217,7 +209,9 @@ fn initialize_config( Ok(mut f) => { let md = f.metadata().context("stat config file")?; if !md.is_file() { - anyhow::bail!("Pageserver found identity file but it is a dir entry: {identity_file_path}. Aborting start up ..."); + anyhow::bail!( + "Pageserver found identity file but it is a dir entry: {identity_file_path}. Aborting start up ..." + ); } let mut s = String::new(); @@ -225,7 +219,9 @@ fn initialize_config( toml_edit::de::from_str::(&s)? } Err(e) => { - anyhow::bail!("Pageserver could not read identity file: {identity_file_path}: {e}. Aborting start up ..."); + anyhow::bail!( + "Pageserver could not read identity file: {identity_file_path}: {e}. Aborting start up ..." + ); } }; @@ -401,11 +397,9 @@ fn start_pageserver( Err(VarError::NotPresent) => { info!("No JWT token for authentication with Safekeeper detected"); } - Err(e) => { - return Err(e).with_context(|| { - "Failed to either load to detect non-present NEON_AUTH_TOKEN environment variable" - }) - } + Err(e) => return Err(e).with_context( + || "Failed to either load to detect non-present NEON_AUTH_TOKEN environment variable", + ), }; // Top-level cancellation token for the process @@ -711,7 +705,9 @@ async fn create_remote_storage_client( // wrapper that simulates failures. if conf.test_remote_failures > 0 { if !cfg!(feature = "testing") { - anyhow::bail!("test_remote_failures option is not available because pageserver was compiled without the 'testing' feature"); + anyhow::bail!( + "test_remote_failures option is not available because pageserver was compiled without the 'testing' feature" + ); } info!( "Simulating remote failures for first {} attempts of each op", diff --git a/pageserver/src/bin/test_helper_slow_client_reads.rs b/pageserver/src/bin/test_helper_slow_client_reads.rs index c1ce332b6c..0215dd06fb 100644 --- a/pageserver/src/bin/test_helper_slow_client_reads.rs +++ b/pageserver/src/bin/test_helper_slow_client_reads.rs @@ -1,14 +1,10 @@ -use std::{ - io::{stdin, stdout, Read, Write}, - time::Duration, -}; +use std::io::{Read, Write, stdin, stdout}; +use std::time::Duration; use clap::Parser; use pageserver_api::models::{PagestreamRequest, PagestreamTestRequest}; -use utils::{ - id::{TenantId, TimelineId}, - lsn::Lsn, -}; +use utils::id::{TenantId, TimelineId}; +use utils::lsn::Lsn; #[derive(clap::Parser)] struct Args { diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 09d9444dd5..64d00882b9 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -4,36 +4,29 @@ //! file, or on the command line. //! See also `settings.md` for better description on every parameter. 
-use anyhow::{bail, ensure, Context}; -use pageserver_api::models::ImageCompressionAlgorithm; -use pageserver_api::{ - config::{DiskUsageEvictionTaskConfig, MaxVectoredReadBytes}, - shard::TenantShardId, -}; -use remote_storage::{RemotePath, RemoteStorageConfig}; use std::env; -use storage_broker::Uri; -use utils::logging::SecretString; -use utils::postgres_client::PostgresClientProtocol; - -use once_cell::sync::OnceCell; -use reqwest::Url; use std::num::NonZeroUsize; use std::sync::Arc; use std::time::Duration; +use anyhow::{Context, bail, ensure}; use camino::{Utf8Path, Utf8PathBuf}; +use once_cell::sync::OnceCell; +use pageserver_api::config::{DiskUsageEvictionTaskConfig, MaxVectoredReadBytes}; +use pageserver_api::models::ImageCompressionAlgorithm; +use pageserver_api::shard::TenantShardId; use postgres_backend::AuthType; -use utils::{ - id::{NodeId, TimelineId}, - logging::LogFormat, -}; +use remote_storage::{RemotePath, RemoteStorageConfig}; +use reqwest::Url; +use storage_broker::Uri; +use utils::id::{NodeId, TimelineId}; +use utils::logging::{LogFormat, SecretString}; +use utils::postgres_client::PostgresClientProtocol; use crate::tenant::storage_layer::inmemory_layer::IndexEntry; use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; -use crate::virtual_file; use crate::virtual_file::io_engine; -use crate::{TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME}; +use crate::{TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME, virtual_file}; /// Global state of pageserver. /// @@ -440,7 +433,9 @@ impl PageServerConf { io_engine::FeatureTestResult::PlatformPreferred(v) => v, // make no noise io_engine::FeatureTestResult::Worse { engine, remark } => { // TODO: bubble this up to the caller so we can tracing::warn! it. - eprintln!("auto-detected IO engine is not platform-preferred: engine={engine:?} remark={remark:?}"); + eprintln!( + "auto-detected IO engine is not platform-preferred: engine={engine:?} remark={remark:?}" + ); engine } }, diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index 7e8c00c293..0231190e69 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -1,13 +1,9 @@ //! Periodically collect consumption metrics for all active tenants //! and push them to a HTTP endpoint. 
-use crate::config::PageServerConf; -use crate::consumption_metrics::metrics::MetricsKey; -use crate::consumption_metrics::upload::KeyGen as _; -use crate::context::{DownloadBehavior, RequestContext}; -use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}; -use crate::tenant::size::CalculateSyntheticSizeError; -use crate::tenant::tasks::BackgroundLoopKind; -use crate::tenant::{mgr::TenantManager, LogicalSizeCalculationCause, Tenant}; +use std::collections::HashMap; +use std::sync::Arc; +use std::time::{Duration, SystemTime}; + use camino::Utf8PathBuf; use consumption_metrics::EventType; use itertools::Itertools as _; @@ -15,14 +11,21 @@ use pageserver_api::models::TenantState; use remote_storage::{GenericRemoteStorage, RemoteStorageConfig}; use reqwest::Url; use serde::{Deserialize, Serialize}; -use std::collections::HashMap; -use std::sync::Arc; -use std::time::{Duration, SystemTime}; use tokio::time::Instant; use tokio_util::sync::CancellationToken; use tracing::*; use utils::id::NodeId; +use crate::config::PageServerConf; +use crate::consumption_metrics::metrics::MetricsKey; +use crate::consumption_metrics::upload::KeyGen as _; +use crate::context::{DownloadBehavior, RequestContext}; +use crate::task_mgr::{self, BACKGROUND_RUNTIME, TaskKind}; +use crate::tenant::mgr::TenantManager; +use crate::tenant::size::CalculateSyntheticSizeError; +use crate::tenant::tasks::BackgroundLoopKind; +use crate::tenant::{LogicalSizeCalculationCause, Tenant}; + mod disk_cache; mod metrics; mod upload; diff --git a/pageserver/src/consumption_metrics/disk_cache.rs b/pageserver/src/consumption_metrics/disk_cache.rs index 54a505a134..f1dad8793d 100644 --- a/pageserver/src/consumption_metrics/disk_cache.rs +++ b/pageserver/src/consumption_metrics/disk_cache.rs @@ -1,10 +1,10 @@ -use anyhow::Context; -use camino::{Utf8Path, Utf8PathBuf}; use std::sync::Arc; -use crate::consumption_metrics::NewMetricsRefRoot; +use anyhow::Context; +use camino::{Utf8Path, Utf8PathBuf}; use super::{NewMetricsRoot, NewRawMetric, RawMetric}; +use crate::consumption_metrics::NewMetricsRefRoot; pub(super) fn read_metrics_from_serde_value( json_value: serde_json::Value, diff --git a/pageserver/src/consumption_metrics/metrics.rs b/pageserver/src/consumption_metrics/metrics.rs index 07fac09f6f..71910011ea 100644 --- a/pageserver/src/consumption_metrics/metrics.rs +++ b/pageserver/src/consumption_metrics/metrics.rs @@ -1,15 +1,16 @@ -use crate::tenant::mgr::TenantManager; -use crate::{context::RequestContext, tenant::timeline::logical_size::CurrentLogicalSize}; +use std::sync::Arc; +use std::time::SystemTime; + use chrono::{DateTime, Utc}; use consumption_metrics::EventType; use futures::stream::StreamExt; -use std::{sync::Arc, time::SystemTime}; -use utils::{ - id::{TenantId, TimelineId}, - lsn::Lsn, -}; +use utils::id::{TenantId, TimelineId}; +use utils::lsn::Lsn; use super::{Cache, NewRawMetric}; +use crate::context::RequestContext; +use crate::tenant::mgr::TenantManager; +use crate::tenant::timeline::logical_size::CurrentLogicalSize; /// Name of the metric, used by `MetricsKey` factory methods and `deserialize_cached_events` /// instead of static str. 
diff --git a/pageserver/src/consumption_metrics/metrics/tests.rs b/pageserver/src/consumption_metrics/metrics/tests.rs index 3ed7b44123..52b4fb8680 100644 --- a/pageserver/src/consumption_metrics/metrics/tests.rs +++ b/pageserver/src/consumption_metrics/metrics/tests.rs @@ -1,7 +1,7 @@ -use crate::consumption_metrics::RawMetric; +use std::collections::HashMap; use super::*; -use std::collections::HashMap; +use crate::consumption_metrics::RawMetric; #[test] fn startup_collected_timeline_metrics_before_advancing() { diff --git a/pageserver/src/consumption_metrics/upload.rs b/pageserver/src/consumption_metrics/upload.rs index 448bf47525..59e0145a5b 100644 --- a/pageserver/src/consumption_metrics/upload.rs +++ b/pageserver/src/consumption_metrics/upload.rs @@ -2,15 +2,16 @@ use std::error::Error as _; use std::time::SystemTime; use chrono::{DateTime, Utc}; -use consumption_metrics::{Event, EventChunk, IdempotencyKey, CHUNK_SIZE}; +use consumption_metrics::{CHUNK_SIZE, Event, EventChunk, IdempotencyKey}; use remote_storage::{GenericRemoteStorage, RemotePath}; use tokio::io::AsyncWriteExt; use tokio_util::sync::CancellationToken; use tracing::Instrument; - -use super::{metrics::Name, Cache, MetricsKey, NewRawMetric, RawMetric}; use utils::id::{TenantId, TimelineId}; +use super::metrics::Name; +use super::{Cache, MetricsKey, NewRawMetric, RawMetric}; + /// How the metrics from pageserver are identified. #[derive(serde::Serialize, serde::Deserialize, Debug, Clone, Copy, PartialEq)] struct Ids { @@ -438,14 +439,13 @@ async fn upload( #[cfg(test)] mod tests { - use crate::consumption_metrics::{ - disk_cache::read_metrics_from_serde_value, NewMetricsRefRoot, - }; - - use super::*; use chrono::{DateTime, Utc}; use once_cell::sync::Lazy; + use super::*; + use crate::consumption_metrics::NewMetricsRefRoot; + use crate::consumption_metrics::disk_cache::read_metrics_from_serde_value; + #[test] fn chunked_serialization() { let examples = metric_samples(); diff --git a/pageserver/src/controller_upcall_client.rs b/pageserver/src/controller_upcall_client.rs index 4990f17b40..8462594607 100644 --- a/pageserver/src/controller_upcall_client.rs +++ b/pageserver/src/controller_upcall_client.rs @@ -1,21 +1,23 @@ use std::collections::HashMap; use futures::Future; -use pageserver_api::{ - controller_api::{AvailabilityZone, NodeRegisterRequest}, - shard::TenantShardId, - upcall_api::{ - ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest, - ValidateRequestTenant, ValidateResponse, - }, +use pageserver_api::config::NodeMetadata; +use pageserver_api::controller_api::{AvailabilityZone, NodeRegisterRequest}; +use pageserver_api::shard::TenantShardId; +use pageserver_api::upcall_api::{ + ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest, + ValidateRequestTenant, ValidateResponse, }; -use serde::{de::DeserializeOwned, Serialize}; +use serde::Serialize; +use serde::de::DeserializeOwned; use tokio_util::sync::CancellationToken; use url::Url; -use utils::{backoff, failpoint_support, generation::Generation, id::NodeId}; +use utils::generation::Generation; +use utils::id::NodeId; +use utils::{backoff, failpoint_support}; -use crate::{config::PageServerConf, virtual_file::on_fatal_io_error}; -use pageserver_api::config::NodeMetadata; +use crate::config::PageServerConf; +use crate::virtual_file::on_fatal_io_error; /// The Pageserver's client for using the storage controller upcall API: this is a small API /// for dealing with generations (see docs/rfcs/025-generation-numbers.md). 
@@ -157,14 +159,18 @@ impl ControlPlaneGenerationsApi for ControllerUpcallClient { match az_id_from_metadata { Some(az_id) => Some(AvailabilityZone(az_id)), None => { - tracing::warn!("metadata.json does not contain an 'availability_zone_id' field"); + tracing::warn!( + "metadata.json does not contain an 'availability_zone_id' field" + ); conf.availability_zone.clone().map(AvailabilityZone) } } }; if az_id.is_none() { - panic!("Availablity zone id could not be inferred from metadata.json or pageserver config"); + panic!( + "Availablity zone id could not be inferred from metadata.json or pageserver config" + ); } Some(NodeRegisterRequest { @@ -236,7 +242,7 @@ impl ControlPlaneGenerationsApi for ControllerUpcallClient { .iter() .map(|(id, generation)| ValidateRequestTenant { id: *id, - gen: (*generation).into().expect( + r#gen: (*generation).into().expect( "Generation should always be valid for a Tenant doing deletions", ), }) diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index a2395b0dca..8118f66252 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -6,38 +6,31 @@ use std::collections::HashMap; use std::sync::Arc; use std::time::Duration; -use crate::controller_upcall_client::ControlPlaneGenerationsApi; -use crate::metrics; -use crate::tenant::remote_timeline_client::remote_timeline_path; -use crate::tenant::remote_timeline_client::LayerFileMetadata; -use crate::virtual_file::MaybeFatalIo; -use crate::virtual_file::VirtualFile; use anyhow::Context; use camino::Utf8PathBuf; +use deleter::DeleterMessage; +use list_writer::ListWriterQueueMessage; use pageserver_api::shard::TenantShardId; use remote_storage::{GenericRemoteStorage, RemotePath}; -use serde::Deserialize; -use serde::Serialize; +use serde::{Deserialize, Serialize}; use thiserror::Error; use tokio_util::sync::CancellationToken; -use tracing::Instrument; -use tracing::{debug, error}; +use tracing::{Instrument, debug, error}; use utils::crashsafe::path_with_suffix_extension; use utils::generation::Generation; use utils::id::TimelineId; -use utils::lsn::AtomicLsn; -use utils::lsn::Lsn; - -use self::deleter::Deleter; -use self::list_writer::DeletionOp; -use self::list_writer::ListWriter; -use self::list_writer::RecoverOp; -use self::validator::Validator; -use deleter::DeleterMessage; -use list_writer::ListWriterQueueMessage; +use utils::lsn::{AtomicLsn, Lsn}; use validator::ValidatorQueueMessage; -use crate::{config::PageServerConf, tenant::storage_layer::LayerName}; +use self::deleter::Deleter; +use self::list_writer::{DeletionOp, ListWriter, RecoverOp}; +use self::validator::Validator; +use crate::config::PageServerConf; +use crate::controller_upcall_client::ControlPlaneGenerationsApi; +use crate::metrics; +use crate::tenant::remote_timeline_client::{LayerFileMetadata, remote_timeline_path}; +use crate::tenant::storage_layer::LayerName; +use crate::virtual_file::{MaybeFatalIo, VirtualFile}; // TODO: configurable for how long to wait before executing deletions @@ -664,21 +657,22 @@ impl DeletionQueue { #[cfg(test)] mod test { + use std::io::ErrorKind; + use std::time::Duration; + use camino::Utf8Path; use hex_literal::hex; - use pageserver_api::{key::Key, shard::ShardIndex, upcall_api::ReAttachResponseTenant}; - use std::{io::ErrorKind, time::Duration}; - use tracing::info; - + use pageserver_api::key::Key; + use pageserver_api::shard::ShardIndex; + use pageserver_api::upcall_api::ReAttachResponseTenant; use remote_storage::{RemoteStorageConfig, RemoteStorageKind}; 
use tokio::task::JoinHandle; - - use crate::{ - controller_upcall_client::RetryForeverError, - tenant::{harness::TenantHarness, storage_layer::DeltaLayerName}, - }; + use tracing::info; use super::*; + use crate::controller_upcall_client::RetryForeverError; + use crate::tenant::harness::TenantHarness; + use crate::tenant::storage_layer::DeltaLayerName; pub const TIMELINE_ID: TimelineId = TimelineId::from_array(hex!("11223344556677881122334455667788")); @@ -724,26 +718,26 @@ mod test { .expect("Failed to join workers for previous deletion queue"); } - fn set_latest_generation(&self, gen: Generation) { + fn set_latest_generation(&self, gen_: Generation) { let tenant_shard_id = self.harness.tenant_shard_id; self.mock_control_plane .latest_generation .lock() .unwrap() - .insert(tenant_shard_id, gen); + .insert(tenant_shard_id, gen_); } /// Returns remote layer file name, suitable for use in assert_remote_files fn write_remote_layer( &self, file_name: LayerName, - gen: Generation, + gen_: Generation, ) -> anyhow::Result { let tenant_shard_id = self.harness.tenant_shard_id; let relative_remote_path = remote_timeline_path(&tenant_shard_id, &TIMELINE_ID); let remote_timeline_path = self.remote_fs_dir.join(relative_remote_path.get_path()); std::fs::create_dir_all(&remote_timeline_path)?; - let remote_layer_file_name = format!("{}{}", file_name, gen.get_suffix()); + let remote_layer_file_name = format!("{}{}", file_name, gen_.get_suffix()); let content: Vec = format!("placeholder contents of {file_name}").into(); @@ -1098,11 +1092,12 @@ mod test { /// or coalescing, and doesn't actually execute any deletions unless you call pump() to kick it. #[cfg(test)] pub(crate) mod mock { + use std::sync::atomic::{AtomicUsize, Ordering}; + use tracing::info; use super::*; use crate::tenant::remote_timeline_client::remote_layer_path; - use std::sync::atomic::{AtomicUsize, Ordering}; pub struct ConsumerState { rx: tokio::sync::mpsc::UnboundedReceiver, diff --git a/pageserver/src/deletion_queue/deleter.rs b/pageserver/src/deletion_queue/deleter.rs index ef1dfbac19..691ba75cc7 100644 --- a/pageserver/src/deletion_queue/deleter.rs +++ b/pageserver/src/deletion_queue/deleter.rs @@ -6,21 +6,16 @@ //! number of full-sized DeleteObjects requests, rather than a larger number of //! smaller requests. -use remote_storage::GenericRemoteStorage; -use remote_storage::RemotePath; -use remote_storage::TimeoutOrCancel; use std::time::Duration; + +use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel}; use tokio_util::sync::CancellationToken; -use tracing::info; -use tracing::warn; -use utils::backoff; -use utils::pausable_failpoint; +use tracing::{info, warn}; +use utils::{backoff, pausable_failpoint}; +use super::{DeletionQueueError, FlushOp}; use crate::metrics; -use super::DeletionQueueError; -use super::FlushOp; - const AUTOFLUSH_INTERVAL: Duration = Duration::from_secs(10); pub(super) enum DeleterMessage { diff --git a/pageserver/src/deletion_queue/list_writer.rs b/pageserver/src/deletion_queue/list_writer.rs index ae3b2c9180..a385e35a02 100644 --- a/pageserver/src/deletion_queue/list_writer.rs +++ b/pageserver/src/deletion_queue/list_writer.rs @@ -10,11 +10,6 @@ //! //! DeletionLists are passed onwards to the Validator. 
-use super::DeletionHeader; -use super::DeletionList; -use super::FlushOp; -use super::ValidatorQueueMessage; - use std::collections::HashMap; use std::fs::create_dir_all; use std::time::Duration; @@ -23,20 +18,17 @@ use pageserver_api::shard::TenantShardId; use regex::Regex; use remote_storage::RemotePath; use tokio_util::sync::CancellationToken; -use tracing::debug; -use tracing::info; -use tracing::warn; +use tracing::{debug, info, warn}; use utils::generation::Generation; use utils::id::TimelineId; +use super::{DeletionHeader, DeletionList, FlushOp, ValidatorQueueMessage}; use crate::config::PageServerConf; use crate::deletion_queue::TEMP_SUFFIX; use crate::metrics; -use crate::tenant::remote_timeline_client::remote_layer_path; -use crate::tenant::remote_timeline_client::LayerFileMetadata; +use crate::tenant::remote_timeline_client::{LayerFileMetadata, remote_layer_path}; use crate::tenant::storage_layer::LayerName; -use crate::virtual_file::on_fatal_io_error; -use crate::virtual_file::MaybeFatalIo; +use crate::virtual_file::{MaybeFatalIo, on_fatal_io_error}; // The number of keys in a DeletionList before we will proactively persist it // (without reaching a flush deadline). This aims to deliver objects of the order diff --git a/pageserver/src/deletion_queue/validator.rs b/pageserver/src/deletion_queue/validator.rs index 1d55581ebd..b0ce2b80b4 100644 --- a/pageserver/src/deletion_queue/validator.rs +++ b/pageserver/src/deletion_queue/validator.rs @@ -20,22 +20,14 @@ use std::time::Duration; use camino::Utf8PathBuf; use tokio_util::sync::CancellationToken; -use tracing::debug; -use tracing::info; -use tracing::warn; - -use crate::config::PageServerConf; -use crate::controller_upcall_client::ControlPlaneGenerationsApi; -use crate::controller_upcall_client::RetryForeverError; -use crate::metrics; -use crate::virtual_file::MaybeFatalIo; +use tracing::{debug, info, warn}; use super::deleter::DeleterMessage; -use super::DeletionHeader; -use super::DeletionList; -use super::DeletionQueueError; -use super::FlushOp; -use super::VisibleLsnUpdates; +use super::{DeletionHeader, DeletionList, DeletionQueueError, FlushOp, VisibleLsnUpdates}; +use crate::config::PageServerConf; +use crate::controller_upcall_client::{ControlPlaneGenerationsApi, RetryForeverError}; +use crate::metrics; +use crate::virtual_file::MaybeFatalIo; // After this length of time, do any validation work that is pending, // even if we haven't accumulated many keys to delete. @@ -190,7 +182,10 @@ where } } else { // If we failed validation, then do not apply any of the projected updates - info!("Dropped remote consistent LSN updates for tenant {tenant_id} in stale generation {:?}", tenant_lsn_state.generation); + info!( + "Dropped remote consistent LSN updates for tenant {tenant_id} in stale generation {:?}", + tenant_lsn_state.generation + ); metrics::DELETION_QUEUE.dropped_lsn_updates.inc(); } } diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 738a783813..13252037e5 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -41,30 +41,31 @@ // - The `#[allow(dead_code)]` above various structs are to suppress warnings about only the Debug impl // reading these fields. We use the Debug impl for semi-structured logging, though. 
-use std::{sync::Arc, time::SystemTime}; +use std::sync::Arc; +use std::time::SystemTime; use anyhow::Context; -use pageserver_api::{config::DiskUsageEvictionTaskConfig, shard::TenantShardId}; +use pageserver_api::config::DiskUsageEvictionTaskConfig; +use pageserver_api::shard::TenantShardId; use remote_storage::GenericRemoteStorage; use serde::Serialize; use tokio::time::Instant; use tokio_util::sync::CancellationToken; -use tracing::{debug, error, info, instrument, warn, Instrument}; -use utils::{completion, id::TimelineId}; +use tracing::{Instrument, debug, error, info, instrument, warn}; +use utils::completion; +use utils::id::TimelineId; -use crate::{ - config::PageServerConf, - metrics::disk_usage_based_eviction::METRICS, - task_mgr::{self, BACKGROUND_RUNTIME}, - tenant::{ - mgr::TenantManager, - remote_timeline_client::LayerFileMetadata, - secondary::SecondaryTenant, - storage_layer::{AsLayerDesc, EvictionError, Layer, LayerName, LayerVisibilityHint}, - tasks::sleep_random, - }, - CancellableTask, DiskUsageEvictionTask, +use crate::config::PageServerConf; +use crate::metrics::disk_usage_based_eviction::METRICS; +use crate::task_mgr::{self, BACKGROUND_RUNTIME}; +use crate::tenant::mgr::TenantManager; +use crate::tenant::remote_timeline_client::LayerFileMetadata; +use crate::tenant::secondary::SecondaryTenant; +use crate::tenant::storage_layer::{ + AsLayerDesc, EvictionError, Layer, LayerName, LayerVisibilityHint, }; +use crate::tenant::tasks::sleep_random; +use crate::{CancellableTask, DiskUsageEvictionTask}; /// Selects the sort order for eviction candidates *after* per tenant `min_resident_size` /// partitioning. @@ -1007,10 +1008,14 @@ async fn collect_eviction_candidates( } } - debug_assert!(EvictionPartition::Above < EvictionPartition::Below, - "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first"); - debug_assert!(EvictionPartition::EvictNow < EvictionPartition::Above, - "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first"); + debug_assert!( + EvictionPartition::Above < EvictionPartition::Below, + "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first" + ); + debug_assert!( + EvictionPartition::EvictNow < EvictionPartition::Above, + "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first" + ); eviction_order.sort(&mut candidates); @@ -1157,9 +1162,8 @@ mod filesystem_level_usage { use anyhow::Context; use camino::Utf8Path; - use crate::statvfs::Statvfs; - use super::DiskUsageEvictionTaskConfig; + use crate::statvfs::Statvfs; #[derive(Debug, Clone, Copy)] pub struct Usage<'a> { @@ -1224,10 +1228,12 @@ mod filesystem_level_usage { #[test] fn max_usage_pct_pressure() { - use super::Usage as _; use std::time::Duration; + use utils::serde_percent::Percent; + use super::Usage as _; + let mut usage = Usage { config: &DiskUsageEvictionTaskConfig { max_usage_pct: Percent::new(85).unwrap(), diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 9f37fc32a3..dd5a24a41f 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -2,125 +2,83 @@ //! Management HTTP API //! 
use std::cmp::Reverse; -use std::collections::BinaryHeap; -use std::collections::HashMap; +use std::collections::{BinaryHeap, HashMap}; use std::str::FromStr; use std::sync::Arc; use std::time::Duration; -use anyhow::{anyhow, Context, Result}; +use anyhow::{Context, Result, anyhow}; use enumset::EnumSet; use futures::future::join_all; -use futures::StreamExt; -use futures::TryFutureExt; +use futures::{StreamExt, TryFutureExt}; use http_utils::endpoint::{ - profile_cpu_handler, profile_heap_handler, prometheus_metrics_handler, request_span, + self, attach_openapi_ui, auth_middleware, check_permission_with, profile_cpu_handler, + profile_heap_handler, prometheus_metrics_handler, request_span, }; +use http_utils::error::{ApiError, HttpErrorBody}; use http_utils::failpoints::failpoints_handler; -use http_utils::request::must_parse_query_param; -use http_utils::request::{get_request_param, must_get_query_param, parse_query_param}; +use http_utils::json::{json_request, json_request_maybe, json_response}; +use http_utils::request::{ + get_request_param, must_get_query_param, must_parse_query_param, parse_query_param, + parse_request_param, +}; +use http_utils::{RequestExt, RouterBuilder}; use humantime::format_rfc3339; -use hyper::header; -use hyper::StatusCode; -use hyper::{Body, Request, Response, Uri}; +use hyper::{Body, Request, Response, StatusCode, Uri, header}; use metrics::launch_timestamp::LaunchTimestamp; use pageserver_api::models::virtual_file::IoMode; -use pageserver_api::models::DownloadRemoteLayersTaskSpawnRequest; -use pageserver_api::models::IngestAuxFilesRequest; -use pageserver_api::models::ListAuxFilesRequest; -use pageserver_api::models::LocationConfig; -use pageserver_api::models::LocationConfigListResponse; -use pageserver_api::models::LocationConfigMode; -use pageserver_api::models::LsnLease; -use pageserver_api::models::LsnLeaseRequest; -use pageserver_api::models::OffloadedTimelineInfo; -use pageserver_api::models::PageTraceEvent; -use pageserver_api::models::ShardParameters; -use pageserver_api::models::TenantConfigPatchRequest; -use pageserver_api::models::TenantDetails; -use pageserver_api::models::TenantLocationConfigRequest; -use pageserver_api::models::TenantLocationConfigResponse; -use pageserver_api::models::TenantScanRemoteStorageResponse; -use pageserver_api::models::TenantScanRemoteStorageShard; -use pageserver_api::models::TenantShardLocation; -use pageserver_api::models::TenantShardSplitRequest; -use pageserver_api::models::TenantShardSplitResponse; -use pageserver_api::models::TenantSorting; -use pageserver_api::models::TenantState; -use pageserver_api::models::TenantWaitLsnRequest; -use pageserver_api::models::TimelineArchivalConfigRequest; -use pageserver_api::models::TimelineCreateRequestMode; -use pageserver_api::models::TimelineCreateRequestModeImportPgdata; -use pageserver_api::models::TimelinesInfoAndOffloaded; -use pageserver_api::models::TopTenantShardItem; -use pageserver_api::models::TopTenantShardsRequest; -use pageserver_api::models::TopTenantShardsResponse; -use pageserver_api::shard::ShardCount; -use pageserver_api::shard::TenantShardId; -use remote_storage::DownloadError; -use remote_storage::GenericRemoteStorage; -use remote_storage::TimeTravelError; +use pageserver_api::models::{ + DownloadRemoteLayersTaskSpawnRequest, IngestAuxFilesRequest, ListAuxFilesRequest, + LocationConfig, LocationConfigListResponse, LocationConfigMode, LsnLease, LsnLeaseRequest, + OffloadedTimelineInfo, PageTraceEvent, ShardParameters, StatusResponse, + 
TenantConfigPatchRequest, TenantConfigRequest, TenantDetails, TenantInfo, + TenantLocationConfigRequest, TenantLocationConfigResponse, TenantScanRemoteStorageResponse, + TenantScanRemoteStorageShard, TenantShardLocation, TenantShardSplitRequest, + TenantShardSplitResponse, TenantSorting, TenantState, TenantWaitLsnRequest, + TimelineArchivalConfigRequest, TimelineCreateRequest, TimelineCreateRequestMode, + TimelineCreateRequestModeImportPgdata, TimelineGcRequest, TimelineInfo, + TimelinesInfoAndOffloaded, TopTenantShardItem, TopTenantShardsRequest, TopTenantShardsResponse, +}; +use pageserver_api::shard::{ShardCount, TenantShardId}; +use remote_storage::{DownloadError, GenericRemoteStorage, TimeTravelError}; use scopeguard::defer; -use tenant_size_model::{svg::SvgBranchKind, SizeResult, StorageModel}; +use tenant_size_model::svg::SvgBranchKind; +use tenant_size_model::{SizeResult, StorageModel}; use tokio::time::Instant; use tokio_util::io::StreamReader; use tokio_util::sync::CancellationToken; use tracing::*; +use utils::auth::SwappableJwtAuth; +use utils::generation::Generation; +use utils::id::{TenantId, TimelineId}; +use utils::lsn::Lsn; use crate::config::PageServerConf; -use crate::context::RequestContextBuilder; -use crate::context::{DownloadBehavior, RequestContext}; +use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder}; use crate::deletion_queue::DeletionQueueClient; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::task_mgr::TaskKind; use crate::tenant::config::{LocationConf, TenantConfOpt}; -use crate::tenant::mgr::GetActiveTenantError; use crate::tenant::mgr::{ - GetTenantError, TenantManager, TenantMapError, TenantMapInsertError, TenantSlotError, - TenantSlotUpsertError, TenantStateError, + GetActiveTenantError, GetTenantError, TenantManager, TenantMapError, TenantMapInsertError, + TenantSlot, TenantSlotError, TenantSlotUpsertError, TenantStateError, UpsertLocationError, +}; +use crate::tenant::remote_timeline_client::{ + download_index_part, list_remote_tenant_shards, list_remote_timelines, }; -use crate::tenant::mgr::{TenantSlot, UpsertLocationError}; -use crate::tenant::remote_timeline_client; -use crate::tenant::remote_timeline_client::download_index_part; -use crate::tenant::remote_timeline_client::list_remote_tenant_shards; -use crate::tenant::remote_timeline_client::list_remote_timelines; use crate::tenant::secondary::SecondaryController; use crate::tenant::size::ModelInputs; -use crate::tenant::storage_layer::IoConcurrency; -use crate::tenant::storage_layer::LayerAccessStatsReset; -use crate::tenant::storage_layer::LayerName; -use crate::tenant::timeline::import_pgdata; -use crate::tenant::timeline::offload::offload_timeline; -use crate::tenant::timeline::offload::OffloadError; -use crate::tenant::timeline::CompactFlags; -use crate::tenant::timeline::CompactOptions; -use crate::tenant::timeline::CompactRequest; -use crate::tenant::timeline::CompactionError; -use crate::tenant::timeline::Timeline; -use crate::tenant::timeline::WaitLsnTimeout; -use crate::tenant::timeline::WaitLsnWaiter; -use crate::tenant::GetTimelineError; -use crate::tenant::OffloadedTimeline; -use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError}; -use crate::DEFAULT_PG_VERSION; -use crate::{disk_usage_eviction_task, tenant}; -use http_utils::{ - endpoint::{self, attach_openapi_ui, auth_middleware, check_permission_with}, - error::{ApiError, HttpErrorBody}, - json::{json_request, json_request_maybe, json_response}, - request::parse_request_param, - 
RequestExt, RouterBuilder, +use crate::tenant::storage_layer::{IoConcurrency, LayerAccessStatsReset, LayerName}; +use crate::tenant::timeline::offload::{OffloadError, offload_timeline}; +use crate::tenant::timeline::{ + CompactFlags, CompactOptions, CompactRequest, CompactionError, Timeline, WaitLsnTimeout, + WaitLsnWaiter, import_pgdata, }; -use pageserver_api::models::{ - StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest, TimelineGcRequest, - TimelineInfo, -}; -use utils::{ - auth::SwappableJwtAuth, - generation::Generation, - id::{TenantId, TimelineId}, - lsn::Lsn, +use crate::tenant::{ + GetTimelineError, LogicalSizeCalculationCause, OffloadedTimeline, PageReconstructError, + remote_timeline_client, }; +use crate::{DEFAULT_PG_VERSION, disk_usage_eviction_task, tenant}; // For APIs that require an Active tenant, how long should we block waiting for that state? // This is not functionally necessary (clients will retry), but avoids generating a lot of @@ -1128,12 +1086,12 @@ async fn tenant_list_handler( ApiError::ResourceUnavailable("Tenant map is initializing or shutting down".into()) })? .iter() - .map(|(id, state, gen)| TenantInfo { + .map(|(id, state, gen_)| TenantInfo { id: *id, state: state.clone(), current_physical_size: None, attachment_status: state.attachment_status(), - generation: (*gen) + generation: (*gen_) .into() .expect("Tenants are always attached with a generation"), gc_blocking: None, @@ -1670,9 +1628,8 @@ async fn block_or_unblock_gc( request: Request, block: bool, ) -> Result, ApiError> { - use crate::tenant::{ - remote_timeline_client::WaitCompletionError, upload_queue::NotInitialized, - }; + use crate::tenant::remote_timeline_client::WaitCompletionError; + use crate::tenant::upload_queue::NotInitialized; let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; @@ -2058,7 +2015,9 @@ async fn tenant_time_travel_remote_storage_handler( ))); } - tracing::info!("Issuing time travel request internally. timestamp={timestamp_raw}, done_if_after={done_if_after_raw}"); + tracing::info!( + "Issuing time travel request internally. 
timestamp={timestamp_raw}, done_if_after={done_if_after_raw}" + ); remote_timeline_client::upload::time_travel_recover_tenant( &state.remote_storage, @@ -2459,9 +2418,10 @@ async fn timeline_detach_ancestor_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { - use crate::tenant::timeline::detach_ancestor; use pageserver_api::models::detach_ancestor::AncestorDetached; + use crate::tenant::timeline::detach_ancestor; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; @@ -2806,14 +2766,19 @@ async fn tenant_scan_remote_handler( .await { Ok((index_part, index_generation, _index_mtime)) => { - tracing::info!("Found timeline {tenant_shard_id}/{timeline_id} metadata (gen {index_generation:?}, {} layers, {} consistent LSN)", - index_part.layer_metadata.len(), index_part.metadata.disk_consistent_lsn()); + tracing::info!( + "Found timeline {tenant_shard_id}/{timeline_id} metadata (gen {index_generation:?}, {} layers, {} consistent LSN)", + index_part.layer_metadata.len(), + index_part.metadata.disk_consistent_lsn() + ); generation = std::cmp::max(generation, index_generation); } Err(DownloadError::NotFound) => { // This is normal for tenants that were created with multiple shards: they have an unsharded path // containing the timeline's initdb tarball but no index. Otherwise it is a bit strange. - tracing::info!("Timeline path {tenant_shard_id}/{timeline_id} exists in remote storage but has no index, skipping"); + tracing::info!( + "Timeline path {tenant_shard_id}/{timeline_id} exists in remote storage but has no index, skipping" + ); continue; } Err(e) => { @@ -3432,7 +3397,9 @@ async fn read_tar_eof(mut reader: (impl tokio::io::AsyncRead + Unpin)) -> anyhow anyhow::bail!("unexpected non-zero bytes after the tar archive"); } if trailing_bytes % 512 != 0 { - anyhow::bail!("unexpected number of zeros ({trailing_bytes}), not divisible by tar block size (512 bytes), after the tar archive"); + anyhow::bail!( + "unexpected number of zeros ({trailing_bytes}), not divisible by tar block size (512 bytes), after the tar archive" + ); } Ok(()) } diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index a73fa5cec8..6dd005de50 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -4,14 +4,22 @@ //! 
use std::path::{Path, PathBuf}; -use anyhow::{bail, ensure, Context, Result}; +use anyhow::{Context, Result, bail, ensure}; use bytes::Bytes; use camino::Utf8Path; use futures::StreamExt; use pageserver_api::key::rel_block_to_key; +use pageserver_api::reltag::{RelTag, SlruKind}; +use postgres_ffi::relfile_utils::*; +use postgres_ffi::waldecoder::WalStreamDecoder; +use postgres_ffi::{ + BLCKSZ, ControlFileData, DBState_DB_SHUTDOWNED, Oid, WAL_SEGMENT_SIZE, XLogFileName, + pg_constants, +}; use tokio::io::{AsyncRead, AsyncReadExt}; use tokio_tar::Archive; use tracing::*; +use utils::lsn::Lsn; use wal_decoder::models::InterpretedWalRecord; use walkdir::WalkDir; @@ -20,16 +28,6 @@ use crate::metrics::WAL_INGEST; use crate::pgdatadir_mapping::*; use crate::tenant::Timeline; use crate::walingest::WalIngest; -use pageserver_api::reltag::{RelTag, SlruKind}; -use postgres_ffi::pg_constants; -use postgres_ffi::relfile_utils::*; -use postgres_ffi::waldecoder::WalStreamDecoder; -use postgres_ffi::ControlFileData; -use postgres_ffi::DBState_DB_SHUTDOWNED; -use postgres_ffi::Oid; -use postgres_ffi::XLogFileName; -use postgres_ffi::{BLCKSZ, WAL_SEGMENT_SIZE}; -use utils::lsn::Lsn; // Returns checkpoint LSN from controlfile pub fn get_lsn_from_controlfile(path: &Utf8Path) -> Result { diff --git a/pageserver/src/l0_flush.rs b/pageserver/src/l0_flush.rs index 491c9fb96c..6cfecef0cf 100644 --- a/pageserver/src/l0_flush.rs +++ b/pageserver/src/l0_flush.rs @@ -1,4 +1,5 @@ -use std::{num::NonZeroUsize, sync::Arc}; +use std::num::NonZeroUsize; +use std::sync::Arc; #[derive(Debug, PartialEq, Eq, Clone)] pub enum L0FlushConfig { diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index f43cd08cf7..02767055fb 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -15,7 +15,8 @@ pub mod l0_flush; extern crate hyper0 as hyper; -use futures::{stream::FuturesUnordered, StreamExt}; +use futures::StreamExt; +use futures::stream::FuturesUnordered; pub use pageserver_api::keyspace; use tokio_util::sync::CancellationToken; mod assert_u64_eq_usize; @@ -35,10 +36,8 @@ pub mod walredo; use camino::Utf8Path; use deletion_queue::DeletionQueue; -use tenant::{ - mgr::{BackgroundPurges, TenantManager}, - secondary, -}; +use tenant::mgr::{BackgroundPurges, TenantManager}; +use tenant::secondary; use tracing::{info, info_span}; /// Current storage format version @@ -350,9 +349,10 @@ async fn timed_after_cancellation( #[cfg(test)] mod timed_tests { - use super::timed; use std::time::Duration; + use super::timed; + #[tokio::test] async fn timed_completes_when_inner_future_completes() { // A future that completes on time should have its result returned diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index e1c26b0684..eb8a9b8e24 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -10,11 +10,11 @@ use std::time::{Duration, Instant}; use enum_map::{Enum as _, EnumMap}; use futures::Future; use metrics::{ + Counter, CounterVec, Gauge, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair, + IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec, register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec, register_int_counter, register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge, register_uint_gauge_vec, - Counter, CounterVec, Gauge, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair, - IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, 
UIntGauge, UIntGaugeVec, }; use once_cell::sync::Lazy; use pageserver_api::config::{ @@ -24,9 +24,8 @@ use pageserver_api::config::{ use pageserver_api::models::InMemoryLayerInfo; use pageserver_api::shard::TenantShardId; use pin_project_lite::pin_project; -use postgres_backend::{is_expected_io_error, QueryError}; +use postgres_backend::{QueryError, is_expected_io_error}; use pq_proto::framed::ConnectionError; - use strum::{EnumCount, IntoEnumIterator as _, VariantNames}; use strum_macros::{IntoStaticStr, VariantNames}; use utils::id::TimelineId; @@ -35,12 +34,12 @@ use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext}; use crate::pgdatadir_mapping::DatadirModificationStats; use crate::task_mgr::TaskKind; +use crate::tenant::Timeline; use crate::tenant::layer_map::LayerMap; use crate::tenant::mgr::TenantSlot; use crate::tenant::storage_layer::{InMemoryLayer, PersistentLayerDesc}; use crate::tenant::tasks::BackgroundLoopKind; use crate::tenant::throttle::ThrottleResult; -use crate::tenant::Timeline; /// Prometheus histogram buckets (in seconds) for operations in the critical /// path. In other words, operations that directly affect that latency of user @@ -363,7 +362,7 @@ pub(crate) static PAGE_CACHE_SIZE: Lazy = pub(crate) mod page_cache_eviction_metrics { use std::num::NonZeroUsize; - use metrics::{register_int_counter_vec, IntCounter, IntCounterVec}; + use metrics::{IntCounter, IntCounterVec, register_int_counter_vec}; use once_cell::sync::Lazy; #[derive(Clone, Copy)] @@ -722,7 +721,7 @@ pub(crate) static RELSIZE_CACHE_MISSES_OLD: Lazy = Lazy::new(|| { }); pub(crate) mod initial_logical_size { - use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec}; + use metrics::{IntCounter, IntCounterVec, register_int_counter, register_int_counter_vec}; use once_cell::sync::Lazy; pub(crate) struct StartCalculation(IntCounterVec); @@ -1105,12 +1104,17 @@ impl EvictionsWithLowResidenceDuration { // - future "drop panick => abort" // // so just nag: (the error has the labels) - tracing::warn!("failed to remove EvictionsWithLowResidenceDuration, it was already removed? {e:#?}"); + tracing::warn!( + "failed to remove EvictionsWithLowResidenceDuration, it was already removed? {e:#?}" + ); } Ok(()) => { // to help identify cases where we double-remove the same values, let's log all // deletions? - tracing::info!("removed EvictionsWithLowResidenceDuration with {tenant_id}, {timeline_id}, {}, {threshold}", self.data_source); + tracing::info!( + "removed EvictionsWithLowResidenceDuration with {tenant_id}, {timeline_id}, {}, {threshold}", + self.data_source + ); } } } @@ -3574,12 +3578,10 @@ impl>, O, E> Future for MeasuredRemoteOp { } pub mod tokio_epoll_uring { - use std::{ - collections::HashMap, - sync::{Arc, Mutex}, - }; + use std::collections::HashMap; + use std::sync::{Arc, Mutex}; - use metrics::{register_histogram, register_int_counter, Histogram, LocalHistogram, UIntGauge}; + use metrics::{Histogram, LocalHistogram, UIntGauge, register_histogram, register_int_counter}; use once_cell::sync::Lazy; /// Shared storage for tokio-epoll-uring thread local metrics. 
@@ -3588,7 +3590,9 @@ pub mod tokio_epoll_uring { let slots_submission_queue_depth = register_histogram!( "pageserver_tokio_epoll_uring_slots_submission_queue_depth", "The slots waiters queue depth of each tokio_epoll_uring system", - vec![1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0], + vec![ + 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0 + ], ) .expect("failed to define a metric"); ThreadLocalMetricsStorage { @@ -3765,7 +3769,7 @@ pub mod tokio_epoll_uring { } pub(crate) mod tenant_throttling { - use metrics::{register_int_counter_vec, IntCounter}; + use metrics::{IntCounter, register_int_counter_vec}; use once_cell::sync::Lazy; use utils::shard::TenantShardId; diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index 45bf02362a..984dd125a9 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -67,23 +67,18 @@ //! mapping is automatically removed and the slot is marked free. //! -use std::{ - collections::{hash_map::Entry, HashMap}, - sync::{ - atomic::{AtomicU64, AtomicU8, AtomicUsize, Ordering}, - Arc, Weak, - }, - time::Duration, -}; +use std::collections::HashMap; +use std::collections::hash_map::Entry; +use std::sync::atomic::{AtomicU8, AtomicU64, AtomicUsize, Ordering}; +use std::sync::{Arc, Weak}; +use std::time::Duration; use anyhow::Context; use once_cell::sync::OnceCell; -use crate::{ - context::RequestContext, - metrics::{page_cache_eviction_metrics, PageCacheSizeMetrics}, - virtual_file::{IoBufferMut, IoPageSlice}, -}; +use crate::context::RequestContext; +use crate::metrics::{PageCacheSizeMetrics, page_cache_eviction_metrics}; +use crate::virtual_file::{IoBufferMut, IoPageSlice}; static PAGE_CACHE: OnceCell = OnceCell::new(); const TEST_PAGE_CACHE_SIZE: usize = 50; @@ -168,11 +163,7 @@ impl Slot { let count_res = self.usage_count .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |val| { - if val == 0 { - None - } else { - Some(val - 1) - } + if val == 0 { None } else { Some(val - 1) } }); match count_res { diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 668f0eee36..8972515163 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1,7 +1,15 @@ //! The Page Service listens for client connections and serves their GetPage@LSN //! requests. 
-use anyhow::{bail, Context}; +use std::borrow::Cow; +use std::num::NonZeroUsize; +use std::os::fd::AsRawFd; +use std::str::FromStr; +use std::sync::Arc; +use std::time::{Duration, Instant, SystemTime}; +use std::{io, str}; + +use anyhow::{Context, bail}; use async_compression::tokio::write::GzipEncoder; use bytes::Buf; use futures::FutureExt; @@ -11,72 +19,57 @@ use pageserver_api::config::{ PageServicePipeliningConfig, PageServicePipeliningConfigPipelined, PageServiceProtocolPipelinedExecutionStrategy, }; -use pageserver_api::models::{self, TenantState}; +use pageserver_api::key::rel_block_to_key; use pageserver_api::models::{ - PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse, + self, PageTraceEvent, PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse, PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse, PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetSlruSegmentRequest, PagestreamGetSlruSegmentResponse, PagestreamNblocksRequest, PagestreamNblocksResponse, - PagestreamProtocolVersion, PagestreamRequest, + PagestreamProtocolVersion, PagestreamRequest, TenantState, }; +use pageserver_api::reltag::SlruKind; use pageserver_api::shard::TenantShardId; use postgres_backend::{ - is_expected_io_error, AuthType, PostgresBackend, PostgresBackendReader, QueryError, + AuthType, PostgresBackend, PostgresBackendReader, QueryError, is_expected_io_error, }; +use postgres_ffi::BLCKSZ; +use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID; use pq_proto::framed::ConnectionError; -use pq_proto::FeStartupPacket; -use pq_proto::{BeMessage, FeMessage, RowDescriptor}; -use std::borrow::Cow; -use std::io; -use std::num::NonZeroUsize; -use std::str; -use std::str::FromStr; -use std::sync::Arc; -use std::time::SystemTime; -use std::time::{Duration, Instant}; +use pq_proto::{BeMessage, FeMessage, FeStartupPacket, RowDescriptor}; use strum_macros::IntoStaticStr; -use tokio::io::{AsyncRead, AsyncWrite}; -use tokio::io::{AsyncWriteExt, BufWriter}; +use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt, BufWriter}; use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use tracing::*; +use utils::auth::{Claims, Scope, SwappableJwtAuth}; +use utils::failpoint_support; +use utils::id::{TenantId, TimelineId}; use utils::logging::log_slow; +use utils::lsn::Lsn; +use utils::simple_rcu::RcuReadGuard; use utils::sync::gate::{Gate, GateGuard}; use utils::sync::spsc_fold; -use utils::{ - auth::{Claims, Scope, SwappableJwtAuth}, - failpoint_support, - id::{TenantId, TimelineId}, - lsn::Lsn, - simple_rcu::RcuReadGuard, -}; use crate::auth::check_permission; use crate::basebackup::BasebackupError; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; -use crate::metrics::{self, SmgrOpTimer}; -use crate::metrics::{ComputeCommandKind, COMPUTE_COMMANDS_COUNTERS, LIVE_CONNECTIONS}; +use crate::metrics::{ + self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, LIVE_CONNECTIONS, SmgrOpTimer, +}; use crate::pgdatadir_mapping::Version; -use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; -use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id; -use crate::task_mgr::TaskKind; -use crate::task_mgr::{self, COMPUTE_REQUEST_RUNTIME}; -use crate::tenant::mgr::ShardSelector; -use crate::tenant::mgr::TenantManager; -use crate::tenant::mgr::{GetActiveTenantError, GetTenantError, ShardResolveResult}; +use crate::span::{ + debug_assert_current_span_has_tenant_and_timeline_id, + 
debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id, +}; +use crate::task_mgr::{self, COMPUTE_REQUEST_RUNTIME, TaskKind}; +use crate::tenant::mgr::{ + GetActiveTenantError, GetTenantError, ShardResolveResult, ShardSelector, TenantManager, +}; use crate::tenant::storage_layer::IoConcurrency; use crate::tenant::timeline::{self, WaitLsnError}; -use crate::tenant::GetTimelineError; -use crate::tenant::PageReconstructError; -use crate::tenant::Timeline; +use crate::tenant::{GetTimelineError, PageReconstructError, Timeline}; use crate::{basebackup, timed_after_cancellation}; -use pageserver_api::key::rel_block_to_key; -use pageserver_api::models::PageTraceEvent; -use pageserver_api::reltag::SlruKind; -use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID; -use postgres_ffi::BLCKSZ; -use std::os::fd::AsRawFd; /// How long we may wait for a [`crate::tenant::mgr::TenantSlot::InProgress`]` and/or a [`crate::tenant::Tenant`] which /// is not yet in state [`TenantState::Active`]. @@ -986,7 +979,7 @@ impl PageServerHandler { Ok(BatchedFeMessage::GetPage { span: _, shard: accum_shard, - pages: ref mut accum_pages, + pages: accum_pages, effective_request_lsn: accum_lsn, }), BatchedFeMessage::GetPage { @@ -1236,12 +1229,13 @@ impl PageServerHandler { } => { fail::fail_point!("ps::handle-pagerequest-message::exists"); ( - vec![self - .handle_get_rel_exists_request(&*shard.upgrade()?, &req, ctx) - .instrument(span.clone()) - .await - .map(|msg| (msg, timer)) - .map_err(|err| BatchedPageStreamError { err, req: req.hdr })], + vec![ + self.handle_get_rel_exists_request(&*shard.upgrade()?, &req, ctx) + .instrument(span.clone()) + .await + .map(|msg| (msg, timer)) + .map_err(|err| BatchedPageStreamError { err, req: req.hdr }), + ], span, ) } @@ -1253,12 +1247,13 @@ impl PageServerHandler { } => { fail::fail_point!("ps::handle-pagerequest-message::nblocks"); ( - vec![self - .handle_get_nblocks_request(&*shard.upgrade()?, &req, ctx) - .instrument(span.clone()) - .await - .map(|msg| (msg, timer)) - .map_err(|err| BatchedPageStreamError { err, req: req.hdr })], + vec![ + self.handle_get_nblocks_request(&*shard.upgrade()?, &req, ctx) + .instrument(span.clone()) + .await + .map(|msg| (msg, timer)) + .map_err(|err| BatchedPageStreamError { err, req: req.hdr }), + ], span, ) } @@ -1297,12 +1292,13 @@ impl PageServerHandler { } => { fail::fail_point!("ps::handle-pagerequest-message::dbsize"); ( - vec![self - .handle_db_size_request(&*shard.upgrade()?, &req, ctx) - .instrument(span.clone()) - .await - .map(|msg| (msg, timer)) - .map_err(|err| BatchedPageStreamError { err, req: req.hdr })], + vec![ + self.handle_db_size_request(&*shard.upgrade()?, &req, ctx) + .instrument(span.clone()) + .await + .map(|msg| (msg, timer)) + .map_err(|err| BatchedPageStreamError { err, req: req.hdr }), + ], span, ) } @@ -1314,12 +1310,13 @@ impl PageServerHandler { } => { fail::fail_point!("ps::handle-pagerequest-message::slrusegment"); ( - vec![self - .handle_get_slru_segment_request(&*shard.upgrade()?, &req, ctx) - .instrument(span.clone()) - .await - .map(|msg| (msg, timer)) - .map_err(|err| BatchedPageStreamError { err, req: req.hdr })], + vec![ + self.handle_get_slru_segment_request(&*shard.upgrade()?, &req, ctx) + .instrument(span.clone()) + .await + .map(|msg| (msg, timer)) + .map_err(|err| BatchedPageStreamError { err, req: req.hdr }), + ], span, ) } @@ -2112,7 +2109,9 @@ impl PageServerHandler { set_tracing_field_shard_id(&timeline); if timeline.is_archived() == Some(true) { - tracing::info!("timeline 
{tenant_id}/{timeline_id} is archived, but got basebackup request for it."); + tracing::info!( + "timeline {tenant_id}/{timeline_id} is archived, but got basebackup request for it." + ); return Err(QueryError::NotFound("timeline is archived".into())); } diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index d0e2dab042..787b1b895c 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -6,6 +6,36 @@ //! walingest.rs handles a few things like implicit relation creation and extension. //! Clarify that) //! +use std::collections::{BTreeMap, HashMap, HashSet, hash_map}; +use std::ops::{ControlFlow, Range}; + +use anyhow::{Context, ensure}; +use bytes::{Buf, Bytes, BytesMut}; +use enum_map::Enum; +use itertools::Itertools; +use pageserver_api::key::{ + AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, CompactKey, DBDIR_KEY, Key, RelDirExists, + TWOPHASEDIR_KEY, dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, + rel_size_to_key, rel_tag_sparse_key, rel_tag_sparse_key_range, relmap_file_key, + repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key, + slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range, +}; +use pageserver_api::keyspace::SparseKeySpace; +use pageserver_api::record::NeonWalRecord; +use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; +use pageserver_api::shard::ShardIdentity; +use pageserver_api::value::Value; +use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; +use postgres_ffi::{BLCKSZ, Oid, RepOriginId, TimestampTz, TransactionId}; +use serde::{Deserialize, Serialize}; +use strum::IntoEnumIterator; +use tokio_util::sync::CancellationToken; +use tracing::{debug, info, trace, warn}; +use utils::bin_ser::{BeSer, DeserializeError}; +use utils::lsn::Lsn; +use utils::pausable_failpoint; +use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta}; + use super::tenant::{PageReconstructError, Timeline}; use crate::aux_file; use crate::context::RequestContext; @@ -19,37 +49,6 @@ use crate::span::{ }; use crate::tenant::storage_layer::IoConcurrency; use crate::tenant::timeline::GetVectoredError; -use anyhow::{ensure, Context}; -use bytes::{Buf, Bytes, BytesMut}; -use enum_map::Enum; -use itertools::Itertools; -use pageserver_api::key::{ - dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key, - rel_tag_sparse_key_range, relmap_file_key, repl_origin_key, repl_origin_key_range, - slru_block_to_key, slru_dir_to_key, slru_segment_key_range, slru_segment_size_to_key, - twophase_file_key, twophase_key_range, CompactKey, RelDirExists, AUX_FILES_KEY, CHECKPOINT_KEY, - CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY, -}; -use pageserver_api::key::{rel_tag_sparse_key, Key}; -use pageserver_api::keyspace::SparseKeySpace; -use pageserver_api::record::NeonWalRecord; -use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; -use pageserver_api::shard::ShardIdentity; -use pageserver_api::value::Value; -use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; -use postgres_ffi::BLCKSZ; -use postgres_ffi::{Oid, RepOriginId, TimestampTz, TransactionId}; -use serde::{Deserialize, Serialize}; -use std::collections::{hash_map, BTreeMap, HashMap, HashSet}; -use std::ops::ControlFlow; -use std::ops::Range; -use strum::IntoEnumIterator; -use tokio_util::sync::CancellationToken; -use tracing::{debug, info, trace, warn}; -use utils::bin_ser::DeserializeError; -use utils::pausable_failpoint; 
-use utils::{bin_ser::BeSer, lsn::Lsn}; -use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta}; /// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached. pub const MAX_AUX_FILE_DELTAS: usize = 1024; @@ -327,16 +326,16 @@ impl Timeline { let clone = match &res { Ok(buf) => Ok(buf.clone()), Err(err) => Err(match err { - PageReconstructError::Cancelled => { - PageReconstructError::Cancelled - } + PageReconstructError::Cancelled => PageReconstructError::Cancelled, - x @ PageReconstructError::Other(_) | - x @ PageReconstructError::AncestorLsnTimeout(_) | - x @ PageReconstructError::WalRedo(_) | - x @ PageReconstructError::MissingKey(_) => { - PageReconstructError::Other(anyhow::anyhow!("there was more than one request for this key in the batch, error logged once: {x:?}")) - }, + x @ PageReconstructError::Other(_) + | x @ PageReconstructError::AncestorLsnTimeout(_) + | x @ PageReconstructError::WalRedo(_) + | x @ PageReconstructError::MissingKey(_) => { + PageReconstructError::Other(anyhow::anyhow!( + "there was more than one request for this key in the batch, error logged once: {x:?}" + )) + } }), }; @@ -355,23 +354,23 @@ impl Timeline { // this whole `match` is a lot like `From for PageReconstructError` // but without taking ownership of the GetVectoredError let err = match &err { - GetVectoredError::Cancelled => { - Err(PageReconstructError::Cancelled) - } + GetVectoredError::Cancelled => Err(PageReconstructError::Cancelled), // TODO: restructure get_vectored API to make this error per-key GetVectoredError::MissingKey(err) => { - Err(PageReconstructError::Other(anyhow::anyhow!("whole vectored get request failed because one or more of the requested keys were missing: {err:?}"))) + Err(PageReconstructError::Other(anyhow::anyhow!( + "whole vectored get request failed because one or more of the requested keys were missing: {err:?}" + ))) } // TODO: restructure get_vectored API to make this error per-key GetVectoredError::GetReadyAncestorError(err) => { - Err(PageReconstructError::Other(anyhow::anyhow!("whole vectored get request failed because one or more key required ancestor that wasn't ready: {err:?}"))) + Err(PageReconstructError::Other(anyhow::anyhow!( + "whole vectored get request failed because one or more key required ancestor that wasn't ready: {err:?}" + ))) } // TODO: restructure get_vectored API to make this error per-key - GetVectoredError::Other(err) => { - Err(PageReconstructError::Other( - anyhow::anyhow!("whole vectored get request failed: {err:?}"), - )) - } + GetVectoredError::Other(err) => Err(PageReconstructError::Other( + anyhow::anyhow!("whole vectored get request failed: {err:?}"), + )), // TODO: we can prevent this error class by moving this check into the type system GetVectoredError::InvalidLsn(e) => { Err(anyhow::anyhow!("invalid LSN: {e:?}").into()) @@ -379,10 +378,7 @@ impl Timeline { // NB: this should never happen in practice because we limit MAX_GET_VECTORED_KEYS // TODO: we can prevent this error class by moving this check into the type system GetVectoredError::Oversized(err) => { - Err(anyhow::anyhow!( - "batching oversized: {err:?}" - ) - .into()) + Err(anyhow::anyhow!("batching oversized: {err:?}").into()) } }; @@ -715,7 +711,10 @@ impl Timeline { { Ok(res) => res, Err(PageReconstructError::MissingKey(e)) => { - warn!("Missing key while find_lsn_for_timestamp. Either we might have already garbage-collected that data or the key is really missing. 
Last error: {:#}", e); + warn!( + "Missing key while find_lsn_for_timestamp. Either we might have already garbage-collected that data or the key is really missing. Last error: {:#}", + e + ); // Return that we didn't find any requests smaller than the LSN, and logging the error. return Ok(LsnForTimestamp::Past(min_lsn)); } @@ -2464,10 +2463,12 @@ impl DatadirModification<'_> { // modifications before ingesting DB create operations, which are the only kind that reads // data pages during ingest. if cfg!(debug_assertions) { - assert!(!self - .pending_data_batch - .as_ref() - .is_some_and(|b| b.updates_key(&key))); + assert!( + !self + .pending_data_batch + .as_ref() + .is_some_and(|b| b.updates_key(&key)) + ); } } @@ -2666,15 +2667,14 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]); #[cfg(test)] mod tests { use hex_literal::hex; - use pageserver_api::{models::ShardParameters, shard::ShardStripeSize}; - use utils::{ - id::TimelineId, - shard::{ShardCount, ShardNumber}, - }; + use pageserver_api::models::ShardParameters; + use pageserver_api::shard::ShardStripeSize; + use utils::id::TimelineId; + use utils::shard::{ShardCount, ShardNumber}; use super::*; - - use crate::{tenant::harness::TenantHarness, DEFAULT_PG_VERSION}; + use crate::DEFAULT_PG_VERSION; + use crate::tenant::harness::TenantHarness; /// Test a round trip of aux file updates, from DatadirModification to reading back from the Timeline #[tokio::test] diff --git a/pageserver/src/statvfs.rs b/pageserver/src/statvfs.rs index 4e8be58d58..85c2ed8499 100644 --- a/pageserver/src/statvfs.rs +++ b/pageserver/src/statvfs.rs @@ -73,11 +73,10 @@ impl Statvfs { pub mod mock { use camino::Utf8Path; + pub use pageserver_api::config::statvfs::mock::Behavior; use regex::Regex; use tracing::log::info; - pub use pageserver_api::config::statvfs::mock::Behavior; - pub fn get(tenants_dir: &Utf8Path, behavior: &Behavior) -> nix::Result { info!("running mocked statvfs"); @@ -85,7 +84,7 @@ pub mod mock { Behavior::Success { blocksize, total_blocks, - ref name_filter, + name_filter, } => { let used_bytes = walk_dir_disk_usage(tenants_dir, name_filter.as_deref()).unwrap(); @@ -134,7 +133,7 @@ pub mod mock { } Err(e) => { return Err(anyhow::Error::new(e) - .context(format!("get metadata of {:?}", entry.path()))) + .context(format!("get metadata of {:?}", entry.path()))); } }; total += m.len(); diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index cc93a06ccd..0b71b2cf5b 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -40,15 +40,12 @@ use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::{Arc, Mutex}; use futures::FutureExt; +use once_cell::sync::Lazy; use pageserver_api::shard::TenantShardId; use tokio::task::JoinHandle; use tokio::task_local; use tokio_util::sync::CancellationToken; - use tracing::{debug, error, info, warn}; - -use once_cell::sync::Lazy; - use utils::env; use utils::id::TimelineId; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 46f9c9a427..71dc3c9075 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -12,150 +12,99 @@ //! parent timeline, and the last LSN that has been written to disk. //! 
-use anyhow::{bail, Context}; +use std::collections::hash_map::Entry; +use std::collections::{BTreeMap, HashMap, HashSet}; +use std::fmt::{Debug, Display}; +use std::fs::File; +use std::future::Future; +use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; +use std::sync::{Arc, Mutex, Weak}; +use std::time::{Duration, Instant, SystemTime}; +use std::{fmt, fs}; + +use anyhow::{Context, bail}; use arc_swap::ArcSwap; -use camino::Utf8Path; -use camino::Utf8PathBuf; +use camino::{Utf8Path, Utf8PathBuf}; use chrono::NaiveDateTime; use enumset::EnumSet; -use futures::stream::FuturesUnordered; use futures::StreamExt; +use futures::stream::FuturesUnordered; use itertools::Itertools as _; +use once_cell::sync::Lazy; use pageserver_api::models; -use pageserver_api::models::CompactInfoResponse; -use pageserver_api::models::LsnLease; -use pageserver_api::models::TimelineArchivalState; -use pageserver_api::models::TimelineState; -use pageserver_api::models::TopTenantShardItem; -use pageserver_api::models::WalRedoManagerStatus; -use pageserver_api::shard::ShardIdentity; -use pageserver_api::shard::ShardStripeSize; -use pageserver_api::shard::TenantShardId; -use remote_storage::DownloadError; -use remote_storage::GenericRemoteStorage; -use remote_storage::TimeoutOrCancel; +pub use pageserver_api::models::TenantState; +use pageserver_api::models::{ + CompactInfoResponse, LsnLease, TimelineArchivalState, TimelineState, TopTenantShardItem, + WalRedoManagerStatus, +}; +use pageserver_api::shard::{ShardIdentity, ShardStripeSize, TenantShardId}; +use remote_storage::{DownloadError, GenericRemoteStorage, TimeoutOrCancel}; use remote_timeline_client::index::GcCompactionState; use remote_timeline_client::manifest::{ - OffloadedTimelineManifest, TenantManifest, LATEST_TENANT_MANIFEST_VERSION, + LATEST_TENANT_MANIFEST_VERSION, OffloadedTimelineManifest, TenantManifest, }; -use remote_timeline_client::UploadQueueNotReadyError; -use remote_timeline_client::FAILED_REMOTE_OP_RETRIES; -use remote_timeline_client::FAILED_UPLOAD_WARN_THRESHOLD; -use secondary::heatmap::HeatMapTenant; -use secondary::heatmap::HeatMapTimeline; -use std::collections::BTreeMap; -use std::fmt; -use std::future::Future; -use std::sync::atomic::AtomicBool; -use std::sync::Weak; -use std::time::SystemTime; +use remote_timeline_client::{ + FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD, UploadQueueNotReadyError, +}; +use secondary::heatmap::{HeatMapTenant, HeatMapTimeline}; use storage_broker::BrokerClientChannel; -use timeline::compaction::CompactionOutcome; -use timeline::compaction::GcCompactionQueue; -use timeline::import_pgdata; -use timeline::offload::offload_timeline; -use timeline::offload::OffloadError; -use timeline::CompactFlags; -use timeline::CompactOptions; -use timeline::CompactionError; -use timeline::PreviousHeatmap; -use timeline::ShutdownMode; +use timeline::compaction::{CompactionOutcome, GcCompactionQueue}; +use timeline::offload::{OffloadError, offload_timeline}; +use timeline::{ + CompactFlags, CompactOptions, CompactionError, PreviousHeatmap, ShutdownMode, import_pgdata, +}; use tokio::io::BufReader; -use tokio::sync::watch; -use tokio::sync::Notify; +use tokio::sync::{Notify, Semaphore, watch}; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::*; use upload_queue::NotInitialized; -use utils::backoff; use utils::circuit_breaker::CircuitBreaker; -use utils::completion; use utils::crashsafe::path_with_suffix_extension; -use utils::failpoint_support; -use utils::fs_ext; -use 
utils::pausable_failpoint; -use utils::sync::gate::Gate; -use utils::sync::gate::GateGuard; -use utils::timeout::timeout_cancellable; -use utils::timeout::TimeoutCancellableError; +use utils::sync::gate::{Gate, GateGuard}; +use utils::timeout::{TimeoutCancellableError, timeout_cancellable}; use utils::try_rcu::ArcSwapExt; -use utils::zstd::create_zst_tarball; -use utils::zstd::extract_zst_tarball; +use utils::zstd::{create_zst_tarball, extract_zst_tarball}; +use utils::{backoff, completion, failpoint_support, fs_ext, pausable_failpoint}; -use self::config::AttachedLocationConfig; -use self::config::AttachmentMode; -use self::config::LocationConf; -use self::config::TenantConf; +use self::config::{AttachedLocationConfig, AttachmentMode, LocationConf, TenantConf}; use self::metadata::TimelineMetadata; -use self::mgr::GetActiveTenantError; -use self::mgr::GetTenantError; +use self::mgr::{GetActiveTenantError, GetTenantError}; use self::remote_timeline_client::upload::{upload_index_part, upload_tenant_manifest}; use self::remote_timeline_client::{RemoteTimelineClient, WaitCompletionError}; -use self::timeline::uninit::TimelineCreateGuard; -use self::timeline::uninit::TimelineExclusionError; -use self::timeline::uninit::UninitializedTimeline; -use self::timeline::EvictionTaskTenantState; -use self::timeline::GcCutoffs; -use self::timeline::TimelineDeleteProgress; -use self::timeline::TimelineResources; -use self::timeline::WaitLsnError; +use self::timeline::uninit::{TimelineCreateGuard, TimelineExclusionError, UninitializedTimeline}; +use self::timeline::{ + EvictionTaskTenantState, GcCutoffs, TimelineDeleteProgress, TimelineResources, WaitLsnError, +}; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; -use crate::deletion_queue::DeletionQueueClient; -use crate::deletion_queue::DeletionQueueError; -use crate::import_datadir; +use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError}; use crate::l0_flush::L0FlushGlobalState; -use crate::metrics::CONCURRENT_INITDBS; -use crate::metrics::INITDB_RUN_TIME; -use crate::metrics::INITDB_SEMAPHORE_ACQUISITION_TIME; -use crate::metrics::TENANT; use crate::metrics::{ - remove_tenant_metrics, BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN, - TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC, + BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN, CONCURRENT_INITDBS, + INITDB_RUN_TIME, INITDB_SEMAPHORE_ACQUISITION_TIME, TENANT, TENANT_STATE_METRIC, + TENANT_SYNTHETIC_SIZE_METRIC, remove_tenant_metrics, }; -use crate::task_mgr; use crate::task_mgr::TaskKind; -use crate::tenant::config::LocationMode; -use crate::tenant::config::TenantConfOpt; +use crate::tenant::config::{LocationMode, TenantConfOpt}; use crate::tenant::gc_result::GcResult; pub use crate::tenant::remote_timeline_client::index::IndexPart; -use crate::tenant::remote_timeline_client::remote_initdb_archive_path; -use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart; -use crate::tenant::remote_timeline_client::INITDB_PATH; -use crate::tenant::storage_layer::DeltaLayer; -use crate::tenant::storage_layer::ImageLayer; -use crate::walingest::WalLagCooldown; -use crate::walredo; -use crate::InitializationOrder; -use std::collections::hash_map::Entry; -use std::collections::HashMap; -use std::collections::HashSet; -use std::fmt::Debug; -use std::fmt::Display; -use std::fs; -use std::fs::File; -use std::sync::atomic::{AtomicU64, Ordering}; -use std::sync::Arc; -use std::sync::Mutex; -use 
std::time::{Duration, Instant}; - -use crate::span; +use crate::tenant::remote_timeline_client::{ + INITDB_PATH, MaybeDeletedIndexPart, remote_initdb_archive_path, +}; +use crate::tenant::storage_layer::{DeltaLayer, ImageLayer}; use crate::tenant::timeline::delete::DeleteTimelineFlow; use crate::tenant::timeline::uninit::cleanup_timeline_directory; use crate::virtual_file::VirtualFile; +use crate::walingest::WalLagCooldown; use crate::walredo::PostgresRedoManager; -use crate::TEMP_FILE_SUFFIX; -use once_cell::sync::Lazy; -pub use pageserver_api::models::TenantState; -use tokio::sync::Semaphore; +use crate::{InitializationOrder, TEMP_FILE_SUFFIX, import_datadir, span, task_mgr, walredo}; static INIT_DB_SEMAPHORE: Lazy = Lazy::new(|| Semaphore::new(8)); -use utils::{ - crashsafe, - generation::Generation, - id::TimelineId, - lsn::{Lsn, RecordLsn}, -}; +use utils::crashsafe; +use utils::generation::Generation; +use utils::id::TimelineId; +use utils::lsn::{Lsn, RecordLsn}; pub mod blob_io; pub mod block_io; @@ -184,9 +133,9 @@ mod gc_block; mod gc_result; pub(crate) mod throttle; -pub(crate) use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; pub(crate) use timeline::{LogicalSizeCalculationCause, PageReconstructError, Timeline}; +pub(crate) use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; // re-export for use in walreceiver pub use crate::tenant::timeline::WalReceiverInfo; @@ -251,7 +200,9 @@ impl AttachedTenantConf { Ok(Self::new(location_conf.tenant_conf, *attach_conf)) } LocationMode::Secondary(_) => { - anyhow::bail!("Attempted to construct AttachedTenantConf from a LocationConf in secondary mode") + anyhow::bail!( + "Attempted to construct AttachedTenantConf from a LocationConf in secondary mode" + ) } } } @@ -465,7 +416,9 @@ impl WalredoManagerId { static NEXT: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(1); let id = NEXT.fetch_add(1, std::sync::atomic::Ordering::Relaxed); if id == 0 { - panic!("WalredoManagerId::new() returned 0, indicating wraparound, risking it's no longer unique"); + panic!( + "WalredoManagerId::new() returned 0, indicating wraparound, risking it's no longer unique" + ); } Self(id) } @@ -1229,7 +1182,9 @@ impl Tenant { match cause { LoadTimelineCause::Attach | LoadTimelineCause::Unoffload => (), LoadTimelineCause::ImportPgdata { .. 
} => { - unreachable!("ImportPgdata should not be reloading timeline import is done and persisted as such in s3") + unreachable!( + "ImportPgdata should not be reloading timeline import is done and persisted as such in s3" + ) } } let mut guard = self.timelines_creating.lock().unwrap(); @@ -1262,8 +1217,8 @@ impl Tenant { // We should never try and load the same timeline twice during startup Entry::Occupied(_) => { unreachable!( - "Timeline {tenant_id}/{timeline_id} already exists in the tenant map" - ); + "Timeline {tenant_id}/{timeline_id} already exists in the tenant map" + ); } Entry::Vacant(v) => { v.insert(Arc::clone(&timeline)); @@ -1657,7 +1612,9 @@ impl Tenant { failpoint_support::sleep_millis_async!("before-attaching-tenant"); let Some(preload) = preload else { - anyhow::bail!("local-only deployment is no longer supported, https://github.com/neondatabase/neon/issues/5624"); + anyhow::bail!( + "local-only deployment is no longer supported, https://github.com/neondatabase/neon/issues/5624" + ); }; let mut offloaded_timeline_ids = HashSet::new(); @@ -2041,7 +1998,7 @@ impl Tenant { remote_storage: GenericRemoteStorage, previous_heatmap: Option, cancel: CancellationToken, - ) -> impl Future { + ) -> impl Future + use<> { let client = self.build_timeline_client(timeline_id, remote_storage); async move { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -2736,7 +2693,9 @@ impl Tenant { timeline } CreateTimelineResult::ImportSpawned(timeline) => { - info!("import task spawned, timeline will become visible and activated once the import is done"); + info!( + "import task spawned, timeline will become visible and activated once the import is done" + ); timeline } }; @@ -2782,7 +2741,7 @@ impl Tenant { { StartCreatingTimelineResult::CreateGuard(guard) => guard, StartCreatingTimelineResult::Idempotent(timeline) => { - return Ok(CreateTimelineResult::Idempotent(timeline)) + return Ok(CreateTimelineResult::Idempotent(timeline)); } }; @@ -2916,7 +2875,9 @@ impl Tenant { let index_part = match index_part { MaybeDeletedIndexPart::Deleted(_) => { // likely concurrent delete call, cplane should prevent this - anyhow::bail!("index part says deleted but we are not done creating yet, this should not happen but") + anyhow::bail!( + "index part says deleted but we are not done creating yet, this should not happen but" + ) } MaybeDeletedIndexPart::IndexPart(p) => p, }; @@ -3907,7 +3868,9 @@ where if !later.is_empty() { for (missing_id, orphan_ids) in later { for (orphan_id, _) in orphan_ids { - error!("could not load timeline {orphan_id} because its ancestor timeline {missing_id} could not be loaded"); + error!( + "could not load timeline {orphan_id} because its ancestor timeline {missing_id} could not be loaded" + ); } } bail!("could not load tenant because some timelines are missing ancestors"); @@ -4827,7 +4790,10 @@ impl Tenant { let gc_info = src_timeline.gc_info.read().unwrap(); let planned_cutoff = gc_info.min_cutoff(); if gc_info.lsn_covered_by_lease(start_lsn) { - tracing::info!("skipping comparison of {start_lsn} with gc cutoff {} and planned gc cutoff {planned_cutoff} due to lsn lease", *applied_gc_cutoff_lsn); + tracing::info!( + "skipping comparison of {start_lsn} with gc cutoff {} and planned gc cutoff {planned_cutoff} due to lsn lease", + *applied_gc_cutoff_lsn + ); } else { src_timeline .check_lsn_is_in_scope(start_lsn, &applied_gc_cutoff_lsn) @@ -4973,7 +4939,9 @@ impl Tenant { } // Idempotent <=> CreateTimelineIdempotency is identical (x, y) if x == y => { - info!("timeline 
already exists and idempotency matches, succeeding request"); + info!( + "timeline already exists and idempotency matches, succeeding request" + ); // fallthrough } (_, _) => { @@ -5055,7 +5023,7 @@ impl Tenant { { StartCreatingTimelineResult::CreateGuard(guard) => guard, StartCreatingTimelineResult::Idempotent(timeline) => { - return Ok(CreateTimelineResult::Idempotent(timeline)) + return Ok(CreateTimelineResult::Idempotent(timeline)); } }; @@ -5260,7 +5228,9 @@ impl Tenant { .create_timeline_files(&create_guard.timeline_path) .await { - error!("Failed to create initial files for timeline {tenant_shard_id}/{new_timeline_id}, cleaning up: {e:?}"); + error!( + "Failed to create initial files for timeline {tenant_shard_id}/{new_timeline_id}, cleaning up: {e:?}" + ); cleanup_timeline_directory(create_guard); return Err(e); } @@ -5625,20 +5595,19 @@ pub async fn dump_layerfile_from_path( #[cfg(test)] pub(crate) mod harness { use bytes::{Bytes, BytesMut}; + use hex_literal::hex; use once_cell::sync::OnceCell; + use pageserver_api::key::Key; use pageserver_api::models::ShardParameters; + use pageserver_api::record::NeonWalRecord; use pageserver_api::shard::ShardIndex; + use utils::id::TenantId; use utils::logging; + use super::*; use crate::deletion_queue::mock::MockDeletionQueue; use crate::l0_flush::L0FlushConfig; use crate::walredo::apply_neon; - use pageserver_api::key::Key; - use pageserver_api::record::NeonWalRecord; - - use super::*; - use hex_literal::hex; - use utils::id::TenantId; pub const TIMELINE_ID: TimelineId = TimelineId::from_array(hex!("11223344556677881122334455667788")); @@ -5919,34 +5888,34 @@ pub(crate) mod harness { mod tests { use std::collections::{BTreeMap, BTreeSet}; - use super::*; - use crate::keyspace::KeySpaceAccum; - use crate::tenant::harness::*; - use crate::tenant::timeline::CompactFlags; - use crate::DEFAULT_PG_VERSION; use bytes::{Bytes, BytesMut}; use hex_literal::hex; use itertools::Itertools; - use pageserver_api::key::{Key, AUX_KEY_PREFIX, NON_INHERITED_RANGE, RELATION_SIZE_PREFIX}; + #[cfg(feature = "testing")] + use models::CompactLsnRange; + use pageserver_api::key::{AUX_KEY_PREFIX, Key, NON_INHERITED_RANGE, RELATION_SIZE_PREFIX}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings}; + #[cfg(feature = "testing")] + use pageserver_api::record::NeonWalRecord; use pageserver_api::value::Value; use pageserver_compaction::helpers::overlaps_with; - use rand::{thread_rng, Rng}; + use rand::{Rng, thread_rng}; use storage_layer::{IoConcurrency, PersistentLayerKey}; use tests::storage_layer::ValuesReconstructState; use tests::timeline::{GetVectoredError, ShutdownMode}; + #[cfg(feature = "testing")] + use timeline::GcInfo; + #[cfg(feature = "testing")] + use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn}; use timeline::{CompactOptions, DeltaLayerTestDesc}; use utils::id::TenantId; - #[cfg(feature = "testing")] - use models::CompactLsnRange; - #[cfg(feature = "testing")] - use pageserver_api::record::NeonWalRecord; - #[cfg(feature = "testing")] - use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn}; - #[cfg(feature = "testing")] - use timeline::GcInfo; + use super::*; + use crate::DEFAULT_PG_VERSION; + use crate::keyspace::KeySpaceAccum; + use crate::tenant::harness::*; + use crate::tenant::timeline::CompactFlags; static TEST_KEY: Lazy = Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001"))); @@ -6196,11 +6165,12 @@ mod tests { panic!("wrong error type") }; 
assert!(err.to_string().contains("invalid branch start lsn")); - assert!(err - .source() - .unwrap() - .to_string() - .contains("we might've already garbage collected needed data")) + assert!( + err.source() + .unwrap() + .to_string() + .contains("we might've already garbage collected needed data") + ) } } @@ -6229,11 +6199,12 @@ mod tests { panic!("wrong error type"); }; assert!(&err.to_string().contains("invalid branch start lsn")); - assert!(&err - .source() - .unwrap() - .to_string() - .contains("is earlier than latest GC cutoff")); + assert!( + &err.source() + .unwrap() + .to_string() + .contains("is earlier than latest GC cutoff") + ); } } @@ -7542,10 +7513,12 @@ mod tests { } } - assert!(!harness - .conf - .timeline_path(&tenant.tenant_shard_id, &TIMELINE_ID) - .exists()); + assert!( + !harness + .conf + .timeline_path(&tenant.tenant_shard_id, &TIMELINE_ID) + .exists() + ); Ok(()) } @@ -7746,7 +7719,10 @@ mod tests { let after_num_l0_delta_files = tline.layers.read().await.layer_map()?.level0_deltas().len(); - assert!(after_num_l0_delta_files < before_num_l0_delta_files, "after_num_l0_delta_files={after_num_l0_delta_files}, before_num_l0_delta_files={before_num_l0_delta_files}"); + assert!( + after_num_l0_delta_files < before_num_l0_delta_files, + "after_num_l0_delta_files={after_num_l0_delta_files}, before_num_l0_delta_files={before_num_l0_delta_files}" + ); assert_eq!( tline.get(test_key, lsn, &ctx).await?, @@ -7913,7 +7889,10 @@ mod tests { let (_, after_delta_file_accessed) = scan_with_statistics(&tline, &keyspace, lsn, &ctx, io_concurrency.clone()) .await?; - assert!(after_delta_file_accessed < before_delta_file_accessed, "after_delta_file_accessed={after_delta_file_accessed}, before_delta_file_accessed={before_delta_file_accessed}"); + assert!( + after_delta_file_accessed < before_delta_file_accessed, + "after_delta_file_accessed={after_delta_file_accessed}, before_delta_file_accessed={before_delta_file_accessed}" + ); // Given that we already produced an image layer, there should be no delta layer needed for the scan, but still setting a low threshold there for unforeseen circumstances. assert!( after_delta_file_accessed <= 2, @@ -7967,10 +7946,12 @@ mod tests { get_vectored_impl_wrapper(&tline, base_key, lsn, &ctx).await?, Some(test_img("data key 1")) ); - assert!(get_vectored_impl_wrapper(&tline, base_key_child, lsn, &ctx) - .await - .unwrap_err() - .is_missing_key_error()); + assert!( + get_vectored_impl_wrapper(&tline, base_key_child, lsn, &ctx) + .await + .unwrap_err() + .is_missing_key_error() + ); assert!( get_vectored_impl_wrapper(&tline, base_key_nonexist, lsn, &ctx) .await diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index 7b55df52a5..b16a88eaa4 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -14,6 +14,9 @@ //! len < 128: 0XXXXXXX //! len >= 128: 1CCCXXXX XXXXXXXX XXXXXXXX XXXXXXXX //! 
+use std::cmp::min; +use std::io::{Error, ErrorKind}; + use async_compression::Level; use bytes::{BufMut, BytesMut}; use pageserver_api::models::ImageCompressionAlgorithm; @@ -24,10 +27,8 @@ use tracing::warn; use crate::context::RequestContext; use crate::page_cache::PAGE_SZ; use crate::tenant::block_io::BlockCursor; -use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt}; use crate::virtual_file::VirtualFile; -use std::cmp::min; -use std::io::{Error, ErrorKind}; +use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt}; #[derive(Copy, Clone, Debug)] pub struct CompressionInfo { @@ -414,12 +415,15 @@ impl BlobWriter { #[cfg(test)] pub(crate) mod tests { - use super::*; - use crate::{context::DownloadBehavior, task_mgr::TaskKind, tenant::block_io::BlockReaderRef}; use camino::Utf8PathBuf; use camino_tempfile::Utf8TempDir; use rand::{Rng, SeedableRng}; + use super::*; + use crate::context::DownloadBehavior; + use crate::task_mgr::TaskKind; + use crate::tenant::block_io::BlockReaderRef; + async fn round_trip_test(blobs: &[Vec]) -> Result<(), Error> { round_trip_test_compressed::(blobs, false).await } @@ -486,7 +490,7 @@ pub(crate) mod tests { pub(crate) fn random_array(len: usize) -> Vec { let mut rng = rand::thread_rng(); - (0..len).map(|_| rng.gen()).collect::<_>() + (0..len).map(|_| rng.r#gen()).collect::<_>() } #[tokio::test] @@ -544,9 +548,9 @@ pub(crate) mod tests { let mut rng = rand::rngs::StdRng::seed_from_u64(42); let blobs = (0..1024) .map(|_| { - let mut sz: u16 = rng.gen(); + let mut sz: u16 = rng.r#gen(); // Make 50% of the arrays small - if rng.gen() { + if rng.r#gen() { sz &= 63; } random_array(sz.into()) diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs index 990211f80a..66c586daff 100644 --- a/pageserver/src/tenant/block_io.rs +++ b/pageserver/src/tenant/block_io.rs @@ -2,14 +2,16 @@ //! Low-level Block-oriented I/O functions //! +use std::ops::Deref; + +use bytes::Bytes; + use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner}; use crate::context::RequestContext; -use crate::page_cache::{self, FileId, PageReadGuard, PageWriteGuard, ReadBufResult, PAGE_SZ}; +use crate::page_cache::{self, FileId, PAGE_SZ, PageReadGuard, PageWriteGuard, ReadBufResult}; #[cfg(test)] use crate::virtual_file::IoBufferMut; use crate::virtual_file::VirtualFile; -use bytes::Bytes; -use std::ops::Deref; /// This is implemented by anything that can read 8 kB (PAGE_SZ) /// blocks, using the page cache diff --git a/pageserver/src/tenant/checks.rs b/pageserver/src/tenant/checks.rs index f98356242e..d5b979ab2a 100644 --- a/pageserver/src/tenant/checks.rs +++ b/pageserver/src/tenant/checks.rs @@ -63,9 +63,9 @@ pub fn check_valid_layermap(metadata: &[LayerName]) -> Option { && overlaps_with(&layer.key_range, &other_layer.key_range) { let err = format!( - "layer violates the layer map LSN split assumption: layer {} intersects with layer {}", - layer, other_layer - ); + "layer violates the layer map LSN split assumption: layer {} intersects with layer {}", + layer, other_layer + ); return Some(err); } } diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index ab4c4c935d..334fb04604 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -8,16 +8,17 @@ //! We cannot use global or default config instead, because wrong settings //! may lead to a data loss. //! 
+use std::num::NonZeroU64; +use std::time::Duration; + pub(crate) use pageserver_api::config::TenantConfigToml as TenantConf; -use pageserver_api::models::CompactionAlgorithmSettings; -use pageserver_api::models::EvictionPolicy; -use pageserver_api::models::{self, TenantConfigPatch}; +use pageserver_api::models::{ + self, CompactionAlgorithmSettings, EvictionPolicy, TenantConfigPatch, +}; use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize}; use serde::de::IntoDeserializer; use serde::{Deserialize, Serialize}; use serde_json::Value; -use std::num::NonZeroU64; -use std::time::Duration; use utils::generation::Generation; use utils::postgres_client::PostgresClientProtocol; @@ -739,9 +740,10 @@ impl From for models::TenantConfig { #[cfg(test)] mod tests { - use super::*; use models::TenantConfig; + use super::*; + #[test] fn de_serializing_pageserver_config_omits_empty_values() { let small_conf = TenantConfOpt { diff --git a/pageserver/src/tenant/disk_btree.rs b/pageserver/src/tenant/disk_btree.rs index bb9df020b5..73c105b34e 100644 --- a/pageserver/src/tenant/disk_btree.rs +++ b/pageserver/src/tenant/disk_btree.rs @@ -18,27 +18,23 @@ //! - An Iterator interface would be more convenient for the callers than the //! 'visit' function //! +use std::cmp::Ordering; +use std::iter::Rev; +use std::ops::{Range, RangeInclusive}; +use std::{io, result}; + use async_stream::try_stream; -use byteorder::{ReadBytesExt, BE}; +use byteorder::{BE, ReadBytesExt}; use bytes::{BufMut, Bytes, BytesMut}; use either::Either; use futures::{Stream, StreamExt}; use hex; -use std::{ - cmp::Ordering, - io, - iter::Rev, - ops::{Range, RangeInclusive}, - result, -}; use thiserror::Error; use tracing::error; -use crate::{ - context::{DownloadBehavior, RequestContext}, - task_mgr::TaskKind, - tenant::block_io::{BlockReader, BlockWriter}, -}; +use crate::context::{DownloadBehavior, RequestContext}; +use crate::task_mgr::TaskKind; +use crate::tenant::block_io::{BlockReader, BlockWriter}; // The maximum size of a value stored in the B-tree. 5 bytes is enough currently. pub const VALUE_SZ: usize = 5; @@ -833,12 +829,14 @@ impl BuildNode { #[cfg(test)] pub(crate) mod tests { - use super::*; - use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReaderRef}; - use rand::Rng; use std::collections::BTreeMap; use std::sync::atomic::{AtomicUsize, Ordering}; + use rand::Rng; + + use super::*; + use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReaderRef}; + #[derive(Clone, Default)] pub(crate) struct TestDisk { blocks: Vec, @@ -1115,7 +1113,7 @@ pub(crate) mod tests { // Test get() operations on random keys, most of which will not exist for _ in 0..100000 { - let key_int = rand::thread_rng().gen::(); + let key_int = rand::thread_rng().r#gen::(); let search_key = u128::to_be_bytes(key_int); assert!(reader.get(&search_key, &ctx).await? == all_data.get(&key_int).cloned()); } diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index ba79672bc7..cb25fa6185 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -1,6 +1,17 @@ //! Implementation of append-only file data structure //! used to keep in-memory layers spilled on disk. 
+use std::io; +use std::sync::Arc; +use std::sync::atomic::AtomicU64; + +use camino::Utf8PathBuf; +use num_traits::Num; +use pageserver_api::shard::TenantShardId; +use tokio_epoll_uring::{BoundedBuf, Slice}; +use tracing::error; +use utils::id::TimelineId; + use crate::assert_u64_eq_usize::{U64IsUsize, UsizeIsU64}; use crate::config::PageServerConf; use crate::context::RequestContext; @@ -9,17 +20,7 @@ use crate::tenant::storage_layer::inmemory_layer::vectored_dio_read::File; use crate::virtual_file::owned_buffers_io::io_buf_aligned::IoBufAlignedMut; use crate::virtual_file::owned_buffers_io::slice::SliceMutExt; use crate::virtual_file::owned_buffers_io::write::Buffer; -use crate::virtual_file::{self, owned_buffers_io, IoBufferMut, VirtualFile}; -use camino::Utf8PathBuf; -use num_traits::Num; -use pageserver_api::shard::TenantShardId; -use tokio_epoll_uring::{BoundedBuf, Slice}; -use tracing::error; - -use std::io; -use std::sync::atomic::AtomicU64; -use std::sync::Arc; -use utils::id::TimelineId; +use crate::virtual_file::{self, IoBufferMut, VirtualFile, owned_buffers_io}; pub struct EphemeralFile { _tenant_shard_id: TenantShardId, @@ -319,13 +320,14 @@ pub fn is_ephemeral_file(filename: &str) -> bool { #[cfg(test)] mod tests { + use std::fs; + use std::str::FromStr; + use rand::Rng; use super::*; use crate::context::DownloadBehavior; use crate::task_mgr::TaskKind; - use std::fs; - use std::str::FromStr; fn harness( test_name: &str, diff --git a/pageserver/src/tenant/gc_block.rs b/pageserver/src/tenant/gc_block.rs index af73acb2be..7aa920c953 100644 --- a/pageserver/src/tenant/gc_block.rs +++ b/pageserver/src/tenant/gc_block.rs @@ -1,4 +1,5 @@ -use std::{collections::HashMap, sync::Arc}; +use std::collections::HashMap; +use std::sync::Arc; use utils::id::TimelineId; diff --git a/pageserver/src/tenant/gc_result.rs b/pageserver/src/tenant/gc_result.rs index c805aafeab..7a7d6d19cb 100644 --- a/pageserver/src/tenant/gc_result.rs +++ b/pageserver/src/tenant/gc_result.rs @@ -1,8 +1,9 @@ -use anyhow::Result; -use serde::Serialize; use std::ops::AddAssign; use std::time::Duration; +use anyhow::Result; +use serde::Serialize; + /// /// Result of performing GC /// diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index a69cce932e..59f5a6bd90 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -46,24 +46,24 @@ mod historic_layer_coverage; mod layer_coverage; -use crate::context::RequestContext; -use crate::keyspace::KeyPartitioning; -use crate::tenant::storage_layer::InMemoryLayer; -use anyhow::Result; -use pageserver_api::key::Key; -use pageserver_api::keyspace::{KeySpace, KeySpaceAccum}; -use range_set_blaze::{CheckSortedDisjoint, RangeSetBlaze}; use std::collections::{HashMap, VecDeque}; use std::iter::Peekable; use std::ops::Range; use std::sync::Arc; + +use anyhow::Result; +use historic_layer_coverage::BufferedHistoricLayerCoverage; +pub use historic_layer_coverage::LayerKey; +use pageserver_api::key::Key; +use pageserver_api::keyspace::{KeySpace, KeySpaceAccum}; +use range_set_blaze::{CheckSortedDisjoint, RangeSetBlaze}; use tokio::sync::watch; use utils::lsn::Lsn; -use historic_layer_coverage::BufferedHistoricLayerCoverage; -pub use historic_layer_coverage::LayerKey; - use super::storage_layer::{LayerVisibilityHint, PersistentLayerDesc}; +use crate::context::RequestContext; +use crate::keyspace::KeyPartitioning; +use crate::tenant::storage_layer::InMemoryLayer; /// /// LayerMap tracks what layers exist on a timeline. 
@@ -1066,18 +1066,17 @@ impl LayerMap { #[cfg(test)] mod tests { - use crate::tenant::{storage_layer::LayerName, IndexPart}; - use pageserver_api::{ - key::DBDIR_KEY, - keyspace::{KeySpace, KeySpaceRandomAccum}, - }; - use std::{collections::HashMap, path::PathBuf}; - use utils::{ - id::{TenantId, TimelineId}, - shard::TenantShardId, - }; + use std::collections::HashMap; + use std::path::PathBuf; + + use pageserver_api::key::DBDIR_KEY; + use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum}; + use utils::id::{TenantId, TimelineId}; + use utils::shard::TenantShardId; use super::*; + use crate::tenant::IndexPart; + use crate::tenant::storage_layer::LayerName; #[derive(Clone)] struct LayerDesc { @@ -1417,9 +1416,11 @@ mod tests { assert!(!shadow.ranges.is_empty()); // At least some layers should be marked covered - assert!(layer_visibilities - .iter() - .any(|i| matches!(i.1, LayerVisibilityHint::Covered))); + assert!( + layer_visibilities + .iter() + .any(|i| matches!(i.1, LayerVisibilityHint::Covered)) + ); let layer_visibilities = layer_visibilities.into_iter().collect::>(); diff --git a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs index 136f68bc36..f8bec48886 100644 --- a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs +++ b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs @@ -3,9 +3,8 @@ use std::ops::Range; use tracing::info; -use crate::tenant::storage_layer::PersistentLayerDesc; - use super::layer_coverage::LayerCoverageTuple; +use crate::tenant::storage_layer::PersistentLayerDesc; /// Layers in this module are identified and indexed by this data. /// diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index 15c6955260..77f9a3579d 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -19,8 +19,9 @@ use anyhow::ensure; use serde::{Deserialize, Serialize}; -use utils::bin_ser::SerializeError; -use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn}; +use utils::bin_ser::{BeSer, SerializeError}; +use utils::id::TimelineId; +use utils::lsn::Lsn; /// Use special format number to enable backward compatibility. const METADATA_FORMAT_VERSION: u16 = 4; @@ -345,9 +346,10 @@ impl TimelineMetadata { } pub(crate) mod modern_serde { - use super::{TimelineMetadata, TimelineMetadataBodyV2, TimelineMetadataHeader}; use serde::{Deserialize, Serialize}; + use super::{TimelineMetadata, TimelineMetadataBodyV2, TimelineMetadataHeader}; + pub(crate) fn deserialize<'de, D>(deserializer: D) -> Result where D: serde::de::Deserializer<'de>, diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 22ee560dbf..003f84e640 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -1,34 +1,42 @@ //! This module acts as a switchboard to access different repositories managed by this //! page server. 
-use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf}; -use futures::StreamExt; -use itertools::Itertools; -use pageserver_api::key::Key; -use pageserver_api::models::LocationConfigMode; -use pageserver_api::shard::{ - ShardCount, ShardIdentity, ShardIndex, ShardNumber, ShardStripeSize, TenantShardId, -}; -use pageserver_api::upcall_api::ReAttachResponseTenant; -use rand::{distributions::Alphanumeric, Rng}; -use remote_storage::TimeoutOrCancel; use std::borrow::Cow; use std::cmp::Ordering; use std::collections::{BTreeMap, HashMap, HashSet}; use std::ops::Deref; use std::sync::Arc; use std::time::Duration; -use sysinfo::SystemExt; -use tokio::fs; use anyhow::Context; +use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf}; +use futures::StreamExt; +use itertools::Itertools; use once_cell::sync::Lazy; +use pageserver_api::key::Key; +use pageserver_api::models::LocationConfigMode; +use pageserver_api::shard::{ + ShardCount, ShardIdentity, ShardIndex, ShardNumber, ShardStripeSize, TenantShardId, +}; +use pageserver_api::upcall_api::ReAttachResponseTenant; +use rand::Rng; +use rand::distributions::Alphanumeric; +use remote_storage::TimeoutOrCancel; +use sysinfo::SystemExt; +use tokio::fs; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::*; - +use utils::crashsafe::path_with_suffix_extension; +use utils::fs_ext::PathExt; +use utils::generation::Generation; +use utils::id::{TenantId, TimelineId}; use utils::{backoff, completion, crashsafe}; +use super::remote_timeline_client::remote_tenant_path; +use super::secondary::SecondaryTenant; +use super::timeline::detach_ancestor::{self, PreparedTimelineDetach}; +use super::{GlobalShutDown, TenantSharedResources}; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; use crate::controller_upcall_client::{ @@ -37,7 +45,7 @@ use crate::controller_upcall_client::{ use crate::deletion_queue::DeletionQueueClient; use crate::http::routes::ACTIVE_TENANT_TIMEOUT; use crate::metrics::{TENANT, TENANT_MANAGER as METRICS}; -use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME}; +use crate::task_mgr::{BACKGROUND_RUNTIME, TaskKind}; use crate::tenant::config::{ AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, SecondaryLocationConfig, }; @@ -48,16 +56,6 @@ use crate::tenant::{AttachedTenantConf, GcError, LoadConfigError, SpawnMode, Ten use crate::virtual_file::MaybeFatalIo; use crate::{InitializationOrder, TEMP_FILE_SUFFIX}; -use utils::crashsafe::path_with_suffix_extension; -use utils::fs_ext::PathExt; -use utils::generation::Generation; -use utils::id::{TenantId, TimelineId}; - -use super::remote_timeline_client::remote_tenant_path; -use super::secondary::SecondaryTenant; -use super::timeline::detach_ancestor::{self, PreparedTimelineDetach}; -use super::{GlobalShutDown, TenantSharedResources}; - /// For a tenant that appears in TenantsMap, it may either be /// - `Attached`: has a full Tenant object, is elegible to service /// reads and ingest WAL. @@ -140,7 +138,7 @@ impl TenantStartupMode { /// If this returns None, the re-attach struct is in an invalid state and /// should be ignored in the response. 
fn from_reattach_tenant(rart: ReAttachResponseTenant) -> Option { - match (rart.mode, rart.gen) { + match (rart.mode, rart.r#gen) { (LocationConfigMode::Detached, _) => None, (LocationConfigMode::Secondary, _) => Some(Self::Secondary), (LocationConfigMode::AttachedMulti, Some(g)) => { @@ -376,7 +374,7 @@ async fn init_load_generations( TenantStartupMode::Attached((_mode, generation)) => Some(generation), TenantStartupMode::Secondary => None, } - .map(|gen| (*id, *gen)) + .map(|gen_| (*id, *gen_)) }) .collect(); resources.deletion_queue_client.recover(attached_tenants)?; @@ -502,7 +500,9 @@ pub async fn init_tenant_mgr( .total_memory(); let max_ephemeral_layer_bytes = conf.ephemeral_bytes_per_memory_kb as u64 * (system_memory / 1024); - tracing::info!("Initialized ephemeral layer size limit to {max_ephemeral_layer_bytes}, for {system_memory} bytes of memory"); + tracing::info!( + "Initialized ephemeral layer size limit to {max_ephemeral_layer_bytes}, for {system_memory} bytes of memory" + ); inmemory_layer::GLOBAL_RESOURCES.max_dirty_bytes.store( max_ephemeral_layer_bytes, std::sync::atomic::Ordering::Relaxed, @@ -700,10 +700,11 @@ fn tenant_spawn( // to avoid impacting prod runtime performance. assert!(!crate::is_temporary(tenant_path)); debug_assert!(tenant_path.is_dir()); - debug_assert!(conf - .tenant_location_config_path(&tenant_shard_id) - .try_exists() - .unwrap()); + debug_assert!( + conf.tenant_location_config_path(&tenant_shard_id) + .try_exists() + .unwrap() + ); Tenant::spawn( conf, @@ -791,7 +792,9 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock) { (total_in_progress, total_attached) } TenantsMap::ShuttingDown(_) => { - error!("already shutting down, this function isn't supposed to be called more than once"); + error!( + "already shutting down, this function isn't supposed to be called more than once" + ); return; } } @@ -1016,9 +1019,9 @@ impl TenantManager { Ok(Ok(_)) => return Ok(Some(tenant)), Err(_) => { tracing::warn!( - timeout_ms = flush_timeout.as_millis(), - "Timed out waiting for flush to remote storage, proceeding anyway." - ) + timeout_ms = flush_timeout.as_millis(), + "Timed out waiting for flush to remote storage, proceeding anyway." + ) } } } @@ -1194,7 +1197,9 @@ impl TenantManager { } TenantSlot::Attached(tenant) => { let (_guard, progress) = utils::completion::channel(); - info!("Shutting down just-spawned tenant, because tenant manager is shut down"); + info!( + "Shutting down just-spawned tenant, because tenant manager is shut down" + ); match tenant.shutdown(progress, ShutdownMode::Hard).await { Ok(()) => { info!("Finished shutting down just-spawned tenant"); @@ -1784,7 +1789,7 @@ impl TenantManager { _ => { return Err(anyhow::anyhow!(e).context(format!( "Hard linking {relative_layer} into {child_prefix}" - ))) + ))); } } } @@ -2025,8 +2030,8 @@ impl TenantManager { .wait_to_become_active(std::time::Duration::from_secs(9999)) .await .map_err(|e| { - use pageserver_api::models::TenantState; use GetActiveTenantError::{Cancelled, WillNotBecomeActive}; + use pageserver_api::models::TenantState; match e { Cancelled | WillNotBecomeActive(TenantState::Stopping { .. 
}) => { Error::ShuttingDown @@ -2089,7 +2094,7 @@ impl TenantManager { match selector { ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => { - return ShardResolveResult::Found(tenant.clone()) + return ShardResolveResult::Found(tenant.clone()); } ShardSelector::Page(key) => { // First slot we see for this tenant, calculate the expected shard number @@ -2486,7 +2491,7 @@ impl SlotGuard { TenantsMap::Initializing => { return Err(TenantSlotUpsertError::MapState( TenantMapError::StillInitializing, - )) + )); } TenantsMap::ShuttingDown(_) => { return Err(TenantSlotUpsertError::ShuttingDown(( @@ -2815,21 +2820,22 @@ where } } -use { - crate::tenant::gc_result::GcResult, http_utils::error::ApiError, - pageserver_api::models::TimelineGcRequest, -}; +use http_utils::error::ApiError; +use pageserver_api::models::TimelineGcRequest; + +use crate::tenant::gc_result::GcResult; #[cfg(test)] mod tests { use std::collections::BTreeMap; use std::sync::Arc; + use tracing::Instrument; + use super::super::harness::TenantHarness; + use super::TenantsMap; use crate::tenant::mgr::TenantSlot; - use super::{super::harness::TenantHarness, TenantsMap}; - #[tokio::test(start_paused = true)] async fn shutdown_awaits_in_progress_tenant() { // Test that if an InProgress tenant is in the map during shutdown, the shutdown will gracefully diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index e01da48052..4ba5844fea 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -179,78 +179,64 @@ pub mod index; pub mod manifest; pub(crate) mod upload; -use anyhow::Context; -use camino::Utf8Path; -use chrono::{NaiveDateTime, Utc}; - -pub(crate) use download::download_initdb_tar_zst; -use index::GcCompactionState; -use pageserver_api::models::TimelineArchivalState; -use pageserver_api::shard::{ShardIndex, TenantShardId}; -use regex::Regex; -use scopeguard::ScopeGuard; -use tokio_util::sync::CancellationToken; -use utils::backoff::{ - self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, -}; -use utils::pausable_failpoint; -use utils::shard::ShardNumber; - use std::collections::{HashMap, HashSet, VecDeque}; +use std::ops::DerefMut; use std::sync::atomic::{AtomicU32, Ordering}; use std::sync::{Arc, Mutex, OnceLock}; use std::time::Duration; +use anyhow::Context; +use camino::Utf8Path; +use chrono::{NaiveDateTime, Utc}; +pub(crate) use download::{ + download_index_part, download_initdb_tar_zst, download_tenant_manifest, is_temp_download_file, + list_remote_tenant_shards, list_remote_timelines, +}; +use index::GcCompactionState; +pub(crate) use index::LayerFileMetadata; +use pageserver_api::models::TimelineArchivalState; +use pageserver_api::shard::{ShardIndex, TenantShardId}; +use regex::Regex; use remote_storage::{ DownloadError, GenericRemoteStorage, ListingMode, RemotePath, TimeoutOrCancel, }; -use std::ops::DerefMut; -use tracing::{debug, error, info, instrument, warn}; -use tracing::{info_span, Instrument}; -use utils::lsn::Lsn; - -use crate::context::RequestContext; -use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError}; -use crate::metrics::{ - MeasureRemoteOp, RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics, - RemoteTimelineClientMetricsCallTrackSize, REMOTE_ONDEMAND_DOWNLOADED_BYTES, - REMOTE_ONDEMAND_DOWNLOADED_LAYERS, +use scopeguard::ScopeGuard; +use tokio_util::sync::CancellationToken; +use tracing::{Instrument, debug, error, info, 
info_span, instrument, warn}; +pub(crate) use upload::upload_initdb_dir; +use utils::backoff::{ + self, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, exponential_backoff, }; -use crate::task_mgr::shutdown_token; -use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id; -use crate::tenant::remote_timeline_client::download::download_retry; -use crate::tenant::storage_layer::AsLayerDesc; -use crate::tenant::upload_queue::{Delete, OpType, UploadQueueStoppedDeletable}; -use crate::tenant::TIMELINES_SEGMENT_NAME; -use crate::{ - config::PageServerConf, - task_mgr, - task_mgr::TaskKind, - task_mgr::BACKGROUND_RUNTIME, - tenant::metadata::TimelineMetadata, - tenant::upload_queue::{ - UploadOp, UploadQueue, UploadQueueInitialized, UploadQueueStopped, UploadTask, - }, - TENANT_HEATMAP_BASENAME, -}; - use utils::id::{TenantId, TimelineId}; +use utils::lsn::Lsn; +use utils::pausable_failpoint; +use utils::shard::ShardNumber; use self::index::IndexPart; - use super::config::AttachedLocationConfig; use super::metadata::MetadataUpdate; use super::storage_layer::{Layer, LayerName, ResidentLayer}; use super::timeline::import_pgdata; use super::upload_queue::{NotInitialized, SetDeletedFlagProgress}; use super::{DeleteTimelineError, Generation}; - -pub(crate) use download::{ - download_index_part, download_tenant_manifest, is_temp_download_file, - list_remote_tenant_shards, list_remote_timelines, +use crate::config::PageServerConf; +use crate::context::RequestContext; +use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError}; +use crate::metrics::{ + MeasureRemoteOp, REMOTE_ONDEMAND_DOWNLOADED_BYTES, REMOTE_ONDEMAND_DOWNLOADED_LAYERS, + RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics, + RemoteTimelineClientMetricsCallTrackSize, }; -pub(crate) use index::LayerFileMetadata; -pub(crate) use upload::upload_initdb_dir; +use crate::task_mgr::{BACKGROUND_RUNTIME, TaskKind, shutdown_token}; +use crate::tenant::metadata::TimelineMetadata; +use crate::tenant::remote_timeline_client::download::download_retry; +use crate::tenant::storage_layer::AsLayerDesc; +use crate::tenant::upload_queue::{ + Delete, OpType, UploadOp, UploadQueue, UploadQueueInitialized, UploadQueueStopped, + UploadQueueStoppedDeletable, UploadTask, +}; +use crate::tenant::{TIMELINES_SEGMENT_NAME, debug_assert_current_span_has_tenant_and_timeline_id}; +use crate::{TENANT_HEATMAP_BASENAME, task_mgr}; // Occasional network issues and such can cause remote operations to fail, and // that's expected. If a download fails, we log it at info-level, and retry. 
@@ -1091,7 +1077,11 @@ impl RemoteTimelineClient { if !wanted(x) && wanted(y) { // this could be avoided by having external in-memory synchronization, like // timeline detach ancestor - warn!(?reason, op="insert", "unexpected: two racing processes to enable and disable a gc blocking reason"); + warn!( + ?reason, + op = "insert", + "unexpected: two racing processes to enable and disable a gc blocking reason" + ); } // at this point, the metadata must always show that there is a parent @@ -1145,7 +1135,11 @@ impl RemoteTimelineClient { (x, y) if wanted(x) && !wanted(y) => Some(self.schedule_barrier0(upload_queue)), (x, y) => { if !wanted(x) && wanted(y) { - warn!(?reason, op="remove", "unexpected: two racing processes to enable and disable a gc blocking reason (remove)"); + warn!( + ?reason, + op = "remove", + "unexpected: two racing processes to enable and disable a gc blocking reason (remove)" + ); } upload_queue.dirty.gc_blocking = @@ -1287,12 +1281,14 @@ impl RemoteTimelineClient { #[cfg(feature = "testing")] for (name, metadata) in &with_metadata { - let gen = metadata.generation; - if let Some(unexpected) = upload_queue.dangling_files.insert(name.to_owned(), gen) { - if unexpected == gen { + let gen_ = metadata.generation; + if let Some(unexpected) = upload_queue.dangling_files.insert(name.to_owned(), gen_) { + if unexpected == gen_ { tracing::error!("{name} was unlinked twice with same generation"); } else { - tracing::error!("{name} was unlinked twice with different generations {gen:?} and {unexpected:?}"); + tracing::error!( + "{name} was unlinked twice with different generations {gen_:?} and {unexpected:?}" + ); } } } @@ -1354,11 +1350,11 @@ impl RemoteTimelineClient { #[cfg(feature = "testing")] for (name, meta) in &with_metadata { - let gen = meta.generation; + let gen_ = meta.generation; match upload_queue.dangling_files.remove(name) { - Some(same) if same == gen => { /* expected */ } + Some(same) if same == gen_ => { /* expected */ } Some(other) => { - tracing::error!("{name} was unlinked with {other:?} but deleted with {gen:?}"); + tracing::error!("{name} was unlinked with {other:?} but deleted with {gen_:?}"); } None => { tracing::error!("{name} was unlinked but was not dangling"); @@ -1455,7 +1451,9 @@ impl RemoteTimelineClient { // proper stop is yet to be called. On cancel the original or some later task must call // `stop` or `shutdown`. 
let sg = scopeguard::guard((), |_| { - tracing::error!("RemoteTimelineClient::shutdown was cancelled; this should not happen, do not make this into an allowed_error") + tracing::error!( + "RemoteTimelineClient::shutdown was cancelled; this should not happen, do not make this into an allowed_error" + ) }); let fut = { @@ -1471,7 +1469,7 @@ impl RemoteTimelineClient { scopeguard::ScopeGuard::into_inner(sg); return; } - UploadQueue::Initialized(ref mut init) => init, + UploadQueue::Initialized(init) => init, }; // if the queue is already stuck due to a shutdown operation which was cancelled, then @@ -1831,7 +1829,9 @@ impl RemoteTimelineClient { .map(|n| n.starts_with(IndexPart::FILE_NAME)) .unwrap_or(false) }) - .filter_map(|o| parse_remote_index_path(o.key.clone()).map(|gen| (o.key.clone(), gen))) + .filter_map(|o| { + parse_remote_index_path(o.key.clone()).map(|gen_| (o.key.clone(), gen_)) + }) .max_by_key(|i| i.1) .map(|i| i.0.clone()) .unwrap_or( @@ -2023,7 +2023,7 @@ impl RemoteTimelineClient { } let upload_result: anyhow::Result<()> = match &task.op { - UploadOp::UploadLayer(ref layer, ref layer_metadata, mode) => { + UploadOp::UploadLayer(layer, layer_metadata, mode) => { // TODO: check if this mechanism can be removed now that can_bypass() performs // conflict checks during scheduling. if let Some(OpType::FlushDeletion) = mode { @@ -2113,7 +2113,7 @@ impl RemoteTimelineClient { ) .await } - UploadOp::UploadMetadata { ref uploaded } => { + UploadOp::UploadMetadata { uploaded } => { let res = upload::upload_index_part( &self.storage_impl, &self.tenant_shard_id, @@ -2229,11 +2229,11 @@ impl RemoteTimelineClient { let lsn_update = { let mut upload_queue_guard = self.upload_queue.lock().unwrap(); let upload_queue = match upload_queue_guard.deref_mut() { - UploadQueue::Uninitialized => panic!("callers are responsible for ensuring this is only called on an initialized queue"), - UploadQueue::Stopped(_stopped) => { - None - }, - UploadQueue::Initialized(qi) => { Some(qi) } + UploadQueue::Uninitialized => panic!( + "callers are responsible for ensuring this is only called on an initialized queue" + ), + UploadQueue::Stopped(_stopped) => None, + UploadQueue::Initialized(qi) => Some(qi), }; let upload_queue = match upload_queue { @@ -2255,7 +2255,11 @@ impl RemoteTimelineClient { let is_later = last_updater.is_some_and(|task_id| task_id < task.task_id); let monotone = is_later || last_updater.is_none(); - assert!(monotone, "no two index uploads should be completing at the same time, prev={last_updater:?}, task.task_id={}", task.task_id); + assert!( + monotone, + "no two index uploads should be completing at the same time, prev={last_updater:?}, task.task_id={}", + task.task_id + ); // not taking ownership is wasteful upload_queue.clean.0.clone_from(uploaded); @@ -2654,20 +2658,16 @@ pub fn parse_remote_tenant_manifest_path(path: RemotePath) -> Option #[cfg(test)] mod tests { - use super::*; - use crate::{ - context::RequestContext, - tenant::{ - config::AttachmentMode, - harness::{TenantHarness, TIMELINE_ID}, - storage_layer::layer::local_layer_path, - Tenant, Timeline, - }, - DEFAULT_PG_VERSION, - }; - use std::collections::HashSet; + use super::*; + use crate::DEFAULT_PG_VERSION; + use crate::context::RequestContext; + use crate::tenant::config::AttachmentMode; + use crate::tenant::harness::{TIMELINE_ID, TenantHarness}; + use crate::tenant::storage_layer::layer::local_layer_path; + use crate::tenant::{Tenant, Timeline}; + pub(super) fn dummy_contents(name: &str) -> Vec { format!("contents for 
{name}").into() } diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index b4d45dca75..92be2145ce 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -8,41 +8,39 @@ use std::future::Future; use std::str::FromStr; use std::time::SystemTime; -use anyhow::{anyhow, Context}; +use anyhow::{Context, anyhow}; use camino::{Utf8Path, Utf8PathBuf}; use pageserver_api::shard::TenantShardId; +use remote_storage::{ + DownloadError, DownloadKind, DownloadOpts, GenericRemoteStorage, ListingMode, RemotePath, +}; use tokio::fs::{self, File, OpenOptions}; use tokio::io::{AsyncSeekExt, AsyncWriteExt}; use tokio_util::io::StreamReader; use tokio_util::sync::CancellationToken; use tracing::warn; -use utils::backoff; +use utils::crashsafe::path_with_suffix_extension; +use utils::id::{TenantId, TimelineId}; +use utils::{backoff, pausable_failpoint}; +use super::index::{IndexPart, LayerFileMetadata}; +use super::manifest::TenantManifest; +use super::{ + FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, INITDB_PATH, parse_remote_index_path, + parse_remote_tenant_manifest_path, remote_index_path, remote_initdb_archive_path, + remote_initdb_preserved_archive_path, remote_tenant_manifest_path, + remote_tenant_manifest_prefix, remote_tenant_path, +}; +use crate::TEMP_FILE_SUFFIX; use crate::config::PageServerConf; use crate::context::RequestContext; use crate::span::{ debug_assert_current_span_has_tenant_and_timeline_id, debug_assert_current_span_has_tenant_id, }; +use crate::tenant::Generation; use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path}; use crate::tenant::storage_layer::LayerName; -use crate::tenant::Generation; -use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}; -use crate::TEMP_FILE_SUFFIX; -use remote_storage::{ - DownloadError, DownloadKind, DownloadOpts, GenericRemoteStorage, ListingMode, RemotePath, -}; -use utils::crashsafe::path_with_suffix_extension; -use utils::id::{TenantId, TimelineId}; -use utils::pausable_failpoint; - -use super::index::{IndexPart, LayerFileMetadata}; -use super::manifest::TenantManifest; -use super::{ - parse_remote_index_path, parse_remote_tenant_manifest_path, remote_index_path, - remote_initdb_archive_path, remote_initdb_preserved_archive_path, remote_tenant_manifest_path, - remote_tenant_manifest_prefix, remote_tenant_path, FAILED_DOWNLOAD_WARN_THRESHOLD, - FAILED_REMOTE_OP_RETRIES, INITDB_PATH, -}; +use crate::virtual_file::{MaybeFatalIo, VirtualFile, on_fatal_io_error}; /// /// If 'metadata' is given, we will validate that the downloaded file's size matches that @@ -207,9 +205,9 @@ async fn download_object( } #[cfg(target_os = "linux")] crate::virtual_file::io_engine::IoEngine::TokioEpollUring => { - use crate::virtual_file::owned_buffers_io; - use crate::virtual_file::IoBufferMut; use std::sync::Arc; + + use crate::virtual_file::{IoBufferMut, owned_buffers_io}; async { let destination_file = Arc::new( VirtualFile::create(dst_path, ctx) diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 727b25fbf4..ceaed58bbd 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -7,16 +7,16 @@ use std::collections::HashMap; use chrono::NaiveDateTime; use pageserver_api::models::AuxFilePolicy; +use 
pageserver_api::shard::ShardIndex; use serde::{Deserialize, Serialize}; +use utils::id::TimelineId; +use utils::lsn::Lsn; use super::is_same_remote_layer_path; +use crate::tenant::Generation; use crate::tenant::metadata::TimelineMetadata; use crate::tenant::storage_layer::LayerName; use crate::tenant::timeline::import_pgdata; -use crate::tenant::Generation; -use pageserver_api::shard::ShardIndex; -use utils::id::TimelineId; -use utils::lsn::Lsn; /// In-memory representation of an `index_part.json` file /// @@ -435,10 +435,12 @@ impl GcBlocking { #[cfg(test)] mod tests { - use super::*; use std::str::FromStr; + use utils::id::TimelineId; + use super::*; + #[test] fn v1_indexpart_is_parsed() { let example = r#"{ diff --git a/pageserver/src/tenant/remote_timeline_client/manifest.rs b/pageserver/src/tenant/remote_timeline_client/manifest.rs index 2029847a12..543ccc219d 100644 --- a/pageserver/src/tenant/remote_timeline_client/manifest.rs +++ b/pageserver/src/tenant/remote_timeline_client/manifest.rs @@ -1,6 +1,7 @@ use chrono::NaiveDateTime; use serde::{Deserialize, Serialize}; -use utils::{id::TimelineId, lsn::Lsn}; +use utils::id::TimelineId; +use utils::lsn::Lsn; /// Tenant-shard scoped manifest #[derive(Clone, Serialize, Deserialize, PartialEq, Eq)] diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs index af4dbbbfb6..7d9f47665a 100644 --- a/pageserver/src/tenant/remote_timeline_client/upload.rs +++ b/pageserver/src/tenant/remote_timeline_client/upload.rs @@ -1,28 +1,28 @@ //! Helper functions to upload files to remote storage with a RemoteStorage -use anyhow::{bail, Context}; +use std::io::{ErrorKind, SeekFrom}; +use std::time::SystemTime; + +use anyhow::{Context, bail}; use bytes::Bytes; use camino::Utf8Path; use fail::fail_point; use pageserver_api::shard::TenantShardId; -use std::io::{ErrorKind, SeekFrom}; -use std::time::SystemTime; +use remote_storage::{GenericRemoteStorage, RemotePath, TimeTravelError}; use tokio::fs::{self, File}; use tokio::io::AsyncSeekExt; use tokio_util::sync::CancellationToken; +use tracing::info; +use utils::id::{TenantId, TimelineId}; use utils::{backoff, pausable_failpoint}; +use super::Generation; use super::index::IndexPart; use super::manifest::TenantManifest; -use super::Generation; use crate::tenant::remote_timeline_client::{ remote_index_path, remote_initdb_archive_path, remote_initdb_preserved_archive_path, remote_tenant_manifest_path, }; -use remote_storage::{GenericRemoteStorage, RemotePath, TimeTravelError}; -use utils::id::{TenantId, TimelineId}; - -use tracing::info; /// Serializes and uploads the given index part data to the remote storage. 
pub(crate) async fn upload_index_part( @@ -134,7 +134,9 @@ pub(super) async fn upload_timeline_layer<'a>( .len(); if metadata_size != fs_size { - bail!("File {local_path:?} has its current FS size {fs_size} diferent from initially determined {metadata_size}"); + bail!( + "File {local_path:?} has its current FS size {fs_size} diferent from initially determined {metadata_size}" + ); } let fs_size = usize::try_from(fs_size) diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs index 4bc208331b..8f8622c796 100644 --- a/pageserver/src/tenant/secondary.rs +++ b/pageserver/src/tenant/secondary.rs @@ -3,40 +3,31 @@ pub mod heatmap; mod heatmap_uploader; mod scheduler; -use std::{sync::Arc, time::SystemTime}; +use std::sync::Arc; +use std::time::SystemTime; -use crate::{ - context::RequestContext, - disk_usage_eviction_task::DiskUsageEvictionInfo, - metrics::SECONDARY_HEATMAP_TOTAL_SIZE, - task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, -}; - -use self::{ - downloader::{downloader_task, SecondaryDetail}, - heatmap_uploader::heatmap_uploader_task, -}; - -use super::{ - config::{SecondaryLocationConfig, TenantConfOpt}, - mgr::TenantManager, - span::debug_assert_current_span_has_tenant_id, - storage_layer::LayerName, - GetTenantError, -}; - -use crate::metrics::SECONDARY_RESIDENT_PHYSICAL_SIZE; use metrics::UIntGauge; -use pageserver_api::{ - models, - shard::{ShardIdentity, TenantShardId}, -}; +use pageserver_api::models; +use pageserver_api::shard::{ShardIdentity, TenantShardId}; use remote_storage::GenericRemoteStorage; - use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use tracing::instrument; -use utils::{completion::Barrier, id::TimelineId, sync::gate::Gate}; +use utils::completion::Barrier; +use utils::id::TimelineId; +use utils::sync::gate::Gate; + +use self::downloader::{SecondaryDetail, downloader_task}; +use self::heatmap_uploader::heatmap_uploader_task; +use super::GetTenantError; +use super::config::{SecondaryLocationConfig, TenantConfOpt}; +use super::mgr::TenantManager; +use super::span::debug_assert_current_span_has_tenant_id; +use super::storage_layer::LayerName; +use crate::context::RequestContext; +use crate::disk_usage_eviction_task::DiskUsageEvictionInfo; +use crate::metrics::{SECONDARY_HEATMAP_TOTAL_SIZE, SECONDARY_RESIDENT_PHYSICAL_SIZE}; +use crate::task_mgr::{self, BACKGROUND_RUNTIME, TaskKind}; enum DownloadCommand { Download(TenantShardId), diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 2e8c3946bd..a13b9323ac 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -1,47 +1,8 @@ -use std::{ - collections::{HashMap, HashSet}, - pin::Pin, - str::FromStr, - sync::Arc, - time::{Duration, Instant, SystemTime}, -}; - -use crate::{ - config::PageServerConf, - context::RequestContext, - disk_usage_eviction_task::{ - finite_f32, DiskUsageEvictionInfo, EvictionCandidate, EvictionLayer, EvictionSecondaryLayer, - }, - metrics::SECONDARY_MODE, - tenant::{ - config::SecondaryLocationConfig, - debug_assert_current_span_has_tenant_and_timeline_id, - ephemeral_file::is_ephemeral_file, - remote_timeline_client::{ - index::LayerFileMetadata, is_temp_download_file, FAILED_DOWNLOAD_WARN_THRESHOLD, - FAILED_REMOTE_OP_RETRIES, - }, - span::debug_assert_current_span_has_tenant_id, - storage_layer::{layer::local_layer_path, LayerName, LayerVisibilityHint}, - tasks::{warn_when_period_overrun, BackgroundLoopKind}, - }, - 
virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}, - TEMP_FILE_SUFFIX, -}; - -use super::{ - heatmap::HeatMapLayer, - scheduler::{ - self, period_jitter, period_warmup, Completion, JobGenerator, SchedulingResult, - TenantBackgroundJobs, - }, - GetTenantError, SecondaryTenant, SecondaryTenantError, -}; - -use crate::tenant::{ - mgr::TenantManager, - remote_timeline_client::{download::download_layer_file, remote_heatmap_path}, -}; +use std::collections::{HashMap, HashSet}; +use std::pin::Pin; +use std::str::FromStr; +use std::sync::Arc; +use std::time::{Duration, Instant, SystemTime}; use camino::Utf8PathBuf; use chrono::format::{DelayedFormat, StrftimeItems}; @@ -50,18 +11,43 @@ use metrics::UIntGauge; use pageserver_api::models::SecondaryProgress; use pageserver_api::shard::TenantShardId; use remote_storage::{DownloadError, DownloadKind, DownloadOpts, Etag, GenericRemoteStorage}; - use tokio_util::sync::CancellationToken; -use tracing::{info_span, instrument, warn, Instrument}; -use utils::{ - backoff, completion::Barrier, crashsafe::path_with_suffix_extension, failpoint_support, fs_ext, - id::TimelineId, pausable_failpoint, serde_system_time, -}; +use tracing::{Instrument, info_span, instrument, warn}; +use utils::completion::Barrier; +use utils::crashsafe::path_with_suffix_extension; +use utils::id::TimelineId; +use utils::{backoff, failpoint_support, fs_ext, pausable_failpoint, serde_system_time}; -use super::{ - heatmap::{HeatMapTenant, HeatMapTimeline}, - CommandRequest, DownloadCommand, +use super::heatmap::{HeatMapLayer, HeatMapTenant, HeatMapTimeline}; +use super::scheduler::{ + self, Completion, JobGenerator, SchedulingResult, TenantBackgroundJobs, period_jitter, + period_warmup, }; +use super::{ + CommandRequest, DownloadCommand, GetTenantError, SecondaryTenant, SecondaryTenantError, +}; +use crate::TEMP_FILE_SUFFIX; +use crate::config::PageServerConf; +use crate::context::RequestContext; +use crate::disk_usage_eviction_task::{ + DiskUsageEvictionInfo, EvictionCandidate, EvictionLayer, EvictionSecondaryLayer, finite_f32, +}; +use crate::metrics::SECONDARY_MODE; +use crate::tenant::config::SecondaryLocationConfig; +use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id; +use crate::tenant::ephemeral_file::is_ephemeral_file; +use crate::tenant::mgr::TenantManager; +use crate::tenant::remote_timeline_client::download::download_layer_file; +use crate::tenant::remote_timeline_client::index::LayerFileMetadata; +use crate::tenant::remote_timeline_client::{ + FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, is_temp_download_file, + remote_heatmap_path, +}; +use crate::tenant::span::debug_assert_current_span_has_tenant_id; +use crate::tenant::storage_layer::layer::local_layer_path; +use crate::tenant::storage_layer::{LayerName, LayerVisibilityHint}; +use crate::tenant::tasks::{BackgroundLoopKind, warn_when_period_overrun}; +use crate::virtual_file::{MaybeFatalIo, VirtualFile, on_fatal_io_error}; /// For each tenant, default period for how long must have passed since the last download_tenant call before /// calling it again. 
This default is replaced with the value of [`HeatMapTenant::upload_period_ms`] after first diff --git a/pageserver/src/tenant/secondary/heatmap.rs b/pageserver/src/tenant/secondary/heatmap.rs index 0fa10ca294..4a938e9095 100644 --- a/pageserver/src/tenant/secondary/heatmap.rs +++ b/pageserver/src/tenant/secondary/heatmap.rs @@ -1,11 +1,13 @@ -use std::{collections::HashMap, time::SystemTime}; - -use crate::tenant::{remote_timeline_client::index::LayerFileMetadata, storage_layer::LayerName}; +use std::collections::HashMap; +use std::time::SystemTime; use serde::{Deserialize, Serialize}; -use serde_with::{serde_as, DisplayFromStr, TimestampSeconds}; +use serde_with::{DisplayFromStr, TimestampSeconds, serde_as}; +use utils::generation::Generation; +use utils::id::TimelineId; -use utils::{generation::Generation, id::TimelineId}; +use crate::tenant::remote_timeline_client::index::LayerFileMetadata; +use crate::tenant::storage_layer::LayerName; #[derive(Serialize, Deserialize)] pub(crate) struct HeatMapTenant { diff --git a/pageserver/src/tenant/secondary/heatmap_uploader.rs b/pageserver/src/tenant/secondary/heatmap_uploader.rs index d72c337369..3375714a66 100644 --- a/pageserver/src/tenant/secondary/heatmap_uploader.rs +++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs @@ -1,42 +1,33 @@ -use std::{ - collections::HashMap, - pin::Pin, - sync::{Arc, Weak}, - time::{Duration, Instant}, -}; - -use crate::{ - metrics::SECONDARY_MODE, - tenant::{ - config::AttachmentMode, - mgr::{GetTenantError, TenantManager}, - remote_timeline_client::remote_heatmap_path, - span::debug_assert_current_span_has_tenant_id, - tasks::{warn_when_period_overrun, BackgroundLoopKind}, - Tenant, - }, - virtual_file::VirtualFile, - TEMP_FILE_SUFFIX, -}; +use std::collections::HashMap; +use std::pin::Pin; +use std::sync::{Arc, Weak}; +use std::time::{Duration, Instant}; use futures::Future; use pageserver_api::shard::TenantShardId; use remote_storage::{GenericRemoteStorage, TimeoutOrCancel}; - -use super::{ - heatmap::HeatMapTenant, - scheduler::{ - self, period_jitter, period_warmup, JobGenerator, RunningJob, SchedulingResult, - TenantBackgroundJobs, - }, - CommandRequest, SecondaryTenantError, UploadCommand, -}; use tokio_util::sync::CancellationToken; -use tracing::{info_span, instrument, Instrument}; -use utils::{ - backoff, completion::Barrier, crashsafe::path_with_suffix_extension, - yielding_loop::yielding_loop, +use tracing::{Instrument, info_span, instrument}; +use utils::backoff; +use utils::completion::Barrier; +use utils::crashsafe::path_with_suffix_extension; +use utils::yielding_loop::yielding_loop; + +use super::heatmap::HeatMapTenant; +use super::scheduler::{ + self, JobGenerator, RunningJob, SchedulingResult, TenantBackgroundJobs, period_jitter, + period_warmup, }; +use super::{CommandRequest, SecondaryTenantError, UploadCommand}; +use crate::TEMP_FILE_SUFFIX; +use crate::metrics::SECONDARY_MODE; +use crate::tenant::Tenant; +use crate::tenant::config::AttachmentMode; +use crate::tenant::mgr::{GetTenantError, TenantManager}; +use crate::tenant::remote_timeline_client::remote_heatmap_path; +use crate::tenant::span::debug_assert_current_span_has_tenant_id; +use crate::tenant::tasks::{BackgroundLoopKind, warn_when_period_overrun}; +use crate::virtual_file::VirtualFile; pub(super) async fn heatmap_uploader_task( tenant_manager: Arc, diff --git a/pageserver/src/tenant/secondary/scheduler.rs b/pageserver/src/tenant/secondary/scheduler.rs index e963c722b9..f948f9114f 100644 --- 
a/pageserver/src/tenant/secondary/scheduler.rs +++ b/pageserver/src/tenant/secondary/scheduler.rs @@ -1,16 +1,15 @@ -use futures::Future; -use rand::Rng; -use std::{ - collections::HashMap, - marker::PhantomData, - pin::Pin, - time::{Duration, Instant}, -}; +use std::collections::HashMap; +use std::marker::PhantomData; +use std::pin::Pin; +use std::time::{Duration, Instant}; +use futures::Future; use pageserver_api::shard::TenantShardId; +use rand::Rng; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; -use utils::{completion::Barrier, yielding_loop::yielding_loop}; +use utils::completion::Barrier; +use utils::yielding_loop::yielding_loop; use super::{CommandRequest, CommandResponse, SecondaryTenantError}; diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index 1e84a9d9dc..ed6b351c75 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -4,21 +4,18 @@ use std::collections::{HashMap, HashSet}; use std::sync::Arc; use tenant_size_model::svg::SvgBranchKind; -use tokio::sync::oneshot::error::RecvError; +use tenant_size_model::{Segment, StorageModel}; use tokio::sync::Semaphore; +use tokio::sync::oneshot::error::RecvError; use tokio_util::sync::CancellationToken; - -use crate::context::RequestContext; -use crate::pgdatadir_mapping::CalculateLogicalSizeError; - -use super::{GcError, LogicalSizeCalculationCause, Tenant}; -use crate::tenant::{MaybeOffloaded, Timeline}; +use tracing::*; use utils::id::TimelineId; use utils::lsn::Lsn; -use tracing::*; - -use tenant_size_model::{Segment, StorageModel}; +use super::{GcError, LogicalSizeCalculationCause, Tenant}; +use crate::context::RequestContext; +use crate::pgdatadir_mapping::CalculateLogicalSizeError; +use crate::tenant::{MaybeOffloaded, Timeline}; /// Inputs to the actual tenant sizing model /// @@ -498,7 +495,9 @@ async fn fill_logical_sizes( } Err(join_error) => { // cannot really do anything, as this panic is likely a bug - error!("task that calls spawn_ondemand_logical_size_calculation panicked: {join_error:#}"); + error!( + "task that calls spawn_ondemand_logical_size_calculation panicked: {join_error:#}" + ); have_any_error = Some(CalculateSyntheticSizeError::Fatal( anyhow::anyhow!(join_error) diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index f9f843ef6b..7f313f46a2 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -10,42 +10,39 @@ mod layer_desc; mod layer_name; pub mod merge_iterator; -use crate::config::PageServerConf; -use crate::context::{AccessStatsBehavior, RequestContext}; -use bytes::Bytes; -use futures::stream::FuturesUnordered; -use futures::StreamExt; -use pageserver_api::key::Key; -use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum}; -use pageserver_api::record::NeonWalRecord; -use pageserver_api::value::Value; use std::cmp::Ordering; use std::collections::hash_map::Entry; use std::collections::{BinaryHeap, HashMap}; use std::future::Future; use std::ops::Range; use std::pin::Pin; -use std::sync::atomic::AtomicUsize; use std::sync::Arc; +use std::sync::atomic::AtomicUsize; use std::time::{Duration, SystemTime, UNIX_EPOCH}; -use tracing::{trace, Instrument}; -use utils::sync::gate::GateGuard; - -use utils::lsn::Lsn; pub use batch_split_writer::{BatchLayerWriter, SplitDeltaLayerWriter, SplitImageLayerWriter}; +use bytes::Bytes; pub use delta_layer::{DeltaLayer, DeltaLayerWriter, ValueRef}; +use futures::StreamExt; +use futures::stream::FuturesUnordered; 
pub use image_layer::{ImageLayer, ImageLayerWriter}; pub use inmemory_layer::InMemoryLayer; +pub(crate) use layer::{EvictionError, Layer, ResidentLayer}; pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey}; pub use layer_name::{DeltaLayerName, ImageLayerName, LayerName}; - -pub(crate) use layer::{EvictionError, Layer, ResidentLayer}; +use pageserver_api::key::Key; +use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum}; +use pageserver_api::record::NeonWalRecord; +use pageserver_api::value::Value; +use tracing::{Instrument, trace}; +use utils::lsn::Lsn; +use utils::sync::gate::GateGuard; use self::inmemory_layer::InMemoryLayerFileId; - -use super::timeline::{GetVectoredError, ReadPath}; use super::PageReconstructError; +use super::timeline::{GetVectoredError, ReadPath}; +use crate::config::PageServerConf; +use crate::context::{AccessStatsBehavior, RequestContext}; pub fn range_overlaps(a: &Range, b: &Range) -> bool where @@ -510,6 +507,7 @@ impl IoConcurrency { #[cfg(test)] pub(crate) fn spawn_for_test() -> impl std::ops::DerefMut { use std::ops::{Deref, DerefMut}; + use tracing::info; use utils::sync::gate::Gate; diff --git a/pageserver/src/tenant/storage_layer/batch_split_writer.rs b/pageserver/src/tenant/storage_layer/batch_split_writer.rs index 7da51c27df..fd50e4805d 100644 --- a/pageserver/src/tenant/storage_layer/batch_split_writer.rs +++ b/pageserver/src/tenant/storage_layer/batch_split_writer.rs @@ -1,17 +1,22 @@ -use std::{future::Future, ops::Range, sync::Arc}; +use std::future::Future; +use std::ops::Range; +use std::sync::Arc; use bytes::Bytes; -use pageserver_api::key::{Key, KEY_SIZE}; -use utils::{id::TimelineId, lsn::Lsn, shard::TenantShardId}; - -use crate::tenant::storage_layer::Layer; -use crate::{config::PageServerConf, context::RequestContext, tenant::Timeline}; +use pageserver_api::key::{KEY_SIZE, Key}; use pageserver_api::value::Value; +use utils::id::TimelineId; +use utils::lsn::Lsn; +use utils::shard::TenantShardId; use super::layer::S3_UPLOAD_LIMIT; use super::{ DeltaLayerWriter, ImageLayerWriter, PersistentLayerDesc, PersistentLayerKey, ResidentLayer, }; +use crate::config::PageServerConf; +use crate::context::RequestContext; +use crate::tenant::Timeline; +use crate::tenant::storage_layer::Layer; pub(crate) enum BatchWriterResult { Produced(ResidentLayer), @@ -423,15 +428,10 @@ mod tests { use itertools::Itertools; use rand::{RngCore, SeedableRng}; - use crate::{ - tenant::{ - harness::{TenantHarness, TIMELINE_ID}, - storage_layer::AsLayerDesc, - }, - DEFAULT_PG_VERSION, - }; - use super::*; + use crate::DEFAULT_PG_VERSION; + use crate::tenant::harness::{TIMELINE_ID, TenantHarness}; + use crate::tenant::storage_layer::AsLayerDesc; fn get_key(id: u32) -> Key { let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap(); diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 7ba0e3679f..d9afdc2405 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -27,6 +27,38 @@ //! "values" part. The actual page images and WAL records are stored in the //! "values" part. //! 
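The rewritten `use` blocks throughout this patch, including the delta_layer.rs hunk just below, converge on one layout: a group for `std`, a group for external crates, and a group for `super::`/`crate::` paths, flattened to one `use` per module path and sorted ASCII-first (which is why `KEY_SIZE` now precedes `Key`). This is consistent with rustfmt's import-grouping options, though the patch itself does not name them. A minimal sketch of that layout, using only `std` plus a local module so it compiles on its own; the module and names are illustrative, not taken from the pageserver:

// Group 1: standard library, one `use` per module path.
use std::collections::HashMap;
use std::sync::Arc;

// Group 2 would hold external crates (anyhow, tracing, ...); omitted here so
// the sketch compiles without a Cargo manifest.

// Group 3: crate-local paths; `super::` items sort ahead of `crate::` items.
use crate::util::label;

mod util {
    pub fn label() -> &'static str {
        "crate-local (super::, then crate::)"
    }
}

fn main() {
    // Touch both std imports so the sketch runs as-is.
    let groups: Arc<HashMap<&str, &str>> = Arc::new(HashMap::from([
        ("group 1", "std"),
        ("group 2", "external crates"),
        ("group 3", label()),
    ]));
    for (name, contents) in groups.iter() {
        println!("{name}: {contents}");
    }
}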
+use std::collections::{HashMap, VecDeque}; +use std::fs::File; +use std::io::SeekFrom; +use std::ops::Range; +use std::os::unix::fs::FileExt; +use std::str::FromStr; +use std::sync::Arc; + +use anyhow::{Context, Result, bail, ensure}; +use camino::{Utf8Path, Utf8PathBuf}; +use futures::StreamExt; +use itertools::Itertools; +use pageserver_api::config::MaxVectoredReadBytes; +use pageserver_api::key::{DBDIR_KEY, KEY_SIZE, Key}; +use pageserver_api::keyspace::KeySpace; +use pageserver_api::models::ImageCompressionAlgorithm; +use pageserver_api::shard::TenantShardId; +use pageserver_api::value::Value; +use rand::Rng; +use rand::distributions::Alphanumeric; +use serde::{Deserialize, Serialize}; +use tokio::sync::OnceCell; +use tokio_epoll_uring::IoBuf; +use tracing::*; +use utils::bin_ser::BeSer; +use utils::id::{TenantId, TimelineId}; +use utils::lsn::Lsn; + +use super::{ + AsLayerDesc, LayerName, OnDiskValue, OnDiskValueIo, PersistentLayerDesc, ResidentLayer, + ValuesReconstructState, +}; use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; use crate::page_cache::{self, FileId, PAGE_SZ}; @@ -42,43 +74,8 @@ use crate::tenant::vectored_blob_io::{ VectoredReadPlanner, }; use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt}; -use crate::virtual_file::IoBufferMut; -use crate::virtual_file::{self, MaybeFatalIo, VirtualFile}; -use crate::TEMP_FILE_SUFFIX; -use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION}; -use anyhow::{bail, ensure, Context, Result}; -use camino::{Utf8Path, Utf8PathBuf}; -use futures::StreamExt; -use itertools::Itertools; -use pageserver_api::config::MaxVectoredReadBytes; -use pageserver_api::key::{Key, DBDIR_KEY, KEY_SIZE}; -use pageserver_api::keyspace::KeySpace; -use pageserver_api::models::ImageCompressionAlgorithm; -use pageserver_api::shard::TenantShardId; -use pageserver_api::value::Value; -use rand::{distributions::Alphanumeric, Rng}; -use serde::{Deserialize, Serialize}; -use std::collections::{HashMap, VecDeque}; -use std::fs::File; -use std::io::SeekFrom; -use std::ops::Range; -use std::os::unix::fs::FileExt; -use std::str::FromStr; -use std::sync::Arc; -use tokio::sync::OnceCell; -use tokio_epoll_uring::IoBuf; -use tracing::*; - -use utils::{ - bin_ser::BeSer, - id::{TenantId, TimelineId}, - lsn::Lsn, -}; - -use super::{ - AsLayerDesc, LayerName, OnDiskValue, OnDiskValueIo, PersistentLayerDesc, ResidentLayer, - ValuesReconstructState, -}; +use crate::virtual_file::{self, IoBufferMut, MaybeFatalIo, VirtualFile}; +use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX}; /// /// Header stored in the beginning of the file @@ -1130,10 +1127,11 @@ impl DeltaLayerInner { until: Lsn, ctx: &RequestContext, ) -> anyhow::Result { + use futures::stream::TryStreamExt; + use crate::tenant::vectored_blob_io::{ BlobMeta, ChunkedVectoredReadBuilder, VectoredReadExtended, }; - use futures::stream::TryStreamExt; #[derive(Debug)] enum Item { @@ -1599,23 +1597,21 @@ impl DeltaLayerIterator<'_> { pub(crate) mod test { use std::collections::BTreeMap; + use bytes::Bytes; use itertools::MinMaxResult; - use rand::prelude::{SeedableRng, SliceRandom, StdRng}; + use pageserver_api::value::Value; use rand::RngCore; + use rand::prelude::{SeedableRng, SliceRandom, StdRng}; use super::*; - use crate::tenant::harness::TIMELINE_ID; + use crate::DEFAULT_PG_VERSION; + use crate::context::DownloadBehavior; + use crate::task_mgr::TaskKind; + use crate::tenant::disk_btree::tests::TestDisk; + use 
crate::tenant::harness::{TIMELINE_ID, TenantHarness}; use crate::tenant::storage_layer::{Layer, ResidentLayer}; use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner; use crate::tenant::{Tenant, Timeline}; - use crate::{ - context::DownloadBehavior, - task_mgr::TaskKind, - tenant::{disk_btree::tests::TestDisk, harness::TenantHarness}, - DEFAULT_PG_VERSION, - }; - use bytes::Bytes; - use pageserver_api::value::Value; /// Construct an index for a fictional delta layer and and then /// traverse in order to plan vectored reads for a query. Finally, diff --git a/pageserver/src/tenant/storage_layer/filter_iterator.rs b/pageserver/src/tenant/storage_layer/filter_iterator.rs index 8660be1fcc..8d172a1c19 100644 --- a/pageserver/src/tenant/storage_layer/filter_iterator.rs +++ b/pageserver/src/tenant/storage_layer/filter_iterator.rs @@ -1,18 +1,14 @@ -use std::{ops::Range, sync::Arc}; +use std::ops::Range; +use std::sync::Arc; use anyhow::bail; -use pageserver_api::{ - key::Key, - keyspace::{KeySpace, SparseKeySpace}, -}; +use pageserver_api::key::Key; +use pageserver_api::keyspace::{KeySpace, SparseKeySpace}; +use pageserver_api::value::Value; use utils::lsn::Lsn; -use pageserver_api::value::Value; - -use super::{ - merge_iterator::{MergeIterator, MergeIteratorItem}, - PersistentLayerKey, -}; +use super::PersistentLayerKey; +use super::merge_iterator::{MergeIterator, MergeIteratorItem}; /// A filter iterator over merge iterators (and can be easily extended to other types of iterators). /// @@ -98,19 +94,14 @@ impl<'a> FilterIterator<'a> { #[cfg(test)] mod tests { - use super::*; - use itertools::Itertools; use pageserver_api::key::Key; use utils::lsn::Lsn; - use crate::{ - tenant::{ - harness::{TenantHarness, TIMELINE_ID}, - storage_layer::delta_layer::test::produce_delta_layer, - }, - DEFAULT_PG_VERSION, - }; + use super::*; + use crate::DEFAULT_PG_VERSION; + use crate::tenant::harness::{TIMELINE_ID, TenantHarness}; + use crate::tenant::storage_layer::delta_layer::test::produce_delta_layer; async fn assert_filter_iter_equal( filter_iter: &mut FilterIterator<'_>, diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index dc611bd6e1..0db9e8c845 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -25,6 +25,39 @@ //! layer, and offsets to the other parts. The "index" is a B-tree, //! mapping from Key to an offset in the "values" part. The //! actual page images are stored in the "values" part. 
+use std::collections::{HashMap, VecDeque}; +use std::fs::File; +use std::io::SeekFrom; +use std::ops::Range; +use std::os::unix::prelude::FileExt; +use std::str::FromStr; +use std::sync::Arc; + +use anyhow::{Context, Result, bail, ensure}; +use bytes::Bytes; +use camino::{Utf8Path, Utf8PathBuf}; +use hex; +use itertools::Itertools; +use pageserver_api::config::MaxVectoredReadBytes; +use pageserver_api::key::{DBDIR_KEY, KEY_SIZE, Key}; +use pageserver_api::keyspace::KeySpace; +use pageserver_api::shard::{ShardIdentity, TenantShardId}; +use pageserver_api::value::Value; +use rand::Rng; +use rand::distributions::Alphanumeric; +use serde::{Deserialize, Serialize}; +use tokio::sync::OnceCell; +use tokio_stream::StreamExt; +use tracing::*; +use utils::bin_ser::BeSer; +use utils::id::{TenantId, TimelineId}; +use utils::lsn::Lsn; + +use super::layer_name::ImageLayerName; +use super::{ + AsLayerDesc, LayerName, OnDiskValue, OnDiskValueIo, PersistentLayerDesc, ResidentLayer, + ValuesReconstructState, +}; use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; use crate::page_cache::{self, FileId, PAGE_SZ}; @@ -39,43 +72,8 @@ use crate::tenant::vectored_blob_io::{ VectoredReadPlanner, }; use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt; -use crate::virtual_file::IoBufferMut; -use crate::virtual_file::{self, MaybeFatalIo, VirtualFile}; +use crate::virtual_file::{self, IoBufferMut, MaybeFatalIo, VirtualFile}; use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX}; -use anyhow::{bail, ensure, Context, Result}; -use bytes::Bytes; -use camino::{Utf8Path, Utf8PathBuf}; -use hex; -use itertools::Itertools; -use pageserver_api::config::MaxVectoredReadBytes; -use pageserver_api::key::{Key, DBDIR_KEY, KEY_SIZE}; -use pageserver_api::keyspace::KeySpace; -use pageserver_api::shard::{ShardIdentity, TenantShardId}; -use pageserver_api::value::Value; -use rand::{distributions::Alphanumeric, Rng}; -use serde::{Deserialize, Serialize}; -use std::collections::{HashMap, VecDeque}; -use std::fs::File; -use std::io::SeekFrom; -use std::ops::Range; -use std::os::unix::prelude::FileExt; -use std::str::FromStr; -use std::sync::Arc; -use tokio::sync::OnceCell; -use tokio_stream::StreamExt; -use tracing::*; - -use utils::{ - bin_ser::BeSer, - id::{TenantId, TimelineId}, - lsn::Lsn, -}; - -use super::layer_name::ImageLayerName; -use super::{ - AsLayerDesc, LayerName, OnDiskValue, OnDiskValueIo, PersistentLayerDesc, ResidentLayer, - ValuesReconstructState, -}; /// /// Header stored in the beginning of the file @@ -1135,34 +1133,26 @@ impl ImageLayerIterator<'_> { #[cfg(test)] mod test { - use std::{sync::Arc, time::Duration}; + use std::sync::Arc; + use std::time::Duration; use bytes::Bytes; use itertools::Itertools; - use pageserver_api::{ - key::Key, - shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize}, - value::Value, - }; - use utils::{ - generation::Generation, - id::{TenantId, TimelineId}, - lsn::Lsn, - }; - - use crate::{ - context::RequestContext, - tenant::{ - config::TenantConf, - harness::{TenantHarness, TIMELINE_ID}, - storage_layer::{Layer, ResidentLayer}, - vectored_blob_io::StreamingVectoredReadPlanner, - Tenant, Timeline, - }, - DEFAULT_PG_VERSION, - }; + use pageserver_api::key::Key; + use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize}; + use pageserver_api::value::Value; + use utils::generation::Generation; + use utils::id::{TenantId, TimelineId}; + use utils::lsn::Lsn; use 
super::{ImageLayerIterator, ImageLayerWriter}; + use crate::DEFAULT_PG_VERSION; + use crate::context::RequestContext; + use crate::tenant::config::TenantConf; + use crate::tenant::harness::{TIMELINE_ID, TenantHarness}; + use crate::tenant::storage_layer::{Layer, ResidentLayer}; + use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner; + use crate::tenant::{Tenant, Timeline}; #[tokio::test] async fn image_layer_rewrite() { @@ -1172,10 +1162,10 @@ mod test { ..TenantConf::default() }; let tenant_id = TenantId::generate(); - let mut gen = Generation::new(0xdead0001); + let mut gen_ = Generation::new(0xdead0001); let mut get_next_gen = || { - let ret = gen; - gen = gen.next(); + let ret = gen_; + gen_ = gen_.next(); ret }; // The LSN at which we will create an image layer to filter diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 61a0fdea8c..ffdfe1dc27 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -4,38 +4,39 @@ //! held in an ephemeral file, not in memory. The metadata for each page version, i.e. //! its position in the file, is kept in memory, though. //! -use crate::assert_u64_eq_usize::{u64_to_usize, U64IsUsize, UsizeIsU64}; +use std::cmp::Ordering; +use std::collections::{BTreeMap, HashMap}; +use std::fmt::Write; +use std::ops::Range; +use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering as AtomicOrdering}; +use std::sync::{Arc, OnceLock}; +use std::time::Instant; + +use anyhow::Result; +use camino::Utf8PathBuf; +use pageserver_api::key::{CompactKey, Key}; +use pageserver_api::keyspace::KeySpace; +use pageserver_api::models::InMemoryLayerInfo; +use pageserver_api::shard::TenantShardId; +use tokio::sync::RwLock; +use tracing::*; +use utils::id::TimelineId; +use utils::lsn::Lsn; +use utils::vec_map::VecMap; +use wal_decoder::serialized_batch::{SerializedValueBatch, SerializedValueMeta, ValueMeta}; + +use super::{DeltaLayerWriter, PersistentLayerDesc, ValuesReconstructState}; +use crate::assert_u64_eq_usize::{U64IsUsize, UsizeIsU64, u64_to_usize}; use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; +// avoid binding to Write (conflicts with std::io::Write) +// while being able to use std::fmt::Write's methods +use crate::metrics::TIMELINE_EPHEMERAL_BYTES; use crate::tenant::ephemeral_file::EphemeralFile; use crate::tenant::storage_layer::{OnDiskValue, OnDiskValueIo}; use crate::tenant::timeline::GetVectoredError; use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt; use crate::{l0_flush, page_cache}; -use anyhow::Result; -use camino::Utf8PathBuf; -use pageserver_api::key::CompactKey; -use pageserver_api::key::Key; -use pageserver_api::keyspace::KeySpace; -use pageserver_api::models::InMemoryLayerInfo; -use pageserver_api::shard::TenantShardId; -use std::collections::{BTreeMap, HashMap}; -use std::sync::{Arc, OnceLock}; -use std::time::Instant; -use tracing::*; -use utils::{id::TimelineId, lsn::Lsn, vec_map::VecMap}; -use wal_decoder::serialized_batch::{SerializedValueBatch, SerializedValueMeta, ValueMeta}; -// avoid binding to Write (conflicts with std::io::Write) -// while being able to use std::fmt::Write's methods -use crate::metrics::TIMELINE_EPHEMERAL_BYTES; -use std::cmp::Ordering; -use std::fmt::Write; -use std::ops::Range; -use std::sync::atomic::Ordering as AtomicOrdering; -use std::sync::atomic::{AtomicU64, AtomicUsize}; -use 
tokio::sync::RwLock; - -use super::{DeltaLayerWriter, PersistentLayerDesc, ValuesReconstructState}; pub(crate) mod vectored_dio_read; @@ -555,7 +556,9 @@ impl InMemoryLayer { gate: &utils::sync::gate::Gate, ctx: &RequestContext, ) -> Result { - trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}"); + trace!( + "initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}" + ); let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id, gate, ctx).await?; let key = InMemoryLayerFileId(file.page_cache_file_id()); @@ -816,8 +819,7 @@ mod tests { #[test] fn test_index_entry() { const MAX_SUPPORTED_POS: usize = IndexEntry::MAX_SUPPORTED_POS; - use IndexEntryNewArgs as Args; - use IndexEntryUnpacked as Unpacked; + use {IndexEntryNewArgs as Args, IndexEntryUnpacked as Unpacked}; let roundtrip = |args, expect: Unpacked| { let res = IndexEntry::new(args).expect("this tests expects no errors"); diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs b/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs index 1d86015fab..90455fd0ca 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs @@ -1,16 +1,13 @@ -use std::{ - collections::BTreeMap, - sync::{Arc, RwLock}, -}; +use std::collections::BTreeMap; +use std::sync::{Arc, RwLock}; use itertools::Itertools; use tokio_epoll_uring::{BoundedBuf, IoBufMut, Slice}; -use crate::{ - assert_u64_eq_usize::{U64IsUsize, UsizeIsU64}, - context::RequestContext, - virtual_file::{owned_buffers_io::io_buf_aligned::IoBufAlignedMut, IoBufferMut}, -}; +use crate::assert_u64_eq_usize::{U64IsUsize, UsizeIsU64}; +use crate::context::RequestContext; +use crate::virtual_file::IoBufferMut; +use crate::virtual_file::owned_buffers_io::io_buf_aligned::IoBufAlignedMut; /// The file interface we require. At runtime, this is a [`crate::tenant::ephemeral_file::EphemeralFile`]. 
pub trait File: Send { @@ -132,7 +129,9 @@ where let req_len = match cur { LogicalReadState::NotStarted(buf) => { if buf.len() != 0 { - panic!("The `LogicalRead`s that are passed in must be freshly created using `LogicalRead::new`"); + panic!( + "The `LogicalRead`s that are passed in must be freshly created using `LogicalRead::new`" + ); } // buf.cap() == 0 is ok @@ -141,7 +140,9 @@ where *state = LogicalReadState::Ongoing(buf); req_len } - x => panic!("must only call with fresh LogicalReads, got another state, leaving Undefined state behind state={x:?}"), + x => panic!( + "must only call with fresh LogicalReads, got another state, leaving Undefined state behind state={x:?}" + ), }; // plan which chunks we need to read from @@ -422,15 +423,15 @@ impl Buffer for Vec { #[cfg(test)] #[allow(clippy::assertions_on_constants)] mod tests { + use std::cell::RefCell; + use std::collections::VecDeque; + use rand::Rng; - use crate::{ - context::DownloadBehavior, task_mgr::TaskKind, - virtual_file::owned_buffers_io::slice::SliceMutExt, - }; - use super::*; - use std::{cell::RefCell, collections::VecDeque}; + use crate::context::DownloadBehavior; + use crate::task_mgr::TaskKind; + use crate::virtual_file::owned_buffers_io::slice::SliceMutExt; struct InMemoryFile { content: Vec, diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 0bf606cf0a..ae06aca63b 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -1,32 +1,32 @@ +use std::ops::Range; +use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; +use std::sync::{Arc, Weak}; +use std::time::{Duration, SystemTime}; + use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::HistoricLayerInfo; use pageserver_api::shard::{ShardIdentity, ShardIndex, TenantShardId}; -use std::ops::Range; -use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; -use std::sync::{Arc, Weak}; -use std::time::{Duration, SystemTime}; use tracing::Instrument; +use utils::generation::Generation; use utils::id::TimelineId; use utils::lsn::Lsn; use utils::sync::{gate, heavier_once_cell}; -use crate::config::PageServerConf; -use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder}; -use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; -use crate::task_mgr::TaskKind; -use crate::tenant::timeline::{CompactionError, GetVectoredError}; -use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline}; - use super::delta_layer::{self}; use super::image_layer::{self}; use super::{ AsLayerDesc, ImageLayerWriter, LayerAccessStats, LayerAccessStatsReset, LayerName, LayerVisibilityHint, PersistentLayerDesc, ValuesReconstructState, }; - -use utils::generation::Generation; +use crate::config::PageServerConf; +use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder}; +use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; +use crate::task_mgr::TaskKind; +use crate::tenant::Timeline; +use crate::tenant::remote_timeline_client::LayerFileMetadata; +use crate::tenant::timeline::{CompactionError, GetVectoredError}; #[cfg(test)] mod tests; @@ -1873,8 +1873,8 @@ impl ResidentLayer { self.owner.record_access(ctx); let res = match inner { - Delta(ref d) => delta_layer::DeltaLayerInner::load_keys(d, ctx).await, - Image(ref i) => image_layer::ImageLayerInner::load_keys(i, ctx).await, + Delta(d) => 
delta_layer::DeltaLayerInner::load_keys(d, ctx).await, + Image(i) => image_layer::ImageLayerInner::load_keys(i, ctx).await, }; res.with_context(|| format!("Layer index is corrupted for {self}")) } @@ -1920,7 +1920,7 @@ impl ResidentLayer { let owner = &self.owner.0; match self.downloaded.get(owner, ctx).await? { - Delta(ref d) => d + Delta(d) => d .copy_prefix(writer, until, ctx) .await .with_context(|| format!("copy_delta_prefix until {until} of {self}")), @@ -1943,7 +1943,7 @@ impl ResidentLayer { ) -> anyhow::Result<&delta_layer::DeltaLayerInner> { use LayerKind::*; match self.downloaded.get(&self.owner.0, ctx).await? { - Delta(ref d) => Ok(d), + Delta(d) => Ok(d), Image(_) => Err(anyhow::anyhow!("image layer")), } } @@ -1955,7 +1955,7 @@ impl ResidentLayer { ) -> anyhow::Result<&image_layer::ImageLayerInner> { use LayerKind::*; match self.downloaded.get(&self.owner.0, ctx).await? { - Image(ref d) => Ok(d), + Image(d) => Ok(d), Delta(_) => Err(anyhow::anyhow!("delta layer")), } } diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs index d93c378ffc..724150d27f 100644 --- a/pageserver/src/tenant/storage_layer/layer/tests.rs +++ b/pageserver/src/tenant/storage_layer/layer/tests.rs @@ -1,22 +1,16 @@ use std::time::UNIX_EPOCH; -use pageserver_api::key::{Key, CONTROLFILE_KEY}; +use pageserver_api::key::{CONTROLFILE_KEY, Key}; use tokio::task::JoinSet; -use utils::{ - completion::{self, Completion}, - id::TimelineId, -}; +use utils::completion::{self, Completion}; +use utils::id::TimelineId; use super::failpoints::{Failpoint, FailpointKind}; use super::*; -use crate::{ - context::DownloadBehavior, - tenant::{ - harness::test_img, - storage_layer::{IoConcurrency, LayerVisibilityHint}, - }, -}; -use crate::{task_mgr::TaskKind, tenant::harness::TenantHarness}; +use crate::context::DownloadBehavior; +use crate::task_mgr::TaskKind; +use crate::tenant::harness::{TenantHarness, test_img}; +use crate::tenant::storage_layer::{IoConcurrency, LayerVisibilityHint}; /// Used in tests to advance a future to wanted await point, and not futher. const ADVANCE: std::time::Duration = std::time::Duration::from_secs(3600); @@ -771,10 +765,12 @@ async fn evict_and_wait_does_not_wait_for_download() { let (arrival, _download_arrived) = utils::completion::channel(); layer.enable_failpoint(Failpoint::WaitBeforeDownloading(Some(arrival), barrier)); - let mut download = std::pin::pin!(layer - .0 - .get_or_maybe_download(true, None) - .instrument(download_span)); + let mut download = std::pin::pin!( + layer + .0 + .get_or_maybe_download(true, None) + .instrument(download_span) + ); assert!( !layer.is_likely_resident(), diff --git a/pageserver/src/tenant/storage_layer/layer_desc.rs b/pageserver/src/tenant/storage_layer/layer_desc.rs index 2097e90764..ed16dcaa0d 100644 --- a/pageserver/src/tenant/storage_layer/layer_desc.rs +++ b/pageserver/src/tenant/storage_layer/layer_desc.rs @@ -1,16 +1,15 @@ use core::fmt::Display; -use pageserver_api::shard::TenantShardId; use std::ops::Range; -use utils::{id::TimelineId, lsn::Lsn}; use pageserver_api::key::Key; - -use super::{DeltaLayerName, ImageLayerName, LayerName}; - +use pageserver_api::shard::TenantShardId; use serde::{Deserialize, Serialize}; - #[cfg(test)] use utils::id::TenantId; +use utils::id::TimelineId; +use utils::lsn::Lsn; + +use super::{DeltaLayerName, ImageLayerName, LayerName}; /// A unique identifier of a persistent layer. 
/// diff --git a/pageserver/src/tenant/storage_layer/layer_name.rs b/pageserver/src/tenant/storage_layer/layer_name.rs index addf3b85d9..0f7995f87b 100644 --- a/pageserver/src/tenant/storage_layer/layer_name.rs +++ b/pageserver/src/tenant/storage_layer/layer_name.rs @@ -1,12 +1,12 @@ //! //! Helper functions for dealing with filenames of the image and delta layer files. //! -use pageserver_api::key::Key; use std::cmp::Ordering; use std::fmt; use std::ops::Range; use std::str::FromStr; +use pageserver_api::key::Key; use utils::lsn::Lsn; use super::PersistentLayerDesc; @@ -305,7 +305,7 @@ impl FromStr for LayerName { (None, None) => { return Err(format!( "neither delta nor image layer file name: {value:?}" - )) + )); } (Some(delta), None) => Self::Delta(delta), (None, Some(image)) => Self::Image(image), diff --git a/pageserver/src/tenant/storage_layer/merge_iterator.rs b/pageserver/src/tenant/storage_layer/merge_iterator.rs index 19cfcb0867..76cdddd06a 100644 --- a/pageserver/src/tenant/storage_layer/merge_iterator.rs +++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs @@ -1,21 +1,16 @@ -use std::{ - cmp::Ordering, - collections::{binary_heap, BinaryHeap}, - sync::Arc, -}; +use std::cmp::Ordering; +use std::collections::{BinaryHeap, binary_heap}; +use std::sync::Arc; use anyhow::bail; use pageserver_api::key::Key; +use pageserver_api::value::Value; use utils::lsn::Lsn; +use super::delta_layer::{DeltaLayerInner, DeltaLayerIterator}; +use super::image_layer::{ImageLayerInner, ImageLayerIterator}; +use super::{PersistentLayerDesc, PersistentLayerKey}; use crate::context::RequestContext; -use pageserver_api::value::Value; - -use super::{ - delta_layer::{DeltaLayerInner, DeltaLayerIterator}, - image_layer::{ImageLayerInner, ImageLayerIterator}, - PersistentLayerDesc, PersistentLayerKey, -}; #[derive(Clone, Copy)] pub(crate) enum LayerRef<'a> { @@ -349,24 +344,18 @@ impl<'a> MergeIterator<'a> { #[cfg(test)] mod tests { - use super::*; - use itertools::Itertools; use pageserver_api::key::Key; - use utils::lsn::Lsn; - - use crate::{ - tenant::{ - harness::{TenantHarness, TIMELINE_ID}, - storage_layer::delta_layer::test::{produce_delta_layer, sort_delta}, - }, - DEFAULT_PG_VERSION, - }; - - #[cfg(feature = "testing")] - use crate::tenant::storage_layer::delta_layer::test::sort_delta_value; #[cfg(feature = "testing")] use pageserver_api::record::NeonWalRecord; + use utils::lsn::Lsn; + + use super::*; + use crate::DEFAULT_PG_VERSION; + use crate::tenant::harness::{TIMELINE_ID, TenantHarness}; + #[cfg(feature = "testing")] + use crate::tenant::storage_layer::delta_layer::test::sort_delta_value; + use crate::tenant::storage_layer::delta_layer::test::{produce_delta_layer, sort_delta}; async fn assert_merge_iter_equal( merge_iter: &mut MergeIterator<'_>, diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index b12655b0f3..670f9ad87f 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -8,24 +8,24 @@ use std::sync::Arc; use std::time::{Duration, Instant}; use once_cell::sync::Lazy; +use pageserver_api::config::tenant_conf_defaults::DEFAULT_COMPACTION_PERIOD; use rand::Rng; use scopeguard::defer; use tokio::sync::{Semaphore, SemaphorePermit}; use tokio_util::sync::CancellationToken; use tracing::*; - -use crate::context::{DownloadBehavior, RequestContext}; -use crate::metrics::{self, BackgroundLoopSemaphoreMetricsRecorder, TENANT_TASK_EVENTS}; -use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME, TOKIO_WORKER_THREADS}; -use 
crate::tenant::throttle::Stats; -use crate::tenant::timeline::compaction::CompactionOutcome; -use crate::tenant::timeline::CompactionError; -use crate::tenant::{Tenant, TenantState}; -use pageserver_api::config::tenant_conf_defaults::DEFAULT_COMPACTION_PERIOD; use utils::backoff::exponential_backoff_duration; use utils::completion::Barrier; use utils::pausable_failpoint; +use crate::context::{DownloadBehavior, RequestContext}; +use crate::metrics::{self, BackgroundLoopSemaphoreMetricsRecorder, TENANT_TASK_EVENTS}; +use crate::task_mgr::{self, BACKGROUND_RUNTIME, TOKIO_WORKER_THREADS, TaskKind}; +use crate::tenant::throttle::Stats; +use crate::tenant::timeline::CompactionError; +use crate::tenant::timeline::compaction::CompactionOutcome; +use crate::tenant::{Tenant, TenantState}; + /// Semaphore limiting concurrent background tasks (across all tenants). /// /// We use 3/4 Tokio threads, to avoid blocking all threads in case we do any CPU-heavy work. @@ -287,11 +287,12 @@ fn log_compaction_error( sleep_duration: Duration, task_cancelled: bool, ) { - use crate::pgdatadir_mapping::CollectKeySpaceError; - use crate::tenant::upload_queue::NotInitialized; - use crate::tenant::PageReconstructError; use CompactionError::*; + use crate::pgdatadir_mapping::CollectKeySpaceError; + use crate::tenant::PageReconstructError; + use crate::tenant::upload_queue::NotInitialized; + let level = match err { ShuttingDown => return, Offload(_) => Level::ERROR, diff --git a/pageserver/src/tenant/throttle.rs b/pageserver/src/tenant/throttle.rs index 300d779125..6c37c3771b 100644 --- a/pageserver/src/tenant/throttle.rs +++ b/pageserver/src/tenant/throttle.rs @@ -1,10 +1,6 @@ -use std::{ - sync::{ - atomic::{AtomicU64, Ordering}, - Arc, - }, - time::Instant, -}; +use std::sync::Arc; +use std::sync::atomic::{AtomicU64, Ordering}; +use std::time::Instant; use arc_swap::ArcSwap; use utils::leaky_bucket::{LeakyBucketConfig, RateLimiter}; diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index a80d407d54..cbbcf5d358 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -14,55 +14,6 @@ pub mod span; pub mod uninit; mod walreceiver; -use anyhow::{anyhow, bail, ensure, Context, Result}; -use arc_swap::{ArcSwap, ArcSwapOption}; -use bytes::Bytes; -use camino::Utf8Path; -use chrono::{DateTime, Utc}; -use compaction::{CompactionOutcome, GcCompactionCombinedSettings}; -use enumset::EnumSet; -use fail::fail_point; -use futures::FutureExt; -use futures::{stream::FuturesUnordered, StreamExt}; -use handle::ShardTimelineId; -use layer_manager::Shutdown; -use offload::OffloadError; -use once_cell::sync::Lazy; -use pageserver_api::models::PageTraceEvent; -use pageserver_api::{ - key::{ - KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE, - SPARSE_RANGE, - }, - keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning}, - models::{ - CompactKeyRange, CompactLsnRange, CompactionAlgorithm, CompactionAlgorithmSettings, - DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, - InMemoryLayerInfo, LayerMapInfo, LsnLease, TimelineState, - }, - reltag::BlockNumber, - shard::{ShardIdentity, ShardNumber, TenantShardId}, -}; -use rand::Rng; -use remote_storage::DownloadError; -use serde_with::serde_as; -use storage_broker::BrokerClientChannel; -use tokio::runtime::Handle; -use tokio::sync::mpsc::Sender; -use tokio::sync::{oneshot, watch, Notify}; -use tokio_util::sync::CancellationToken; -use tracing::*; -use 
utils::critical; -use utils::rate_limit::RateLimit; -use utils::{ - fs_ext, - guard_arc_swap::GuardArcSwap, - pausable_failpoint, - postgres_client::PostgresClientProtocol, - sync::gate::{Gate, GateGuard}, -}; -use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta}; - use std::array; use std::cmp::{max, min}; use std::collections::btree_map::Entry; @@ -72,74 +23,58 @@ use std::sync::atomic::{AtomicBool, AtomicU64, Ordering as AtomicOrdering}; use std::sync::{Arc, Mutex, OnceLock, RwLock, Weak}; use std::time::{Duration, Instant, SystemTime}; -use crate::l0_flush::{self, L0FlushGlobalState}; -use crate::tenant::storage_layer::ImageLayerName; -use crate::{ - aux_file::AuxFileSizeEstimator, - page_service::TenantManagerTypes, - tenant::{ - config::AttachmentMode, - layer_map::{LayerMap, SearchResult}, - metadata::TimelineMetadata, - storage_layer::{ - inmemory_layer::IndexEntry, BatchLayerWriter, IoConcurrency, PersistentLayerDesc, - ValueReconstructSituation, - }, - }, - walingest::WalLagCooldown, - walredo, -}; -use crate::{ - context::{DownloadBehavior, RequestContext}, - disk_usage_eviction_task::DiskUsageEvictionInfo, - pgdatadir_mapping::CollectKeySpaceError, -}; -use crate::{ - disk_usage_eviction_task::finite_f32, - tenant::storage_layer::{ - AsLayerDesc, DeltaLayerWriter, EvictionError, ImageLayerWriter, InMemoryLayer, Layer, - LayerAccessStatsReset, LayerName, ResidentLayer, ValueReconstructState, - ValuesReconstructState, - }, -}; -use crate::{ - disk_usage_eviction_task::EvictionCandidate, tenant::storage_layer::delta_layer::DeltaEntry, -}; -use crate::{ - metrics::ScanLatencyOngoingRecording, tenant::timeline::logical_size::CurrentLogicalSize, -}; -use crate::{ - pgdatadir_mapping::DirectoryKind, - virtual_file::{MaybeFatalIo, VirtualFile}, -}; -use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind}; -use crate::{pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS, tenant::storage_layer::PersistentLayerKey}; +use anyhow::{Context, Result, anyhow, bail, ensure}; +use arc_swap::{ArcSwap, ArcSwapOption}; +use bytes::Bytes; +use camino::Utf8Path; +use chrono::{DateTime, Utc}; +use compaction::{CompactionOutcome, GcCompactionCombinedSettings}; +use enumset::EnumSet; +use fail::fail_point; +use futures::stream::FuturesUnordered; +use futures::{FutureExt, StreamExt}; +use handle::ShardTimelineId; +use layer_manager::Shutdown; +use offload::OffloadError; +use once_cell::sync::Lazy; use pageserver_api::config::tenant_conf_defaults::DEFAULT_PITR_INTERVAL; - -use crate::config::PageServerConf; -use crate::keyspace::{KeyPartitioning, KeySpace}; -use crate::metrics::{TimelineMetrics, DELTAS_PER_READ_GLOBAL, LAYERS_PER_READ_GLOBAL}; -use crate::pgdatadir_mapping::{CalculateLogicalSizeError, MetricsUpdate}; -use crate::tenant::config::TenantConfOpt; -use pageserver_api::reltag::RelTag; -use pageserver_api::shard::ShardIndex; - -use postgres_connection::PgConnectionConfig; -use postgres_ffi::{to_pg_timestamp, v14::xlog_utils, WAL_SEGMENT_SIZE}; -use utils::{ - completion, - generation::Generation, - id::TimelineId, - lsn::{AtomicLsn, Lsn, RecordLsn}, - seqwait::SeqWait, - simple_rcu::{Rcu, RcuReadGuard}, +use pageserver_api::key::{ + KEY_SIZE, Key, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE, + SPARSE_RANGE, }; - -use crate::task_mgr; -use crate::task_mgr::TaskKind; -use crate::tenant::gc_result::GcResult; -use crate::ZERO_PAGE; -use pageserver_api::key::Key; +use pageserver_api::keyspace::{KeySpaceAccum, KeySpaceRandomAccum, 
SparseKeyPartitioning}; +use pageserver_api::models::{ + CompactKeyRange, CompactLsnRange, CompactionAlgorithm, CompactionAlgorithmSettings, + DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, + InMemoryLayerInfo, LayerMapInfo, LsnLease, PageTraceEvent, TimelineState, +}; +use pageserver_api::reltag::{BlockNumber, RelTag}; +use pageserver_api::shard::{ShardIdentity, ShardIndex, ShardNumber, TenantShardId}; +#[cfg(test)] +use pageserver_api::value::Value; +use postgres_connection::PgConnectionConfig; +use postgres_ffi::v14::xlog_utils; +use postgres_ffi::{WAL_SEGMENT_SIZE, to_pg_timestamp}; +use rand::Rng; +use remote_storage::DownloadError; +use serde_with::serde_as; +use storage_broker::BrokerClientChannel; +use tokio::runtime::Handle; +use tokio::sync::mpsc::Sender; +use tokio::sync::{Notify, oneshot, watch}; +use tokio_util::sync::CancellationToken; +use tracing::*; +use utils::generation::Generation; +use utils::guard_arc_swap::GuardArcSwap; +use utils::id::TimelineId; +use utils::lsn::{AtomicLsn, Lsn, RecordLsn}; +use utils::postgres_client::PostgresClientProtocol; +use utils::rate_limit::RateLimit; +use utils::seqwait::SeqWait; +use utils::simple_rcu::{Rcu, RcuReadGuard}; +use utils::sync::gate::{Gate, GateGuard}; +use utils::{completion, critical, fs_ext, pausable_failpoint}; +use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta}; use self::delete::DeleteTimelineFlow; pub(super) use self::eviction_task::EvictionTaskTenantState; @@ -147,24 +82,48 @@ use self::eviction_task::EvictionTaskTimelineState; use self::layer_manager::LayerManager; use self::logical_size::LogicalSize; use self::walreceiver::{WalReceiver, WalReceiverConf}; - -use super::remote_timeline_client::index::GcCompactionState; +use super::config::TenantConf; +use super::remote_timeline_client::index::{GcCompactionState, IndexPart}; +use super::remote_timeline_client::{RemoteTimelineClient, WaitCompletionError}; +use super::secondary::heatmap::HeatMapLayer; +use super::storage_layer::{LayerFringe, LayerVisibilityHint, ReadableLayer}; +use super::upload_queue::NotInitialized; use super::{ - config::TenantConf, storage_layer::LayerVisibilityHint, upload_queue::NotInitialized, - MaybeOffloaded, + AttachedTenantConf, GcError, HeatMapTimeline, MaybeOffloaded, + debug_assert_current_span_has_tenant_and_timeline_id, }; -use super::{ - debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf, HeatMapTimeline, +use crate::aux_file::AuxFileSizeEstimator; +use crate::config::PageServerConf; +use crate::context::{DownloadBehavior, RequestContext}; +use crate::disk_usage_eviction_task::{DiskUsageEvictionInfo, EvictionCandidate, finite_f32}; +use crate::keyspace::{KeyPartitioning, KeySpace}; +use crate::l0_flush::{self, L0FlushGlobalState}; +use crate::metrics::{ + DELTAS_PER_READ_GLOBAL, LAYERS_PER_READ_GLOBAL, ScanLatencyOngoingRecording, TimelineMetrics, }; -use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe}; -use super::{ - remote_timeline_client::RemoteTimelineClient, remote_timeline_client::WaitCompletionError, - storage_layer::ReadableLayer, +use crate::page_service::TenantManagerTypes; +use crate::pgdatadir_mapping::{ + CalculateLogicalSizeError, CollectKeySpaceError, DirectoryKind, LsnForTimestamp, + MAX_AUX_FILE_V2_DELTAS, MetricsUpdate, }; -use super::{secondary::heatmap::HeatMapLayer, GcError}; - -#[cfg(test)] -use pageserver_api::value::Value; +use crate::task_mgr::TaskKind; +use crate::tenant::config::{AttachmentMode, 
TenantConfOpt}; +use crate::tenant::gc_result::GcResult; +use crate::tenant::layer_map::{LayerMap, SearchResult}; +use crate::tenant::metadata::TimelineMetadata; +use crate::tenant::storage_layer::delta_layer::DeltaEntry; +use crate::tenant::storage_layer::inmemory_layer::IndexEntry; +use crate::tenant::storage_layer::{ + AsLayerDesc, BatchLayerWriter, DeltaLayerWriter, EvictionError, ImageLayerName, + ImageLayerWriter, InMemoryLayer, IoConcurrency, Layer, LayerAccessStatsReset, LayerName, + PersistentLayerDesc, PersistentLayerKey, ResidentLayer, ValueReconstructSituation, + ValueReconstructState, ValuesReconstructState, +}; +use crate::tenant::tasks::BackgroundLoopKind; +use crate::tenant::timeline::logical_size::CurrentLogicalSize; +use crate::virtual_file::{MaybeFatalIo, VirtualFile}; +use crate::walingest::WalLagCooldown; +use crate::{ZERO_PAGE, task_mgr, walredo}; #[derive(Debug, PartialEq, Eq, Clone, Copy)] pub(crate) enum FlushLoopState { @@ -1474,13 +1433,22 @@ impl Timeline { | TaskKind::WalReceiverConnectionHandler | TaskKind::WalReceiverConnectionPoller => { let is_myself = match who_is_waiting { - WaitLsnWaiter::Timeline(waiter) => Weak::ptr_eq(&waiter.myself, &self.myself), - WaitLsnWaiter::Tenant | WaitLsnWaiter::PageService | WaitLsnWaiter::HttpEndpoint => unreachable!("tenant or page_service context are not expected to have task kind {:?}", ctx.task_kind()), + WaitLsnWaiter::Timeline(waiter) => { + Weak::ptr_eq(&waiter.myself, &self.myself) + } + WaitLsnWaiter::Tenant + | WaitLsnWaiter::PageService + | WaitLsnWaiter::HttpEndpoint => unreachable!( + "tenant or page_service context are not expected to have task kind {:?}", + ctx.task_kind() + ), }; if is_myself { if let Err(current) = self.last_record_lsn.would_wait_for(lsn) { // walingest is the only one that can advance last_record_lsn; it should make sure to never reach here - panic!("this timeline's walingest task is calling wait_lsn({lsn}) but we only have last_record_lsn={current}; would deadlock"); + panic!( + "this timeline's walingest task is calling wait_lsn({lsn}) but we only have last_record_lsn={current}; would deadlock" + ); } } else { // if another timeline's is waiting for us, there's no deadlock risk because @@ -1509,12 +1477,12 @@ impl Timeline { drop(_timer); let walreceiver_status = self.walreceiver_status(); Err(WaitLsnError::Timeout(format!( - "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}, WalReceiver status: {}", - lsn, - self.get_last_record_lsn(), - self.get_disk_consistent_lsn(), - walreceiver_status, - ))) + "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}, WalReceiver status: {}", + lsn, + self.get_last_record_lsn(), + self.get_disk_consistent_lsn(), + walreceiver_status, + ))) } } } @@ -1618,10 +1586,18 @@ impl Timeline { if init || validate { let latest_gc_cutoff_lsn = self.get_applied_gc_cutoff_lsn(); if lsn < *latest_gc_cutoff_lsn { - bail!("tried to request an lsn lease for an lsn below the latest gc cutoff. requested at {} gc cutoff {}", lsn, *latest_gc_cutoff_lsn); + bail!( + "tried to request an lsn lease for an lsn below the latest gc cutoff. requested at {} gc cutoff {}", + lsn, + *latest_gc_cutoff_lsn + ); } if lsn < planned_cutoff { - bail!("tried to request an lsn lease for an lsn below the planned gc cutoff. requested at {} planned gc cutoff {}", lsn, planned_cutoff); + bail!( + "tried to request an lsn lease for an lsn below the planned gc cutoff. 
requested at {} planned gc cutoff {}", + lsn, + planned_cutoff + ); } } @@ -1745,7 +1721,9 @@ impl Timeline { // This is not harmful, but it only happens in relatively rare cases where // time-based checkpoints are not happening fast enough to keep the amount of // ephemeral data within configured limits. It's a sign of stress on the system. - tracing::info!("Early-rolling open layer at size {current_size} (limit {size_override}) due to dirty data pressure"); + tracing::info!( + "Early-rolling open layer at size {current_size} (limit {size_override}) due to dirty data pressure" + ); } } @@ -1871,7 +1849,9 @@ impl Timeline { // Last record Lsn could be zero in case the timeline was just created if !last_record_lsn.is_valid() { - warn!("Skipping compaction for potentially just initialized timeline, it has invalid last record lsn: {last_record_lsn}"); + warn!( + "Skipping compaction for potentially just initialized timeline, it has invalid last record lsn: {last_record_lsn}" + ); return Ok(CompactionOutcome::Skipped); } @@ -2033,7 +2013,9 @@ impl Timeline { // `self.remote_client.shutdown().await` above should have already flushed everything from the queue, but // we also do a final check here to ensure that the queue is empty. if !self.remote_client.no_pending_work() { - warn!("still have pending work in remote upload queue, but continuing shutting down anyways"); + warn!( + "still have pending work in remote upload queue, but continuing shutting down anyways" + ); } } } @@ -2042,7 +2024,9 @@ impl Timeline { // drain the upload queue self.remote_client.shutdown().await; if !self.remote_client.no_pending_work() { - warn!("still have pending work in remote upload queue, but continuing shutting down anyways"); + warn!( + "still have pending work in remote upload queue, but continuing shutting down anyways" + ); } } @@ -2946,8 +2930,9 @@ impl Timeline { disk_consistent_lsn: Lsn, index_part: IndexPart, ) -> anyhow::Result<()> { - use init::{Decision::*, Discovered, DismissedLayer}; use LayerName::*; + use init::Decision::*; + use init::{Discovered, DismissedLayer}; let mut guard = self.layers.write().await; @@ -3162,11 +3147,15 @@ impl Timeline { } TimelineState::Loading => { // Import does not return an activated timeline. - info!("discarding priority boost for logical size calculation because timeline is not yet active"); + info!( + "discarding priority boost for logical size calculation because timeline is not yet active" + ); } TimelineState::Active => { // activation should be setting the once cell - warn!("unexpected: cancel_wait_for_background_loop_concurrency_limit_semaphore not set, priority-boosting of logical size calculation will not work"); + warn!( + "unexpected: cancel_wait_for_background_loop_concurrency_limit_semaphore not set, priority-boosting of logical size calculation will not work" + ); debug_assert!(false); } } @@ -4306,10 +4295,14 @@ impl Timeline { // This path is only taken for tenants with multiple shards: single sharded tenants should // never encounter a gap in the wal. 
let old_disk_consistent_lsn = self.disk_consistent_lsn.load(); - tracing::debug!("Advancing disk_consistent_lsn across layer gap {old_disk_consistent_lsn}->{frozen_to_lsn}"); + tracing::debug!( + "Advancing disk_consistent_lsn across layer gap {old_disk_consistent_lsn}->{frozen_to_lsn}" + ); if self.set_disk_consistent_lsn(frozen_to_lsn) { if let Err(e) = self.schedule_uploads(frozen_to_lsn, vec![]) { - tracing::warn!("Failed to schedule metadata upload after updating disk_consistent_lsn: {e}"); + tracing::warn!( + "Failed to schedule metadata upload after updating disk_consistent_lsn: {e}" + ); } } } @@ -4534,7 +4527,10 @@ impl Timeline { /// This function must only be used from the layer flush task. fn set_disk_consistent_lsn(&self, new_value: Lsn) -> bool { let old_value = self.disk_consistent_lsn.fetch_max(new_value); - assert!(new_value >= old_value, "disk_consistent_lsn must be growing monotonously at runtime; current {old_value}, offered {new_value}"); + assert!( + new_value >= old_value, + "disk_consistent_lsn must be growing monotonously at runtime; current {old_value}, offered {new_value}" + ); self.metrics .disk_consistent_lsn_gauge @@ -4829,7 +4825,9 @@ impl Timeline { // any metadata keys, keys, as that would lead to actual data // loss. if img_key.is_rel_fsm_block_key() || img_key.is_rel_vm_block_key() { - warn!("could not reconstruct FSM or VM key {img_key}, filling with zeros: {err:?}"); + warn!( + "could not reconstruct FSM or VM key {img_key}, filling with zeros: {err:?}" + ); ZERO_PAGE.clone() } else { return Err(CreateImageLayersError::from(err)); @@ -4908,7 +4906,8 @@ impl Timeline { let trigger_generation = delta_files_accessed as usize >= MAX_AUX_FILE_V2_DELTAS; info!( - "metadata key compaction: trigger_generation={trigger_generation}, delta_files_accessed={delta_files_accessed}, total_kb_retrieved={total_kb_retrieved}, total_keys_retrieved={total_keys_retrieved}, read_time={}s", elapsed.as_secs_f64() + "metadata key compaction: trigger_generation={trigger_generation}, delta_files_accessed={delta_files_accessed}, total_kb_retrieved={total_kb_retrieved}, total_keys_retrieved={total_keys_retrieved}, read_time={}s", + elapsed.as_secs_f64() ); if !trigger_generation && mode == ImageLayerCreationMode::Try { @@ -5230,7 +5229,8 @@ impl Timeline { if should_yield { tracing::info!( "preempt image layer generation at {lsn} when processing partition {}..{}: too many L0 layers", - partition.start().unwrap(), partition.end().unwrap() + partition.start().unwrap(), + partition.end().unwrap() ); last_partition_processed = Some(partition.clone()); all_generated = false; @@ -5588,7 +5588,9 @@ impl Timeline { // because we have not implemented L0 => L0 compaction. duplicated_layers.insert(l.layer_desc().key()); } else if LayerMap::is_l0(&l.layer_desc().key_range, l.layer_desc().is_delta) { - return Err(CompactionError::Other(anyhow::anyhow!("compaction generates a L0 layer file as output, which will cause infinite compaction."))); + return Err(CompactionError::Other(anyhow::anyhow!( + "compaction generates a L0 layer file as output, which will cause infinite compaction." 
+ ))); } else { insert_layers.push(l.clone()); } @@ -5712,8 +5714,10 @@ impl Timeline { .await { Ok((index_part, index_generation, _index_mtime)) => { - tracing::info!("GC loaded shard zero metadata (gen {index_generation:?}): latest_gc_cutoff_lsn: {}", - index_part.metadata.latest_gc_cutoff_lsn()); + tracing::info!( + "GC loaded shard zero metadata (gen {index_generation:?}): latest_gc_cutoff_lsn: {}", + index_part.metadata.latest_gc_cutoff_lsn() + ); Ok(Some(index_part.metadata.latest_gc_cutoff_lsn())) } Err(DownloadError::NotFound) => { @@ -6122,9 +6126,7 @@ impl Timeline { if let Some((img_lsn, img)) = &data.img { trace!( "found page image for key {} at {}, no WAL redo required, req LSN {}", - key, - img_lsn, - request_lsn, + key, img_lsn, request_lsn, ); Ok(img.clone()) } else { @@ -6153,7 +6155,12 @@ impl Timeline { request_lsn ); } else { - trace!("found {} WAL records that will init the page for {} at {}, performing WAL redo", data.records.len(), key, request_lsn); + trace!( + "found {} WAL records that will init the page for {} at {}, performing WAL redo", + data.records.len(), + key, + request_lsn + ); }; let res = self .walredo_mgr @@ -6697,7 +6704,9 @@ impl TimelineWriter<'_> { if let Some(wait_threshold) = wait_threshold { if l0_count >= wait_threshold { - debug!("layer roll waiting for flush due to compaction backpressure at {l0_count} L0 layers"); + debug!( + "layer roll waiting for flush due to compaction backpressure at {l0_count} L0 layers" + ); self.tl.wait_flush_completion(flush_id).await?; } } @@ -6884,17 +6893,15 @@ mod tests { use pageserver_api::key::Key; use pageserver_api::value::Value; use tracing::Instrument; - use utils::{id::TimelineId, lsn::Lsn}; - - use crate::tenant::{ - harness::{test_img, TenantHarness}, - layer_map::LayerMap, - storage_layer::{Layer, LayerName, LayerVisibilityHint}, - timeline::{DeltaLayerTestDesc, EvictionError}, - PreviousHeatmap, Timeline, - }; + use utils::id::TimelineId; + use utils::lsn::Lsn; use super::HeatMapTimeline; + use crate::tenant::harness::{TenantHarness, test_img}; + use crate::tenant::layer_map::LayerMap; + use crate::tenant::storage_layer::{Layer, LayerName, LayerVisibilityHint}; + use crate::tenant::timeline::{DeltaLayerTestDesc, EvictionError}; + use crate::tenant::{PreviousHeatmap, Timeline}; fn assert_heatmaps_have_same_layers(lhs: &HeatMapTimeline, rhs: &HeatMapTimeline) { assert_eq!(lhs.layers.len(), rhs.layers.len()); diff --git a/pageserver/src/tenant/timeline/analysis.rs b/pageserver/src/tenant/timeline/analysis.rs index 6009b0b79a..96864ec44b 100644 --- a/pageserver/src/tenant/timeline/analysis.rs +++ b/pageserver/src/tenant/timeline/analysis.rs @@ -1,4 +1,5 @@ -use std::{collections::BTreeSet, ops::Range}; +use std::collections::BTreeSet; +use std::ops::Range; use utils::lsn::Lsn; diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index c6ef5165ef..d221bf53d2 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -8,30 +8,35 @@ use std::collections::{BinaryHeap, HashMap, HashSet, VecDeque}; use std::ops::{Deref, Range}; use std::sync::Arc; -use super::layer_manager::LayerManager; -use super::{ - CompactFlags, CompactOptions, CreateImageLayersError, DurationRecorder, GetVectoredError, - ImageLayerCreationMode, LastImageLayerCreationStatus, PageReconstructError, RecordedDuration, - Timeline, -}; - -use anyhow::{anyhow, bail, Context}; +use anyhow::{Context, anyhow, bail}; use bytes::Bytes; use 
enumset::EnumSet; use fail::fail_point; use itertools::Itertools; use once_cell::sync::Lazy; -use pageserver_api::key::KEY_SIZE; -use pageserver_api::keyspace::ShardedRange; +use pageserver_api::config::tenant_conf_defaults::DEFAULT_CHECKPOINT_DISTANCE; +use pageserver_api::key::{KEY_SIZE, Key}; +use pageserver_api::keyspace::{KeySpace, ShardedRange}; use pageserver_api::models::CompactInfoResponse; +use pageserver_api::record::NeonWalRecord; use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId}; +use pageserver_api::value::Value; +use pageserver_compaction::helpers::{fully_contains, overlaps_with}; +use pageserver_compaction::interface::*; use serde::Serialize; use tokio::sync::{OwnedSemaphorePermit, Semaphore}; use tokio_util::sync::CancellationToken; -use tracing::{debug, error, info, info_span, trace, warn, Instrument}; +use tracing::{Instrument, debug, error, info, info_span, trace, warn}; use utils::critical; use utils::id::TimelineId; +use utils::lsn::Lsn; +use super::layer_manager::LayerManager; +use super::{ + CompactFlags, CompactOptions, CompactionError, CreateImageLayersError, DurationRecorder, + GetVectoredError, ImageLayerCreationMode, LastImageLayerCreationStatus, PageReconstructError, + RecordedDuration, Timeline, +}; use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}; use crate::page_cache; use crate::pgdatadir_mapping::CollectKeySpaceError; @@ -39,8 +44,8 @@ use crate::statvfs::Statvfs; use crate::tenant::checks::check_valid_layermap; use crate::tenant::gc_block::GcBlock; use crate::tenant::layer_map::LayerMap; -use crate::tenant::remote_timeline_client::index::GcCompactionState; use crate::tenant::remote_timeline_client::WaitCompletionError; +use crate::tenant::remote_timeline_client::index::GcCompactionState; use crate::tenant::storage_layer::batch_split_writer::{ BatchWriterResult, SplitDeltaLayerWriter, SplitImageLayerWriter, }; @@ -49,24 +54,12 @@ use crate::tenant::storage_layer::merge_iterator::MergeIterator; use crate::tenant::storage_layer::{ AsLayerDesc, PersistentLayerDesc, PersistentLayerKey, ValueReconstructState, }; -use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter}; -use crate::tenant::timeline::{ImageLayerCreationOutcome, IoConcurrency}; -use crate::tenant::timeline::{Layer, ResidentLayer}; -use crate::tenant::{gc_block, DeltaLayer, MaybeOffloaded}; +use crate::tenant::timeline::{ + DeltaLayerWriter, ImageLayerCreationOutcome, ImageLayerWriter, IoConcurrency, Layer, + ResidentLayer, drop_rlock, +}; +use crate::tenant::{DeltaLayer, MaybeOffloaded, gc_block}; use crate::virtual_file::{MaybeFatalIo, VirtualFile}; -use pageserver_api::config::tenant_conf_defaults::DEFAULT_CHECKPOINT_DISTANCE; - -use pageserver_api::key::Key; -use pageserver_api::keyspace::KeySpace; -use pageserver_api::record::NeonWalRecord; -use pageserver_api::value::Value; - -use utils::lsn::Lsn; - -use pageserver_compaction::helpers::{fully_contains, overlaps_with}; -use pageserver_compaction::interface::*; - -use super::CompactionError; /// Maximum number of deltas before generating an image layer in bottom-most compaction. 
const COMPACTION_DELTA_THRESHOLD: usize = 5; @@ -282,8 +275,7 @@ impl GcCompactionQueue { if l2_size == 0 && l1_size >= gc_compaction_initial_threshold_kb * 1024 { info!( "trigger auto-compaction because l1_size={} >= gc_compaction_initial_threshold_kb={}", - l1_size, - gc_compaction_initial_threshold_kb + l1_size, gc_compaction_initial_threshold_kb ); return true; } @@ -294,9 +286,7 @@ impl GcCompactionQueue { if l1_size as f64 / l2_size as f64 >= (gc_compaction_ratio_percent as f64 / 100.0) { info!( "trigger auto-compaction because l1_size={} / l2_size={} > gc_compaction_ratio_percent={}", - l1_size, - l2_size, - gc_compaction_ratio_percent + l1_size, l2_size, gc_compaction_ratio_percent ); return true; } @@ -355,7 +345,9 @@ impl GcCompactionQueue { gc_block: &GcBlock, auto: bool, ) -> Result<(), CompactionError> { - info!("running scheduled enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs"); + info!( + "running scheduled enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs" + ); let jobs = timeline .gc_compaction_split_jobs( GcCompactJob::from_compact_options(options.clone()), @@ -419,7 +411,10 @@ impl GcCompactionQueue { guard.queued.push_front(item); } } - info!("scheduled enhanced gc bottom-most compaction with sub-compaction, split into {} jobs", jobs_len); + info!( + "scheduled enhanced gc bottom-most compaction with sub-compaction, split into {} jobs", + jobs_len + ); } Ok(()) } @@ -433,7 +428,9 @@ impl GcCompactionQueue { timeline: &Arc, ) -> Result { let Ok(_one_op_at_a_time_guard) = self.consumer_lock.try_lock() else { - return Err(CompactionError::AlreadyRunning("cannot run gc-compaction because another gc-compaction is running. This should not happen because we only call this function from the gc-compaction queue.")); + return Err(CompactionError::AlreadyRunning( + "cannot run gc-compaction because another gc-compaction is running. This should not happen because we only call this function from the gc-compaction queue.", + )); }; let has_pending_tasks; let Some((id, item)) = ({ @@ -459,9 +456,14 @@ impl GcCompactionQueue { .flags .contains(CompactFlags::EnhancedGcBottomMostCompaction) { - warn!("ignoring scheduled compaction task: scheduled task must be gc compaction: {:?}", options); + warn!( + "ignoring scheduled compaction task: scheduled task must be gc compaction: {:?}", + options + ); } else if options.sub_compaction { - info!("running scheduled enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs"); + info!( + "running scheduled enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs" + ); self.handle_sub_compaction(id, options, timeline, gc_block, auto) .await?; } else { @@ -964,7 +966,9 @@ impl Timeline { self.upload_new_image_layers(image_layers)?; if let LastImageLayerCreationStatus::Incomplete { .. } = outcome { // Yield and do not do any other kind of compaction. - info!("skipping shard ancestor compaction due to pending image layer generation tasks (preempted by L0 compaction)."); + info!( + "skipping shard ancestor compaction due to pending image layer generation tasks (preempted by L0 compaction)." + ); return Ok(CompactionOutcome::YieldForL0); } } @@ -990,7 +994,7 @@ impl Timeline { Err(err) => error!("could not compact, repartitioning keyspace failed: {err:?}"), }; - let partition_count = self.partitioning.read().0 .0.parts.len(); + let partition_count = self.partitioning.read().0.0.parts.len(); // 4. 
Shard ancestor compaction @@ -1199,7 +1203,7 @@ impl Timeline { Ok(()) => (), Err(WaitCompletionError::NotInitialized(ni)) => return Err(CompactionError::from(ni)), Err(WaitCompletionError::UploadQueueShutDownOrStopped) => { - return Err(CompactionError::ShuttingDown) + return Err(CompactionError::ShuttingDown); } } @@ -1494,7 +1498,7 @@ impl Timeline { let last_record_lsn = self.get_last_record_lsn(); let min_hole_range = (target_file_size / page_cache::PAGE_SZ as u64) as i128; let min_hole_coverage_size = 3; // TODO: something more flexible? - // min-heap (reserve space for one more element added before eviction) + // min-heap (reserve space for one more element added before eviction) let mut heap: BinaryHeap = BinaryHeap::with_capacity(max_holes + 1); let mut prev: Option = None; @@ -2357,8 +2361,14 @@ impl Timeline { let allocated_space = (available_space as f64 * 0.8) as u64; /* reserve 20% space for other tasks */ if all_layer_size /* space needed for newly-generated file */ + remote_layer_size /* space for downloading layers */ > allocated_space { - return Err(anyhow!("not enough space for compaction: available_space={}, allocated_space={}, all_layer_size={}, remote_layer_size={}, required_space={}", - available_space, allocated_space, all_layer_size, remote_layer_size, all_layer_size + remote_layer_size)); + return Err(anyhow!( + "not enough space for compaction: available_space={}, allocated_space={}, all_layer_size={}, remote_layer_size={}, required_space={}", + available_space, + allocated_space, + all_layer_size, + remote_layer_size, + all_layer_size + remote_layer_size + )); } Ok(()) } @@ -2397,7 +2407,9 @@ impl Timeline { }; if compact_below_lsn == Lsn::INVALID { - tracing::warn!("no layers to compact with gc: gc_cutoff not generated yet, skipping gc bottom-most compaction"); + tracing::warn!( + "no layers to compact with gc: gc_cutoff not generated yet, skipping gc bottom-most compaction" + ); return Ok(vec![]); } @@ -2542,7 +2554,9 @@ impl Timeline { let sub_compaction = options.sub_compaction; let job = GcCompactJob::from_compact_options(options.clone()); if sub_compaction { - info!("running enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs"); + info!( + "running enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs" + ); let jobs = self .gc_compaction_split_jobs(job, options.sub_compaction_max_job_size_mb) .await?; @@ -2594,7 +2608,13 @@ impl Timeline { let debug_mode = cfg!(debug_assertions) || cfg!(feature = "testing"); - info!("running enhanced gc bottom-most compaction, dry_run={dry_run}, compact_key_range={}..{}, compact_lsn_range={}..{}", compact_key_range.start, compact_key_range.end, compact_lsn_range.start, compact_lsn_range.end); + info!( + "running enhanced gc bottom-most compaction, dry_run={dry_run}, compact_key_range={}..{}, compact_lsn_range={}..{}", + compact_key_range.start, + compact_key_range.end, + compact_lsn_range.start, + compact_lsn_range.end + ); scopeguard::defer! { info!("done enhanced gc bottom-most compaction"); @@ -2623,7 +2643,9 @@ impl Timeline { let mut gc_cutoff = if compact_lsn_range.end == Lsn::MAX { if real_gc_cutoff == Lsn::INVALID { // If the gc_cutoff is not generated yet, we should not compact anything. 
- tracing::warn!("no layers to compact with gc: gc_cutoff not generated yet, skipping gc bottom-most compaction"); + tracing::warn!( + "no layers to compact with gc: gc_cutoff not generated yet, skipping gc bottom-most compaction" + ); return Ok(()); } real_gc_cutoff @@ -2631,7 +2653,10 @@ impl Timeline { compact_lsn_range.end }; if gc_cutoff > real_gc_cutoff { - warn!("provided compact_lsn_range.end={} is larger than the real_gc_cutoff={}, using the real gc cutoff", gc_cutoff, real_gc_cutoff); + warn!( + "provided compact_lsn_range.end={} is larger than the real_gc_cutoff={}, using the real gc cutoff", + gc_cutoff, real_gc_cutoff + ); gc_cutoff = real_gc_cutoff; } gc_cutoff @@ -2655,7 +2680,10 @@ impl Timeline { .map(|desc| desc.get_lsn_range().end) .max() else { - info!("no layers to compact with gc: no historic layers below gc_cutoff, gc_cutoff={}", gc_cutoff); + info!( + "no layers to compact with gc: no historic layers below gc_cutoff, gc_cutoff={}", + gc_cutoff + ); return Ok(()); }; // Next, if the user specifies compact_lsn_range.start, we need to filter some layers out. All the layers (strictly) below @@ -2673,7 +2701,10 @@ impl Timeline { .map(|desc| desc.get_lsn_range().start) .min() else { - info!("no layers to compact with gc: no historic layers above compact_above_lsn, compact_above_lsn={}", compact_lsn_range.end); + info!( + "no layers to compact with gc: no historic layers above compact_above_lsn, compact_above_lsn={}", + compact_lsn_range.end + ); return Ok(()); }; // Then, pick all the layers that are below the max_layer_lsn. This is to ensure we can pick all single-key @@ -2696,7 +2727,10 @@ impl Timeline { } } if selected_layers.is_empty() { - info!("no layers to compact with gc: no layers within the key range, gc_cutoff={}, key_range={}..{}", gc_cutoff, compact_key_range.start, compact_key_range.end); + info!( + "no layers to compact with gc: no layers within the key range, gc_cutoff={}, key_range={}..{}", + gc_cutoff, compact_key_range.start, compact_key_range.end + ); return Ok(()); } retain_lsns_below_horizon.sort(); @@ -2778,7 +2812,10 @@ impl Timeline { .map(|layer| layer.layer_desc().layer_name()) .collect_vec(); if let Some(err) = check_valid_layermap(&layer_names) { - bail!("gc-compaction layer map check failed because {}, cannot proceed with compaction due to potential data loss", err); + bail!( + "gc-compaction layer map check failed because {}, cannot proceed with compaction due to potential data loss", + err + ); } // The maximum LSN we are processing in this compaction loop let end_lsn = job_desc @@ -3185,7 +3222,10 @@ impl Timeline { // the writer, so potentially, we will need a function like `ImageLayerBatchWriter::get_all_pending_layer_keys` to get all the keys that are // in the writer before finalizing the persistent layers. Now we would leave some dangling layers on the disk if the check fails. 
if let Some(err) = check_valid_layermap(&final_layers) { - bail!("gc-compaction layer map check failed after compaction because {}, compaction result not applied to the layer map due to potential data loss", err); + bail!( + "gc-compaction layer map check failed after compaction because {}, compaction result not applied to the layer map due to potential data loss", + err + ); } // Between the sanity check and this compaction update, there could be new layers being flushed, but it should be fine because we only @@ -3250,7 +3290,8 @@ impl Timeline { if let Some(to) = compact_to_set.get(&layer.layer_desc().key()) { tracing::info!( "skipping delete {} because found same layer key at different generation {}", - layer, to + layer, + to ); } else { compact_from.push(layer.clone()); diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index f4ae1ea166..7cdc69e55f 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -1,26 +1,26 @@ -use std::{ - ops::{Deref, DerefMut}, - sync::Arc, -}; +use std::ops::{Deref, DerefMut}; +use std::sync::Arc; use anyhow::Context; -use pageserver_api::{models::TimelineState, shard::TenantShardId}; +use pageserver_api::models::TimelineState; +use pageserver_api::shard::TenantShardId; use remote_storage::DownloadError; use tokio::sync::OwnedMutexGuard; -use tracing::{error, info, info_span, instrument, Instrument}; -use utils::{crashsafe, fs_ext, id::TimelineId, pausable_failpoint}; +use tracing::{Instrument, error, info, info_span, instrument}; +use utils::id::TimelineId; +use utils::{crashsafe, fs_ext, pausable_failpoint}; -use crate::{ - config::PageServerConf, - task_mgr::{self, TaskKind}, - tenant::{ - metadata::TimelineMetadata, - remote_timeline_client::{PersistIndexPartWithDeletedFlagError, RemoteTimelineClient}, - CreateTimelineCause, DeleteTimelineError, MaybeDeletedIndexPart, Tenant, - TenantManifestError, Timeline, TimelineOrOffloaded, - }, - virtual_file::MaybeFatalIo, +use crate::config::PageServerConf; +use crate::task_mgr::{self, TaskKind}; +use crate::tenant::metadata::TimelineMetadata; +use crate::tenant::remote_timeline_client::{ + PersistIndexPartWithDeletedFlagError, RemoteTimelineClient, }; +use crate::tenant::{ + CreateTimelineCause, DeleteTimelineError, MaybeDeletedIndexPart, Tenant, TenantManifestError, + Timeline, TimelineOrOffloaded, +}; +use crate::virtual_file::MaybeFatalIo; /// Mark timeline as deleted in S3 so we won't pick it up next time /// during attach or pageserver restart. 
diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index e0084d3eef..c3a7433062 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -1,25 +1,27 @@ -use std::{collections::HashSet, sync::Arc}; +use std::collections::HashSet; +use std::sync::Arc; -use super::{layer_manager::LayerManager, FlushLayerError, Timeline}; -use crate::{ - context::{DownloadBehavior, RequestContext}, - task_mgr::TaskKind, - tenant::{ - remote_timeline_client::index::GcBlockingReason::DetachAncestor, - storage_layer::{ - layer::local_layer_path, AsLayerDesc as _, DeltaLayerWriter, Layer, ResidentLayer, - }, - Tenant, - }, - virtual_file::{MaybeFatalIo, VirtualFile}, -}; use anyhow::Context; use http_utils::error::ApiError; -use pageserver_api::{models::detach_ancestor::AncestorDetached, shard::ShardIdentity}; +use pageserver_api::models::detach_ancestor::AncestorDetached; +use pageserver_api::shard::ShardIdentity; use tokio::sync::Semaphore; use tokio_util::sync::CancellationToken; use tracing::Instrument; -use utils::{completion, generation::Generation, id::TimelineId, lsn::Lsn}; +use utils::completion; +use utils::generation::Generation; +use utils::id::TimelineId; +use utils::lsn::Lsn; + +use super::layer_manager::LayerManager; +use super::{FlushLayerError, Timeline}; +use crate::context::{DownloadBehavior, RequestContext}; +use crate::task_mgr::TaskKind; +use crate::tenant::Tenant; +use crate::tenant::remote_timeline_client::index::GcBlockingReason::DetachAncestor; +use crate::tenant::storage_layer::layer::local_layer_path; +use crate::tenant::storage_layer::{AsLayerDesc as _, DeltaLayerWriter, Layer, ResidentLayer}; +use crate::virtual_file::{MaybeFatalIo, VirtualFile}; #[derive(Debug, thiserror::Error)] pub(crate) enum Error { @@ -64,9 +66,10 @@ impl Error { where F: Fn(anyhow::Error) -> Error, { + use remote_storage::TimeoutOrCancel; + use crate::tenant::remote_timeline_client::WaitCompletionError; use crate::tenant::upload_queue::NotInitialized; - use remote_storage::TimeoutOrCancel; if e.is::() || TimeoutOrCancel::caused_by_cancel(&e) @@ -780,7 +783,7 @@ pub(super) async fn detach_and_reparent( // TODO: make sure there are no `?` before tenant_reset from after a questionmark from // here. panic!( - "bug: detach_and_reparent called on a timeline which has not been detached or which has no live ancestor" + "bug: detach_and_reparent called on a timeline which has not been detached or which has no live ancestor" ); } }; diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 77c33349e0..187d9f248e 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -13,34 +13,27 @@ //! Items with parentheses are not (yet) touched by this task. //! //! 
See write-up on restart on-demand download spike: -use std::{ - collections::HashMap, - ops::ControlFlow, - sync::Arc, - time::{Duration, SystemTime}, -}; +use std::collections::HashMap; +use std::ops::ControlFlow; +use std::sync::Arc; +use std::time::{Duration, SystemTime}; use pageserver_api::models::{EvictionPolicy, EvictionPolicyLayerAccessThreshold}; use tokio::time::Instant; use tokio_util::sync::CancellationToken; -use tracing::{debug, info, info_span, instrument, warn, Instrument}; - -use crate::{ - context::{DownloadBehavior, RequestContext}, - pgdatadir_mapping::CollectKeySpaceError, - task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, - tenant::{ - size::CalculateSyntheticSizeError, - storage_layer::LayerVisibilityHint, - tasks::{sleep_random, BackgroundLoopKind, BackgroundLoopSemaphorePermit}, - timeline::EvictionError, - LogicalSizeCalculationCause, Tenant, - }, -}; - -use utils::{completion, sync::gate::GateGuard}; +use tracing::{Instrument, debug, info, info_span, instrument, warn}; +use utils::completion; +use utils::sync::gate::GateGuard; use super::Timeline; +use crate::context::{DownloadBehavior, RequestContext}; +use crate::pgdatadir_mapping::CollectKeySpaceError; +use crate::task_mgr::{self, BACKGROUND_RUNTIME, TaskKind}; +use crate::tenant::size::CalculateSyntheticSizeError; +use crate::tenant::storage_layer::LayerVisibilityHint; +use crate::tenant::tasks::{BackgroundLoopKind, BackgroundLoopSemaphorePermit, sleep_random}; +use crate::tenant::timeline::EvictionError; +use crate::tenant::{LogicalSizeCalculationCause, Tenant}; #[derive(Default)] pub struct EvictionTaskTimelineState { diff --git a/pageserver/src/tenant/timeline/handle.rs b/pageserver/src/tenant/timeline/handle.rs index 5b39daaaf8..67fb89c433 100644 --- a/pageserver/src/tenant/timeline/handle.rs +++ b/pageserver/src/tenant/timeline/handle.rs @@ -202,18 +202,13 @@ //! to the parent shard during a shard split. Eventually, the shard split task will //! shut down the parent => case (1). -use std::collections::hash_map; -use std::collections::HashMap; -use std::sync::Arc; -use std::sync::Mutex; -use std::sync::Weak; +use std::collections::{HashMap, hash_map}; +use std::sync::{Arc, Mutex, Weak}; use pageserver_api::shard::ShardIdentity; -use tracing::instrument; -use tracing::trace; +use tracing::{instrument, trace}; use utils::id::TimelineId; -use utils::shard::ShardIndex; -use utils::shard::ShardNumber; +use utils::shard::{ShardIndex, ShardNumber}; use crate::tenant::mgr::ShardSelector; @@ -631,12 +626,10 @@ impl HandleInner { mod tests { use std::sync::Weak; - use pageserver_api::{ - key::{rel_block_to_key, Key, DBDIR_KEY}, - models::ShardParameters, - reltag::RelTag, - shard::ShardStripeSize, - }; + use pageserver_api::key::{DBDIR_KEY, Key, rel_block_to_key}; + use pageserver_api::models::ShardParameters; + use pageserver_api::reltag::RelTag; + use pageserver_api::shard::ShardStripeSize; use utils::shard::ShardCount; use super::*; diff --git a/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs b/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs index 0ba9753e85..27243ba378 100644 --- a/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs +++ b/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs @@ -3,9 +3,10 @@ //! Provides utilities to spawn and abort a background task where the downloads happen. //! See /v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_heatmap_layers. 
+use std::sync::{Arc, Mutex}; + use futures::StreamExt; use http_utils::error::ApiError; -use std::sync::{Arc, Mutex}; use tokio_util::sync::CancellationToken; use utils::sync::gate::Gate; diff --git a/pageserver/src/tenant/timeline/import_pgdata.rs b/pageserver/src/tenant/timeline/import_pgdata.rs index 6940179ae9..8b94a114d6 100644 --- a/pageserver/src/tenant/timeline/import_pgdata.rs +++ b/pageserver/src/tenant/timeline/import_pgdata.rs @@ -1,14 +1,14 @@ use std::sync::Arc; -use anyhow::{bail, Context}; +use anyhow::{Context, bail}; use remote_storage::RemotePath; use tokio_util::sync::CancellationToken; -use tracing::{info, info_span, Instrument}; +use tracing::{Instrument, info, info_span}; use utils::lsn::Lsn; -use crate::{context::RequestContext, tenant::metadata::TimelineMetadata}; - use super::Timeline; +use crate::context::RequestContext; +use crate::tenant::metadata::TimelineMetadata; mod flow; mod importbucket_client; diff --git a/pageserver/src/tenant/timeline/import_pgdata/flow.rs b/pageserver/src/tenant/timeline/import_pgdata/flow.rs index 4388072606..3ef82b3658 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/flow.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/flow.rs @@ -28,52 +28,38 @@ //! An incomplete set of TODOs from the Hackathon: //! - version-specific CheckPointData (=> pgv abstraction, already exists for regular walingest) +use std::collections::HashSet; +use std::ops::Range; use std::sync::Arc; use anyhow::{bail, ensure}; use bytes::Bytes; - use itertools::Itertools; -use pageserver_api::{ - key::{rel_block_to_key, rel_dir_to_key, rel_size_to_key, relmap_file_key, DBDIR_KEY}, - reltag::RelTag, - shard::ShardIdentity, -}; -use postgres_ffi::{pg_constants, relfile_utils::parse_relfilename, BLCKSZ}; -use tokio::task::JoinSet; -use tracing::{debug, info_span, instrument, Instrument}; - -use crate::{ - assert_u64_eq_usize::UsizeIsU64, - pgdatadir_mapping::{SlruSegmentDirectory, TwoPhaseDirectory}, -}; -use crate::{ - context::{DownloadBehavior, RequestContext}, - pgdatadir_mapping::{DbDirectory, RelDirectory}, - task_mgr::TaskKind, - tenant::storage_layer::{ImageLayerWriter, Layer}, -}; - -use pageserver_api::key::Key; use pageserver_api::key::{ - slru_block_to_key, slru_dir_to_key, slru_segment_size_to_key, CHECKPOINT_KEY, CONTROLFILE_KEY, - TWOPHASEDIR_KEY, + CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, Key, TWOPHASEDIR_KEY, rel_block_to_key, + rel_dir_to_key, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key, + slru_segment_size_to_key, }; -use pageserver_api::keyspace::singleton_range; -use pageserver_api::keyspace::{contiguous_range_len, is_contiguous_range}; -use pageserver_api::reltag::SlruKind; +use pageserver_api::keyspace::{contiguous_range_len, is_contiguous_range, singleton_range}; +use pageserver_api::reltag::{RelTag, SlruKind}; +use pageserver_api::shard::ShardIdentity; +use postgres_ffi::relfile_utils::parse_relfilename; +use postgres_ffi::{BLCKSZ, pg_constants}; +use remote_storage::RemotePath; +use tokio::task::JoinSet; +use tracing::{Instrument, debug, info_span, instrument}; use utils::bin_ser::BeSer; use utils::lsn::Lsn; -use std::collections::HashSet; -use std::ops::Range; - -use super::{ - importbucket_client::{ControlFile, RemoteStorageWrapper}, - Timeline, +use super::Timeline; +use super::importbucket_client::{ControlFile, RemoteStorageWrapper}; +use crate::assert_u64_eq_usize::UsizeIsU64; +use crate::context::{DownloadBehavior, RequestContext}; +use crate::pgdatadir_mapping::{ + DbDirectory, RelDirectory, 
SlruSegmentDirectory, TwoPhaseDirectory, }; - -use remote_storage::RemotePath; +use crate::task_mgr::TaskKind; +use crate::tenant::storage_layer::{ImageLayerWriter, Layer}; pub async fn run( timeline: Arc, diff --git a/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs b/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs index 68937e535d..a17a10d56b 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs @@ -1,4 +1,5 @@ -use std::{ops::Bound, sync::Arc}; +use std::ops::Bound; +use std::sync::Arc; use anyhow::Context; use bytes::Bytes; @@ -12,9 +13,9 @@ use tokio_util::sync::CancellationToken; use tracing::{debug, info, instrument}; use utils::lsn::Lsn; -use crate::{assert_u64_eq_usize::U64IsUsize, config::PageServerConf}; - use super::{importbucket_format, index_part_format}; +use crate::assert_u64_eq_usize::U64IsUsize; +use crate::config::PageServerConf; pub async fn new( conf: &'static PageServerConf, diff --git a/pageserver/src/tenant/timeline/import_pgdata/index_part_format.rs b/pageserver/src/tenant/timeline/import_pgdata/index_part_format.rs index 310d97a6a9..ea7a41b25f 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/index_part_format.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/index_part_format.rs @@ -1,7 +1,6 @@ -use serde::{Deserialize, Serialize}; - #[cfg(feature = "testing")] use camino::Utf8PathBuf; +use serde::{Deserialize, Serialize}; #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] pub enum Root { diff --git a/pageserver/src/tenant/timeline/import_pgdata/upcall_api.rs b/pageserver/src/tenant/timeline/import_pgdata/upcall_api.rs index c5210f9a30..7c7a4de2fc 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/upcall_api.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/upcall_api.rs @@ -1,13 +1,12 @@ //! FIXME: most of this is copy-paste from mgmt_api.rs ; dedupe into a `reqwest_utils::Client` crate. use pageserver_client::mgmt_api::{Error, ResponseErrorMessageExt}; +use reqwest::Method; use serde::{Deserialize, Serialize}; use tokio_util::sync::CancellationToken; use tracing::error; -use crate::config::PageServerConf; -use reqwest::Method; - use super::importbucket_format::Spec; +use crate::config::PageServerConf; pub struct Client { base_url: String, diff --git a/pageserver/src/tenant/timeline/init.rs b/pageserver/src/tenant/timeline/init.rs index 6634d07a0d..e952df0845 100644 --- a/pageserver/src/tenant/timeline/init.rs +++ b/pageserver/src/tenant/timeline/init.rs @@ -1,22 +1,16 @@ -use crate::{ - is_temporary, - tenant::{ - ephemeral_file::is_ephemeral_file, - remote_timeline_client::{ - self, - index::{IndexPart, LayerFileMetadata}, - }, - storage_layer::LayerName, - }, -}; +use std::collections::{HashMap, hash_map}; +use std::str::FromStr; + use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; -use std::{ - collections::{hash_map, HashMap}, - str::FromStr, -}; use utils::lsn::Lsn; +use crate::is_temporary; +use crate::tenant::ephemeral_file::is_ephemeral_file; +use crate::tenant::remote_timeline_client::index::{IndexPart, LayerFileMetadata}; +use crate::tenant::remote_timeline_client::{self}; +use crate::tenant::storage_layer::LayerName; + /// Identified files in the timeline directory. 
pub(super) enum Discovered { /// The only one we care about diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index 60e36a5d4d..e552ea83de 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -1,27 +1,22 @@ -use anyhow::{bail, ensure, Context}; +use std::collections::HashMap; +use std::sync::Arc; + +use anyhow::{Context, bail, ensure}; use itertools::Itertools; use pageserver_api::shard::TenantShardId; -use std::{collections::HashMap, sync::Arc}; use tracing::trace; -use utils::{ - id::TimelineId, - lsn::{AtomicLsn, Lsn}, -}; - -use crate::{ - config::PageServerConf, - context::RequestContext, - metrics::TimelineMetrics, - tenant::{ - layer_map::{BatchedUpdates, LayerMap}, - storage_layer::{ - AsLayerDesc, InMemoryLayer, Layer, LayerVisibilityHint, PersistentLayerDesc, - PersistentLayerKey, ResidentLayer, - }, - }, -}; +use utils::id::TimelineId; +use utils::lsn::{AtomicLsn, Lsn}; use super::TimelineWriterState; +use crate::config::PageServerConf; +use crate::context::RequestContext; +use crate::metrics::TimelineMetrics; +use crate::tenant::layer_map::{BatchedUpdates, LayerMap}; +use crate::tenant::storage_layer::{ + AsLayerDesc, InMemoryLayer, Layer, LayerVisibilityHint, PersistentLayerDesc, + PersistentLayerKey, ResidentLayer, +}; /// Provides semantic APIs to manipulate the layer map. pub(crate) enum LayerManager { @@ -214,9 +209,7 @@ impl OpenLayerManager { trace!( "creating in-memory layer at {}/{} for record at {}", - timeline_id, - start_lsn, - lsn + timeline_id, start_lsn, lsn ); let new_layer = diff --git a/pageserver/src/tenant/timeline/logical_size.rs b/pageserver/src/tenant/timeline/logical_size.rs index f4a4eea54a..397037ca9f 100644 --- a/pageserver/src/tenant/timeline/logical_size.rs +++ b/pageserver/src/tenant/timeline/logical_size.rs @@ -1,11 +1,10 @@ -use anyhow::Context; +use std::sync::atomic::{AtomicBool, AtomicI64, Ordering as AtomicOrdering}; +use anyhow::Context; use once_cell::sync::OnceCell; use tokio_util::sync::CancellationToken; use utils::lsn::Lsn; -use std::sync::atomic::{AtomicBool, AtomicI64, Ordering as AtomicOrdering}; - /// Internal structure to hold all data needed for logical size calculation. 
/// /// Calculation consists of two stages: diff --git a/pageserver/src/tenant/timeline/offload.rs b/pageserver/src/tenant/timeline/offload.rs index 424a75005d..43ffaa6aab 100644 --- a/pageserver/src/tenant/timeline/offload.rs +++ b/pageserver/src/tenant/timeline/offload.rs @@ -2,11 +2,11 @@ use std::sync::Arc; use pageserver_api::models::{TenantState, TimelineState}; -use super::delete::{delete_local_timeline_directory, DeletionGuard}; use super::Timeline; +use super::delete::{DeletionGuard, delete_local_timeline_directory}; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::remote_timeline_client::ShutdownIfArchivedError; -use crate::tenant::timeline::delete::{make_timeline_delete_guard, TimelineDeleteGuardKind}; +use crate::tenant::timeline::delete::{TimelineDeleteGuardKind, make_timeline_delete_guard}; use crate::tenant::{ DeleteTimelineError, OffloadedTimeline, Tenant, TenantManifestError, TimelineOrOffloaded, }; diff --git a/pageserver/src/tenant/timeline/uninit.rs b/pageserver/src/tenant/timeline/uninit.rs index 3074463384..f66c0ffa0f 100644 --- a/pageserver/src/tenant/timeline/uninit.rs +++ b/pageserver/src/tenant/timeline/uninit.rs @@ -1,18 +1,21 @@ -use std::{collections::hash_map::Entry, fs, future::Future, sync::Arc}; +use std::collections::hash_map::Entry; +use std::fs; +use std::future::Future; +use std::sync::Arc; use anyhow::Context; use camino::Utf8PathBuf; use tracing::{error, info, info_span}; -use utils::{fs_ext, id::TimelineId, lsn::Lsn, sync::gate::GateGuard}; - -use crate::{ - context::RequestContext, - import_datadir, - span::debug_assert_current_span_has_tenant_and_timeline_id, - tenant::{CreateTimelineError, CreateTimelineIdempotency, Tenant, TimelineOrOffloaded}, -}; +use utils::fs_ext; +use utils::id::TimelineId; +use utils::lsn::Lsn; +use utils::sync::gate::GateGuard; use super::Timeline; +use crate::context::RequestContext; +use crate::import_datadir; +use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; +use crate::tenant::{CreateTimelineError, CreateTimelineIdempotency, Tenant, TimelineOrOffloaded}; /// A timeline with some of its files on disk, being initialized. /// This struct ensures the atomicity of the timeline init: it's either properly created and inserted into pageserver's memory, or @@ -128,7 +131,7 @@ impl<'t> UninitializedTimeline<'t> { // We do not call Self::abort here. Because we don't cleanly shut down our Timeline, [`Self::drop`] should // skip trying to delete the timeline directory too. 
anyhow::bail!( - "Found freshly initialized timeline {tenant_shard_id}/{timeline_id} in the tenant map" + "Found freshly initialized timeline {tenant_shard_id}/{timeline_id} in the tenant map" ) } Entry::Vacant(v) => { diff --git a/pageserver/src/tenant/timeline/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs index 67429bff98..4f80073cc3 100644 --- a/pageserver/src/tenant/timeline/walreceiver.rs +++ b/pageserver/src/tenant/timeline/walreceiver.rs @@ -23,17 +23,11 @@ mod connection_manager; mod walreceiver_connection; -use crate::context::{DownloadBehavior, RequestContext}; -use crate::task_mgr::{TaskKind, WALRECEIVER_RUNTIME}; -use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id; -use crate::tenant::timeline::walreceiver::connection_manager::{ - connection_manager_loop_step, ConnectionManagerState, -}; - use std::future::Future; use std::num::NonZeroU64; use std::sync::Arc; use std::time::Duration; + use storage_broker::BrokerClientChannel; use tokio::sync::watch; use tokio_util::sync::CancellationToken; @@ -41,8 +35,13 @@ use tracing::*; use utils::postgres_client::PostgresClientProtocol; use self::connection_manager::ConnectionManagerStatus; - use super::Timeline; +use crate::context::{DownloadBehavior, RequestContext}; +use crate::task_mgr::{TaskKind, WALRECEIVER_RUNTIME}; +use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id; +use crate::tenant::timeline::walreceiver::connection_manager::{ + ConnectionManagerState, connection_manager_loop_step, +}; #[derive(Clone)] pub struct WalReceiverConf { diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index 1955345315..df2663f6bb 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -9,45 +9,42 @@ //! then a (re)connection happens, if necessary. //! Only WAL streaming task expects to be finished, other loops (storage broker, connection management) never exit unless cancelled explicitly via the dedicated channel. 
-use std::{collections::HashMap, num::NonZeroU64, ops::ControlFlow, sync::Arc, time::Duration}; +use std::collections::HashMap; +use std::num::NonZeroU64; +use std::ops::ControlFlow; +use std::sync::Arc; +use std::time::Duration; -use super::{TaskStateUpdate, WalReceiverConf}; +use anyhow::Context; +use chrono::{NaiveDateTime, Utc}; +use pageserver_api::models::TimelineState; +use postgres_connection::PgConnectionConfig; +use storage_broker::proto::{ + FilterTenantTimelineId, MessageType, SafekeeperDiscoveryRequest, SafekeeperDiscoveryResponse, + SubscribeByFilterRequest, TenantTimelineId as ProtoTenantTimelineId, TypeSubscription, + TypedMessage, +}; +use storage_broker::{BrokerClientChannel, Code, Streaming}; +use tokio_util::sync::CancellationToken; +use tracing::*; +use utils::backoff::{ + DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, exponential_backoff, +}; +use utils::id::{NodeId, TenantTimelineId}; +use utils::lsn::Lsn; +use utils::postgres_client::{ + ConnectionConfigArgs, PostgresClientProtocol, wal_stream_connection_config, +}; + +use super::walreceiver_connection::{WalConnectionStatus, WalReceiverError}; +use super::{TaskEvent, TaskHandle, TaskStateUpdate, WalReceiverConf}; use crate::context::{DownloadBehavior, RequestContext}; use crate::metrics::{ WALRECEIVER_ACTIVE_MANAGERS, WALRECEIVER_BROKER_UPDATES, WALRECEIVER_CANDIDATES_ADDED, WALRECEIVER_CANDIDATES_REMOVED, WALRECEIVER_SWITCHES, }; use crate::task_mgr::TaskKind; -use crate::tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline}; -use anyhow::Context; -use chrono::{NaiveDateTime, Utc}; -use pageserver_api::models::TimelineState; - -use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; -use storage_broker::proto::{ - FilterTenantTimelineId, MessageType, SafekeeperDiscoveryRequest, SafekeeperDiscoveryResponse, - SubscribeByFilterRequest, TypeSubscription, TypedMessage, -}; -use storage_broker::{BrokerClientChannel, Code, Streaming}; -use tokio_util::sync::CancellationToken; -use tracing::*; - -use postgres_connection::PgConnectionConfig; -use utils::backoff::{ - exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, -}; -use utils::postgres_client::{ - wal_stream_connection_config, ConnectionConfigArgs, PostgresClientProtocol, -}; -use utils::{ - id::{NodeId, TenantTimelineId}, - lsn::Lsn, -}; - -use super::{ - walreceiver_connection::WalConnectionStatus, walreceiver_connection::WalReceiverError, - TaskEvent, TaskHandle, -}; +use crate::tenant::{Timeline, debug_assert_current_span_has_tenant_and_timeline_id}; pub(crate) struct Cancelled; @@ -349,7 +346,9 @@ async fn subscribe_for_timeline_updates( Err(e) => { // Safekeeper nodes can stop pushing timeline updates to the broker, when no new writes happen and // entire WAL is streamed. Keep this noticeable with logging, but do not warn/error. 
- info!("Attempt #{attempt}, failed to subscribe for timeline {id} updates in broker: {e:#}"); + info!( + "Attempt #{attempt}, failed to subscribe for timeline {id} updates in broker: {e:#}" + ); continue; } } @@ -512,11 +511,11 @@ impl ConnectionManagerState { fn spawn( &self, task: impl FnOnce( - tokio::sync::watch::Sender>, - CancellationToken, - ) -> Fut - + Send - + 'static, + tokio::sync::watch::Sender>, + CancellationToken, + ) -> Fut + + Send + + 'static, ) -> TaskHandle where Fut: std::future::Future> + Send, @@ -880,8 +879,7 @@ impl ConnectionManagerState { discovered_new_wal = if candidate_commit_lsn > current_commit_lsn { trace!( "New candidate has commit_lsn {}, higher than current_commit_lsn {}", - candidate_commit_lsn, - current_commit_lsn + candidate_commit_lsn, current_commit_lsn ); Some(NewCommittedWAL { lsn: candidate_commit_lsn, @@ -1048,7 +1046,9 @@ impl ConnectionManagerState { if !node_ids_to_remove.is_empty() { for node_id in node_ids_to_remove { - info!("Safekeeper node {node_id} did not send events for over {lagging_wal_timeout:?}, not retrying the connections"); + info!( + "Safekeeper node {node_id} did not send events for over {lagging_wal_timeout:?}, not retrying the connections" + ); self.wal_connection_retries.remove(&node_id); WALRECEIVER_CANDIDATES_REMOVED.inc(); } @@ -1119,11 +1119,12 @@ impl ReconnectReason { #[cfg(test)] mod tests { - use super::*; - use crate::tenant::harness::{TenantHarness, TIMELINE_ID}; use pageserver_api::config::defaults::DEFAULT_WAL_RECEIVER_PROTOCOL; use url::Host; + use super::*; + use crate::tenant::harness::{TIMELINE_ID, TenantHarness}; + fn dummy_broker_sk_timeline( commit_lsn: u64, safekeeper_connstr: &str, diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index bb34a181da..f41a9cfe82 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -1,46 +1,48 @@ //! Actual Postgres connection handler to stream WAL to the server. 
-use std::{ - error::Error, - pin::pin, - str::FromStr, - sync::Arc, - time::{Duration, SystemTime}, -}; +use std::error::Error; +use std::pin::pin; +use std::str::FromStr; +use std::sync::Arc; +use std::time::{Duration, SystemTime}; -use anyhow::{anyhow, Context}; +use anyhow::{Context, anyhow}; use bytes::BytesMut; use chrono::{NaiveDateTime, Utc}; use fail::fail_point; use futures::StreamExt; -use postgres_ffi::WAL_SEGMENT_SIZE; -use postgres_ffi::{v14::xlog_utils::normalize_lsn, waldecoder::WalDecodeError}; -use postgres_protocol::message::backend::ReplicationMessage; -use postgres_types::PgLsn; -use tokio::{select, sync::watch, time}; -use tokio_postgres::{error::SqlState, SimpleQueryMessage, SimpleQueryRow}; -use tokio_postgres::{replication::ReplicationStream, Client}; -use tokio_util::sync::CancellationToken; -use tracing::{debug, error, info, trace, warn, Instrument}; -use wal_decoder::{ - models::{FlushUncommittedRecords, InterpretedWalRecord, InterpretedWalRecords}, - wire_format::FromWireFormat, -}; - -use super::TaskStateUpdate; -use crate::{ - context::RequestContext, - metrics::{LIVE_CONNECTIONS, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST}, - pgdatadir_mapping::DatadirModification, - task_mgr::{TaskKind, WALRECEIVER_RUNTIME}, - tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo}, - walingest::WalIngest, -}; use postgres_backend::is_expected_io_error; use postgres_connection::PgConnectionConfig; -use postgres_ffi::waldecoder::WalStreamDecoder; -use utils::{critical, id::NodeId, lsn::Lsn, postgres_client::PostgresClientProtocol}; -use utils::{pageserver_feedback::PageserverFeedback, sync::gate::GateError}; +use postgres_ffi::WAL_SEGMENT_SIZE; +use postgres_ffi::v14::xlog_utils::normalize_lsn; +use postgres_ffi::waldecoder::{WalDecodeError, WalStreamDecoder}; +use postgres_protocol::message::backend::ReplicationMessage; +use postgres_types::PgLsn; +use tokio::sync::watch; +use tokio::{select, time}; +use tokio_postgres::error::SqlState; +use tokio_postgres::replication::ReplicationStream; +use tokio_postgres::{Client, SimpleQueryMessage, SimpleQueryRow}; +use tokio_util::sync::CancellationToken; +use tracing::{Instrument, debug, error, info, trace, warn}; +use utils::critical; +use utils::id::NodeId; +use utils::lsn::Lsn; +use utils::pageserver_feedback::PageserverFeedback; +use utils::postgres_client::PostgresClientProtocol; +use utils::sync::gate::GateError; +use wal_decoder::models::{FlushUncommittedRecords, InterpretedWalRecord, InterpretedWalRecords}; +use wal_decoder::wire_format::FromWireFormat; + +use super::TaskStateUpdate; +use crate::context::RequestContext; +use crate::metrics::{LIVE_CONNECTIONS, WAL_INGEST, WALRECEIVER_STARTED_CONNECTIONS}; +use crate::pgdatadir_mapping::DatadirModification; +use crate::task_mgr::{TaskKind, WALRECEIVER_RUNTIME}; +use crate::tenant::{ + Timeline, WalReceiverInfo, debug_assert_current_span_has_tenant_and_timeline_id, +}; +use crate::walingest::WalIngest; /// Status of the connection. #[derive(Debug, Clone, Copy)] @@ -149,7 +151,9 @@ pub(super) async fn handle_walreceiver_connection( // Timing out to connect to a safekeeper node could happen long time, due to // many reasons that pageserver cannot control. // Do not produce an error, but make it visible, that timeouts happen by logging the `event. 
- info!("Timed out while waiting {connect_timeout:?} for walreceiver connection to open"); + info!( + "Timed out while waiting {connect_timeout:?} for walreceiver connection to open" + ); return Ok(()); } } @@ -166,7 +170,9 @@ pub(super) async fn handle_walreceiver_connection( node: safekeeper_node, }; if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status)) { - warn!("Wal connection event listener dropped right after connection init, aborting the connection: {e}"); + warn!( + "Wal connection event listener dropped right after connection init, aborting the connection: {e}" + ); return Ok(()); } @@ -227,7 +233,9 @@ pub(super) async fn handle_walreceiver_connection( connection_status.latest_wal_update = Utc::now().naive_utc(); connection_status.commit_lsn = Some(end_of_wal); if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status)) { - warn!("Wal connection event listener dropped after IDENTIFY_SYSTEM, aborting the connection: {e}"); + warn!( + "Wal connection event listener dropped after IDENTIFY_SYSTEM, aborting the connection: {e}" + ); return Ok(()); } @@ -254,7 +262,9 @@ pub(super) async fn handle_walreceiver_connection( // to the safekeepers. startpoint = normalize_lsn(startpoint, WAL_SEGMENT_SIZE); - info!("last_record_lsn {last_rec_lsn} starting replication from {startpoint}, safekeeper is at {end_of_wal}..."); + info!( + "last_record_lsn {last_rec_lsn} starting replication from {startpoint}, safekeeper is at {end_of_wal}..." + ); let query = format!("START_REPLICATION PHYSICAL {startpoint}"); @@ -626,7 +636,9 @@ pub(super) async fn handle_walreceiver_connection( let timestamp = keepalive.timestamp(); let reply_requested = keepalive.reply() != 0; - trace!("received PrimaryKeepAlive(wal_end: {wal_end}, timestamp: {timestamp:?} reply: {reply_requested})"); + trace!( + "received PrimaryKeepAlive(wal_end: {wal_end}, timestamp: {timestamp:?} reply: {reply_requested})" + ); if reply_requested { Some(last_rec_lsn) diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs index d302205ffe..d5dc9666ce 100644 --- a/pageserver/src/tenant/upload_queue.rs +++ b/pageserver/src/tenant/upload_queue.rs @@ -1,21 +1,18 @@ use std::collections::{HashMap, HashSet, VecDeque}; use std::fmt::Debug; -use std::sync::atomic::AtomicU32; use std::sync::Arc; - -use super::remote_timeline_client::is_same_remote_layer_path; -use super::storage_layer::AsLayerDesc as _; -use super::storage_layer::LayerName; -use super::storage_layer::ResidentLayer; -use crate::tenant::metadata::TimelineMetadata; -use crate::tenant::remote_timeline_client::index::IndexPart; -use crate::tenant::remote_timeline_client::index::LayerFileMetadata; -use utils::generation::Generation; -use utils::lsn::{AtomicLsn, Lsn}; +use std::sync::atomic::AtomicU32; use chrono::NaiveDateTime; use once_cell::sync::Lazy; use tracing::info; +use utils::generation::Generation; +use utils::lsn::{AtomicLsn, Lsn}; + +use super::remote_timeline_client::is_same_remote_layer_path; +use super::storage_layer::{AsLayerDesc as _, LayerName, ResidentLayer}; +use crate::tenant::metadata::TimelineMetadata; +use crate::tenant::remote_timeline_client::index::{IndexPart, LayerFileMetadata}; /// Kill switch for upload queue reordering in case it causes problems. /// TODO: remove this once we have confidence in it. @@ -225,7 +222,7 @@ impl UploadQueueInitialized { // most one of them can be an index upload (enforced by can_bypass). 
.scan(&self.clean.0, |next_active_index, op| { let active_index = *next_active_index; - if let UploadOp::UploadMetadata { ref uploaded } = op { + if let UploadOp::UploadMetadata { uploaded } = op { *next_active_index = uploaded; // stash index for next operation after this } Some((op, active_index)) @@ -562,16 +559,18 @@ impl UploadOp { #[cfg(test)] mod tests { - use super::*; - use crate::tenant::harness::{TenantHarness, TIMELINE_ID}; - use crate::tenant::storage_layer::layer::local_layer_path; - use crate::tenant::storage_layer::Layer; - use crate::tenant::Timeline; - use crate::DEFAULT_PG_VERSION; - use itertools::Itertools as _; use std::str::FromStr as _; + + use itertools::Itertools as _; use utils::shard::{ShardCount, ShardIndex, ShardNumber}; + use super::*; + use crate::DEFAULT_PG_VERSION; + use crate::tenant::Timeline; + use crate::tenant::harness::{TIMELINE_ID, TenantHarness}; + use crate::tenant::storage_layer::Layer; + use crate::tenant::storage_layer::layer::local_layer_path; + /// Test helper which asserts that two operations are the same, in lieu of UploadOp PartialEq. #[track_caller] fn assert_same_op(a: &UploadOp, b: &UploadOp) { @@ -690,10 +689,22 @@ mod tests { let tli = make_timeline(); let index = Box::new(queue.clean.0.clone()); // empty, doesn't matter - let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer3 = make_layer(&tli, "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer0 = make_layer( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer1 = make_layer( + &tli, + "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer2 = make_layer( + &tli, + "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer3 = make_layer( + &tli, + "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); let (barrier, _) = tokio::sync::watch::channel(()); // Enqueue non-conflicting upload, delete, and index before and after a barrier. @@ -757,10 +768,22 @@ mod tests { let tli = make_timeline(); // Enqueue a bunch of deletes, some with conflicting names. 
- let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer3 = make_layer(&tli, "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer0 = make_layer( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer1 = make_layer( + &tli, + "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer2 = make_layer( + &tli, + "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer3 = make_layer( + &tli, + "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); let ops = [ UploadOp::Delete(Delete { @@ -802,9 +825,21 @@ mod tests { let tli = make_timeline(); // Enqueue three versions of the same layer, with different file sizes. - let layer0a = make_layer_with_size(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", 1); - let layer0b = make_layer_with_size(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", 2); - let layer0c = make_layer_with_size(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", 3); + let layer0a = make_layer_with_size( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + 1, + ); + let layer0b = make_layer_with_size( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + 2, + ); + let layer0c = make_layer_with_size( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + 3, + ); let ops = [ UploadOp::UploadLayer(layer0a.clone(), layer0a.metadata(), None), @@ -836,8 +871,14 @@ mod tests { // Enqueue two layer uploads, with a delete of both layers in between them. These should be // scheduled one at a time, since deletes can't bypass uploads and vice versa. - let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer0 = make_layer( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer1 = make_layer( + &tli, + "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); let ops = [ UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None), @@ -878,10 +919,22 @@ mod tests { // // Also enqueue non-conflicting uploads and deletes at the end. These can bypass the queue // and run immediately. 
- let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer3 = make_layer(&tli, "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer0 = make_layer( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer1 = make_layer( + &tli, + "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer2 = make_layer( + &tli, + "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer3 = make_layer( + &tli, + "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); let ops = [ UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None), @@ -916,9 +969,18 @@ mod tests { let tli = make_timeline(); // Enqueue three different layer uploads. - let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer0 = make_layer( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer1 = make_layer( + &tli, + "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer2 = make_layer( + &tli, + "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); let ops = [ UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None), @@ -981,11 +1043,20 @@ mod tests { // Enqueue three uploads of the current empty index. 
let index = Box::new(queue.clean.0.clone()); - let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer0 = make_layer( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); let index0 = index_with(&index, &layer0); - let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer1 = make_layer( + &tli, + "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); let index1 = index_with(&index0, &layer1); - let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer2 = make_layer( + &tli, + "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); let index2 = index_with(&index1, &layer2); let ops = [ @@ -1045,7 +1116,10 @@ mod tests { let tli = make_timeline(); // Create a layer to upload. - let layer = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer = make_layer( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); let index_upload = index_with(&queue.clean.0, &layer); // Remove the layer reference in a new index, then delete the layer. @@ -1090,7 +1164,10 @@ mod tests { let tli = make_timeline(); // Create a layer to upload. - let layer = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer = make_layer( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); // Upload the layer. Then dereference the layer, and upload/reference it again. let index_upload = index_with(&queue.clean.0, &layer); @@ -1138,10 +1215,22 @@ mod tests { let tli = make_timeline(); let index = Box::new(queue.clean.0.clone()); // empty, doesn't matter - let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer3 = make_layer(&tli, "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer0 = make_layer( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer1 = make_layer( + &tli, + "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer2 = make_layer( + &tli, + "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer3 = make_layer( + &tli, + "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); // Enqueue non-conflicting upload, delete, and index before and after a shutdown. 
let ops = [ @@ -1197,10 +1286,22 @@ mod tests { let tli = make_timeline(); // Enqueue a bunch of uploads. - let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); - let layer3 = make_layer(&tli, "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer0 = make_layer( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer1 = make_layer( + &tli, + "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer2 = make_layer( + &tli, + "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer3 = make_layer( + &tli, + "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); let ops = [ UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None), diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs index 47fb4a276b..dcf17a376c 100644 --- a/pageserver/src/tenant/vectored_blob_io.rs +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -27,8 +27,7 @@ use utils::vec_map::VecMap; use crate::context::RequestContext; use crate::tenant::blob_io::{BYTE_UNCOMPRESSED, BYTE_ZSTD, LEN_COMPRESSION_BIT_MASK}; -use crate::virtual_file::IoBufferMut; -use crate::virtual_file::{self, VirtualFile}; +use crate::virtual_file::{self, IoBufferMut, VirtualFile}; /// Metadata bundled with the start and end offset of a blob. #[derive(Copy, Clone, Debug)] @@ -139,7 +138,10 @@ impl VectoredBlob { bits => { let error = std::io::Error::new( std::io::ErrorKind::InvalidData, - format!("Failed to decompress blob for {}@{}, {}..{}: invalid compression byte {bits:x}", self.meta.key, self.meta.lsn, self.start, self.end), + format!( + "Failed to decompress blob for {}@{}, {}..{}: invalid compression byte {bits:x}", + self.meta.key, self.meta.lsn, self.start, self.end + ), ); Err(error) } @@ -677,13 +679,12 @@ impl StreamingVectoredReadPlanner { mod tests { use anyhow::Error; + use super::super::blob_io::tests::{random_array, write_maybe_compressed}; + use super::*; use crate::context::DownloadBehavior; use crate::page_cache::PAGE_SZ; use crate::task_mgr::TaskKind; - use super::super::blob_io::tests::{random_array, write_maybe_compressed}; - use super::*; - fn validate_read(read: &VectoredRead, offset_range: &[(Key, Lsn, u64, BlobFlag)]) { const ALIGN: u64 = virtual_file::get_io_buffer_alignment() as u64; assert_eq!(read.start % ALIGN, 0); diff --git a/pageserver/src/utilization.rs b/pageserver/src/utilization.rs index 093a944777..29d1a31aaf 100644 --- a/pageserver/src/utilization.rs +++ b/pageserver/src/utilization.rs @@ -3,13 +3,15 @@ //! The metric is exposed via `GET /v1/utilization`. Refer and maintain it's openapi spec as the //! truth. 
-use anyhow::Context; use std::path::Path; + +use anyhow::Context; +use pageserver_api::models::PageserverUtilization; use utils::serde_percent::Percent; -use pageserver_api::models::PageserverUtilization; - -use crate::{config::PageServerConf, metrics::NODE_UTILIZATION_SCORE, tenant::mgr::TenantManager}; +use crate::config::PageServerConf; +use crate::metrics::NODE_UTILIZATION_SCORE; +use crate::tenant::mgr::TenantManager; pub(crate) fn regenerate( conf: &PageServerConf, diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index c966ad813f..b47aecf8a6 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -11,11 +11,13 @@ //! This is similar to PostgreSQL's virtual file descriptor facility in //! src/backend/storage/file/fd.c //! -use crate::context::RequestContext; -use crate::metrics::{StorageIoOperation, STORAGE_IO_SIZE, STORAGE_IO_TIME_METRIC}; +use std::fs::File; +use std::io::{Error, ErrorKind, Seek, SeekFrom}; +use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd}; +#[cfg(target_os = "linux")] +use std::os::unix::fs::OpenOptionsExt; +use std::sync::atomic::{AtomicBool, AtomicU8, AtomicUsize, Ordering}; -use crate::page_cache::{PageWriteGuard, PAGE_SZ}; -use crate::tenant::TENANTS_SEGMENT_NAME; use camino::{Utf8Path, Utf8PathBuf}; use once_cell::sync::OnceCell; use owned_buffers_io::aligned_buffer::buffer::AlignedBuffer; @@ -23,31 +25,30 @@ use owned_buffers_io::aligned_buffer::{AlignedBufferMut, AlignedSlice, ConstAlig use owned_buffers_io::io_buf_aligned::{IoBufAligned, IoBufAlignedMut}; use owned_buffers_io::io_buf_ext::FullSlice; use pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT; +pub use pageserver_api::models::virtual_file as api; use pageserver_api::shard::TenantShardId; -use std::fs::File; -use std::io::{Error, ErrorKind, Seek, SeekFrom}; -#[cfg(target_os = "linux")] -use std::os::unix::fs::OpenOptionsExt; -use tokio_epoll_uring::{BoundedBuf, IoBuf, IoBufMut, Slice}; - -use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd}; -use std::sync::atomic::{AtomicBool, AtomicU8, AtomicUsize, Ordering}; use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}; use tokio::time::Instant; +use tokio_epoll_uring::{BoundedBuf, IoBuf, IoBufMut, Slice}; -pub use pageserver_api::models::virtual_file as api; +use crate::context::RequestContext; +use crate::metrics::{STORAGE_IO_SIZE, STORAGE_IO_TIME_METRIC, StorageIoOperation}; +use crate::page_cache::{PAGE_SZ, PageWriteGuard}; +use crate::tenant::TENANTS_SEGMENT_NAME; pub(crate) mod io_engine; -pub use io_engine::feature_test as io_engine_feature_test; -pub use io_engine::io_engine_for_bench; -pub use io_engine::FeatureTestResult as IoEngineFeatureTestResult; +pub use io_engine::{ + FeatureTestResult as IoEngineFeatureTestResult, feature_test as io_engine_feature_test, + io_engine_for_bench, +}; mod metadata; mod open_options; -use self::owned_buffers_io::write::OwnedAsyncWriter; pub(crate) use api::IoMode; pub(crate) use io_engine::IoEngineKind; pub(crate) use metadata::Metadata; pub(crate) use open_options::*; +use self::owned_buffers_io::write::OwnedAsyncWriter; + pub(crate) mod owned_buffers_io { //! Abstractions for IO with owned buffers. //! 
@@ -1078,7 +1079,8 @@ where #[cfg(test)] mod test_read_exact_at_impl { - use std::{collections::VecDeque, sync::Arc}; + use std::collections::VecDeque; + use std::sync::Arc; use tokio_epoll_uring::{BoundedBuf, BoundedBufMut}; @@ -1424,19 +1426,19 @@ static SYNC_MODE: AtomicU8 = AtomicU8::new(SyncMode::Sync as u8); #[cfg(test)] mod tests { - use crate::context::DownloadBehavior; - use crate::task_mgr::TaskKind; - - use super::*; - use owned_buffers_io::io_buf_ext::IoBufExt; - use owned_buffers_io::slice::SliceMutExt; - use rand::seq::SliceRandom; - use rand::thread_rng; - use rand::Rng; use std::io::Write; use std::os::unix::fs::FileExt; use std::sync::Arc; + use owned_buffers_io::io_buf_ext::IoBufExt; + use owned_buffers_io::slice::SliceMutExt; + use rand::seq::SliceRandom; + use rand::{Rng, thread_rng}; + + use super::*; + use crate::context::DownloadBehavior; + use crate::task_mgr::TaskKind; + enum MaybeVirtualFile { VirtualFile(VirtualFile), File(File), diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs index ccde90ee1a..758dd6e377 100644 --- a/pageserver/src/virtual_file/io_engine.rs +++ b/pageserver/src/virtual_file/io_engine.rs @@ -80,7 +80,9 @@ pub(crate) fn get() -> IoEngine { Ok(v) => match v.parse::() { Ok(engine_kind) => engine_kind, Err(e) => { - panic!("invalid VirtualFile io engine for env var {env_var_name}: {e:#}: {v:?}") + panic!( + "invalid VirtualFile io engine for env var {env_var_name}: {e:#}: {v:?}" + ) } }, Err(std::env::VarError::NotPresent) => { @@ -107,15 +109,12 @@ pub(crate) fn get() -> IoEngine { } } -use std::{ - os::unix::prelude::FileExt, - sync::atomic::{AtomicU8, Ordering}, -}; +use std::os::unix::prelude::FileExt; +use std::sync::atomic::{AtomicU8, Ordering}; -use super::{ - owned_buffers_io::{io_buf_ext::FullSlice, slice::SliceMutExt}, - FileGuard, Metadata, -}; +use super::owned_buffers_io::io_buf_ext::FullSlice; +use super::owned_buffers_io::slice::SliceMutExt; +use super::{FileGuard, Metadata}; #[cfg(target_os = "linux")] fn epoll_uring_error_to_std(e: tokio_epoll_uring::Error) -> std::io::Error { diff --git a/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs b/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs index c67215492f..ad17405b64 100644 --- a/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs +++ b/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs @@ -5,18 +5,16 @@ //! on older kernels, such as some (but not all) older kernels in the Linux 5.10 series. //! See for more details. 
-use std::sync::atomic::{AtomicU32, AtomicU64, Ordering}; use std::sync::Arc; - -use tokio_util::sync::CancellationToken; -use tracing::{error, info, info_span, warn, Instrument}; -use utils::backoff::{DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS}; +use std::sync::atomic::{AtomicU32, AtomicU64, Ordering}; use tokio_epoll_uring::{System, SystemHandle}; - -use crate::virtual_file::on_fatal_io_error; +use tokio_util::sync::CancellationToken; +use tracing::{Instrument, error, info, info_span, warn}; +use utils::backoff::{DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS}; use crate::metrics::tokio_epoll_uring::{self as metrics, THREAD_LOCAL_METRICS_STORAGE}; +use crate::virtual_file::on_fatal_io_error; #[derive(Clone)] struct ThreadLocalState(Arc); @@ -194,7 +192,7 @@ impl std::ops::Deref for Handle { fn deref(&self) -> &Self::Target { self.0 - .0 + .0 .cell .get() .expect("must be already initialized when using this") diff --git a/pageserver/src/virtual_file/open_options.rs b/pageserver/src/virtual_file/open_options.rs index 7f951270d1..e188b8649b 100644 --- a/pageserver/src/virtual_file/open_options.rs +++ b/pageserver/src/virtual_file/open_options.rs @@ -1,7 +1,9 @@ //! Enum-dispatch to the `OpenOptions` type of the respective [`super::IoEngineKind`]; +use std::os::fd::OwnedFd; +use std::path::Path; + use super::io_engine::IoEngine; -use std::{os::fd::OwnedFd, path::Path}; #[derive(Debug, Clone)] pub enum OpenOptions { diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs index a5c26cd746..090d2ece85 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs @@ -1,9 +1,9 @@ -use std::{ - ops::{Deref, Range, RangeBounds}, - sync::Arc, -}; +use std::ops::{Deref, Range, RangeBounds}; +use std::sync::Arc; -use super::{alignment::Alignment, raw::RawAlignedBuffer, AlignedBufferMut, ConstAlign}; +use super::alignment::Alignment; +use super::raw::RawAlignedBuffer; +use super::{AlignedBufferMut, ConstAlign}; /// An shared, immutable aligned buffer type. #[derive(Clone, Debug)] diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs index d2f5e206bb..df5c911e50 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs @@ -1,13 +1,9 @@ -use std::{ - mem::MaybeUninit, - ops::{Deref, DerefMut}, -}; +use std::mem::MaybeUninit; +use std::ops::{Deref, DerefMut}; -use super::{ - alignment::{Alignment, ConstAlign}, - buffer::AlignedBuffer, - raw::RawAlignedBuffer, -}; +use super::alignment::{Alignment, ConstAlign}; +use super::buffer::AlignedBuffer; +use super::raw::RawAlignedBuffer; /// A mutable aligned buffer type. #[derive(Debug)] @@ -75,7 +71,8 @@ impl AlignedBufferMut { /// Force the length of the buffer to `new_len`. #[inline] unsafe fn set_len(&mut self, new_len: usize) { - self.raw.set_len(new_len) + // SAFETY: the caller is unsafe + unsafe { self.raw.set_len(new_len) } } #[inline] @@ -222,8 +219,10 @@ unsafe impl bytes::BufMut for AlignedBufferMut { panic_advance(cnt, remaining); } - // Addition will not overflow since the sum is at most the capacity. - self.set_len(len + cnt); + // SAFETY: Addition will not overflow since the sum is at most the capacity. 
+ unsafe { + self.set_len(len + cnt); + } } #[inline] @@ -275,7 +274,10 @@ unsafe impl tokio_epoll_uring::IoBufMut for AlignedBufferMut { unsafe fn set_init(&mut self, init_len: usize) { if self.len() < init_len { - self.set_len(init_len); + // SAFETY: caller function is unsafe + unsafe { + self.set_len(init_len); + } } } } diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs index 6c26dec0db..97a6c4049a 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs @@ -1,9 +1,7 @@ use core::slice; -use std::{ - alloc::{self, Layout}, - cmp, - mem::ManuallyDrop, -}; +use std::alloc::{self, Layout}; +use std::cmp; +use std::mem::ManuallyDrop; use super::alignment::{Alignment, ConstAlign}; diff --git a/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs b/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs index 525f447b6d..4c671c2652 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs @@ -1,11 +1,12 @@ //! See [`FullSlice`]. -use crate::virtual_file::{IoBuffer, IoBufferMut}; -use bytes::{Bytes, BytesMut}; use std::ops::{Deref, Range}; + +use bytes::{Bytes, BytesMut}; use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice}; use super::write::CheapCloneForRead; +use crate::virtual_file::{IoBuffer, IoBufferMut}; /// The true owned equivalent for Rust [`slice`]. Use this for the write path. /// diff --git a/pageserver/src/virtual_file/owned_buffers_io/slice.rs b/pageserver/src/virtual_file/owned_buffers_io/slice.rs index 6100593663..9f4a05dd57 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/slice.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/slice.rs @@ -1,7 +1,4 @@ -use tokio_epoll_uring::BoundedBuf; -use tokio_epoll_uring::BoundedBufMut; -use tokio_epoll_uring::IoBufMut; -use tokio_epoll_uring::Slice; +use tokio_epoll_uring::{BoundedBuf, BoundedBufMut, IoBufMut, Slice}; pub(crate) trait SliceMutExt { /// Get a `&mut[0..self.bytes_total()`] slice, for when you need to do borrow-based IO. @@ -35,10 +32,11 @@ where mod tests { use std::io::Read; - use super::*; use bytes::Buf; use tokio_epoll_uring::Slice; + use super::*; + #[test] fn test_slice_full_zeroed() { let make_fake_file = || bytes::BytesMut::from(&b"12345"[..]).reader(); diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs index 7299d83703..861ca3aa2a 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -1,20 +1,14 @@ mod flush; use std::sync::Arc; +pub(crate) use flush::FlushControl; use flush::FlushHandle; use tokio_epoll_uring::IoBuf; -use crate::{ - context::RequestContext, - virtual_file::{IoBuffer, IoBufferMut}, -}; - -use super::{ - io_buf_aligned::IoBufAligned, - io_buf_ext::{FullSlice, IoBufExt}, -}; - -pub(crate) use flush::FlushControl; +use super::io_buf_aligned::IoBufAligned; +use super::io_buf_ext::{FullSlice, IoBufExt}; +use crate::context::RequestContext; +use crate::virtual_file::{IoBuffer, IoBufferMut}; pub(crate) trait CheapCloneForRead { /// Returns a cheap clone of the buffer. 
diff --git a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs index 9ce8b311bb..46309d4011 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs @@ -2,12 +2,10 @@ use std::sync::Arc; use utils::sync::duplex; -use crate::{ - context::RequestContext, - virtual_file::owned_buffers_io::{io_buf_aligned::IoBufAligned, io_buf_ext::FullSlice}, -}; - use super::{Buffer, CheapCloneForRead, OwnedAsyncWriter}; +use crate::context::RequestContext; +use crate::virtual_file::owned_buffers_io::io_buf_aligned::IoBufAligned; +use crate::virtual_file::owned_buffers_io::io_buf_ext::FullSlice; /// A handle to the flush task. pub struct FlushHandle { diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 45c87353a7..18df065f76 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -22,39 +22,35 @@ //! bespoken Rust code. use std::collections::HashMap; -use std::sync::Arc; -use std::sync::OnceLock; -use std::time::Duration; -use std::time::Instant; -use std::time::SystemTime; +use std::sync::{Arc, OnceLock}; +use std::time::{Duration, Instant, SystemTime}; -use anyhow::{bail, Result}; +use anyhow::{Result, bail}; use bytes::{Buf, Bytes}; -use tracing::*; - -use crate::context::RequestContext; -use crate::metrics::WAL_INGEST; -use crate::pgdatadir_mapping::{DatadirModification, Version}; -use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; -use crate::tenant::PageReconstructError; -use crate::tenant::Timeline; -use crate::ZERO_PAGE; use pageserver_api::key::rel_block_to_key; use pageserver_api::record::NeonWalRecord; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; use pageserver_api::shard::ShardIdentity; -use postgres_ffi::fsm_logical_to_physical; -use postgres_ffi::pg_constants; use postgres_ffi::relfile_utils::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::walrecord::*; -use postgres_ffi::TransactionId; -use postgres_ffi::{dispatch_pgversion, enum_pgversion, enum_pgversion_dispatch, TimestampTz}; +use postgres_ffi::{ + TimestampTz, TransactionId, dispatch_pgversion, enum_pgversion, enum_pgversion_dispatch, + fsm_logical_to_physical, pg_constants, +}; +use tracing::*; use utils::bin_ser::SerializeError; use utils::lsn::Lsn; use utils::rate_limit::RateLimit; use utils::{critical, failpoint_support}; use wal_decoder::models::*; +use crate::ZERO_PAGE; +use crate::context::RequestContext; +use crate::metrics::WAL_INGEST; +use crate::pgdatadir_mapping::{DatadirModification, Version}; +use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; +use crate::tenant::{PageReconstructError, Timeline}; + enum_pgversion! {CheckPoint, pgv::CheckPoint} impl CheckPoint { @@ -302,7 +298,9 @@ impl WalIngest { if xid > next_xid { // Wraparound occurred, must be from a prev epoch. if epoch == 0 { - bail!("apparent XID wraparound with prepared transaction XID {xid}, nextXid is {next_full_xid}"); + bail!( + "apparent XID wraparound with prepared transaction XID {xid}, nextXid is {next_full_xid}" + ); } epoch -= 1; } @@ -796,9 +794,7 @@ impl WalIngest { // Remove twophase file. 
see RemoveTwoPhaseFile() in postgres code trace!( "Drop twophaseFile for xid {} parsed_xact.xid {} here at {}", - xl_xid, - parsed.xid, - lsn, + xl_xid, parsed.xid, lsn, ); let xid: u64 = if modification.tline.pg_version >= 17 { @@ -1130,16 +1126,14 @@ impl WalIngest { let xlog_checkpoint = pgv::CheckPoint::decode(&checkpoint_bytes)?; trace!( "xlog_checkpoint.oldestXid={}, checkpoint.oldestXid={}", - xlog_checkpoint.oldestXid, - cp.oldestXid + xlog_checkpoint.oldestXid, cp.oldestXid ); if (cp.oldestXid.wrapping_sub(xlog_checkpoint.oldestXid) as i32) < 0 { cp.oldestXid = xlog_checkpoint.oldestXid; } trace!( "xlog_checkpoint.oldestActiveXid={}, checkpoint.oldestActiveXid={}", - xlog_checkpoint.oldestActiveXid, - cp.oldestActiveXid + xlog_checkpoint.oldestActiveXid, cp.oldestActiveXid ); // A shutdown checkpoint has `oldestActiveXid == InvalidTransactionid`, @@ -1368,8 +1362,9 @@ impl WalIngest { // with zero pages. Logging is rate limited per pg version to // avoid skewing. if gap_blocks_filled > 0 { - use once_cell::sync::Lazy; use std::sync::Mutex; + + use once_cell::sync::Lazy; use utils::rate_limit::RateLimit; struct RateLimitPerPgVersion { @@ -1475,10 +1470,7 @@ impl WalIngest { if new_nblocks > old_nblocks { trace!( "extending SLRU {:?} seg {} from {} to {} blocks", - kind, - segno, - old_nblocks, - new_nblocks + kind, segno, old_nblocks, new_nblocks ); modification.put_slru_extend(kind, segno, new_nblocks)?; @@ -1517,13 +1509,13 @@ async fn get_relsize( #[allow(clippy::bool_assert_comparison)] #[cfg(test)] mod tests { - use super::*; - use crate::tenant::harness::*; - use crate::tenant::remote_timeline_client::{remote_initdb_archive_path, INITDB_PATH}; - use crate::tenant::storage_layer::IoConcurrency; use postgres_ffi::RELSEG_SIZE; + use super::*; use crate::DEFAULT_PG_VERSION; + use crate::tenant::harness::*; + use crate::tenant::remote_timeline_client::{INITDB_PATH, remote_initdb_archive_path}; + use crate::tenant::storage_layer::IoConcurrency; /// Arbitrary relation tag, for testing. const TESTREL_A: RelTag = RelTag { @@ -1606,10 +1598,12 @@ mod tests { .await?, false ); - assert!(tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx) - .await - .is_err()); + assert!( + tline + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx) + .await + .is_err() + ); assert_eq!( tline .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx) @@ -1997,10 +1991,12 @@ mod tests { .await?, false ); - assert!(tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx) - .await - .is_err()); + assert!( + tline + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx) + .await + .is_err() + ); assert_eq!( tline @@ -2230,9 +2226,10 @@ mod tests { /// without waiting for unrelated steps. #[tokio::test] async fn test_ingest_real_wal() { - use crate::tenant::harness::*; - use postgres_ffi::waldecoder::WalStreamDecoder; use postgres_ffi::WAL_SEGMENT_SIZE; + use postgres_ffi::waldecoder::WalStreamDecoder; + + use crate::tenant::harness::*; // Define test data path and constants. // diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 027a6eb7d7..22d8d83811 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -24,26 +24,27 @@ mod process; /// Code to apply [`NeonWalRecord`]s. 
pub(crate) mod apply_neon; -use crate::config::PageServerConf; -use crate::metrics::{ - WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM, - WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_TIME, -}; +use std::future::Future; +use std::sync::Arc; +use std::time::{Duration, Instant}; + use anyhow::Context; use bytes::{Bytes, BytesMut}; use pageserver_api::key::Key; use pageserver_api::models::{WalRedoManagerProcessStatus, WalRedoManagerStatus}; use pageserver_api::record::NeonWalRecord; use pageserver_api::shard::TenantShardId; -use std::future::Future; -use std::sync::Arc; -use std::time::Duration; -use std::time::Instant; use tracing::*; use utils::lsn::Lsn; use utils::sync::gate::GateError; use utils::sync::heavier_once_cell; +use crate::config::PageServerConf; +use crate::metrics::{ + WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM, + WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_TIME, +}; + /// The real implementation that uses a Postgres process to /// perform WAL replay. /// @@ -547,15 +548,18 @@ impl PostgresRedoManager { #[cfg(test)] mod tests { - use super::PostgresRedoManager; - use crate::config::PageServerConf; + use std::str::FromStr; + use bytes::Bytes; use pageserver_api::key::Key; use pageserver_api::record::NeonWalRecord; use pageserver_api::shard::TenantShardId; - use std::str::FromStr; use tracing::Instrument; - use utils::{id::TenantId, lsn::Lsn}; + use utils::id::TenantId; + use utils::lsn::Lsn; + + use super::PostgresRedoManager; + use crate::config::PageServerConf; #[tokio::test] async fn test_ping() { diff --git a/pageserver/src/walredo/apply_neon.rs b/pageserver/src/walredo/apply_neon.rs index d62e325310..61ae1eb970 100644 --- a/pageserver/src/walredo/apply_neon.rs +++ b/pageserver/src/walredo/apply_neon.rs @@ -4,13 +4,12 @@ use bytes::BytesMut; use pageserver_api::key::Key; use pageserver_api::record::NeonWalRecord; use pageserver_api::reltag::SlruKind; -use postgres_ffi::pg_constants; use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM; use postgres_ffi::v14::nonrelfile_utils::{ mx_offset_to_flags_bitshift, mx_offset_to_flags_offset, mx_offset_to_member_offset, transaction_id_set_status, }; -use postgres_ffi::BLCKSZ; +use postgres_ffi::{BLCKSZ, pg_constants}; use tracing::*; use utils::lsn::Lsn; diff --git a/pageserver/src/walredo/process.rs b/pageserver/src/walredo/process.rs index bf30b92ea5..5a9fc63e63 100644 --- a/pageserver/src/walredo/process.rs +++ b/pageserver/src/walredo/process.rs @@ -2,28 +2,28 @@ mod no_leak_child; /// The IPC protocol that pageserver and walredo process speak over their shared pipe. 
mod protocol; -use self::no_leak_child::NoLeakChild; -use crate::{ - config::PageServerConf, - metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER}, - page_cache::PAGE_SZ, - span::debug_assert_current_span_has_tenant_id, -}; +use std::collections::VecDeque; +use std::process::{Command, Stdio}; +#[cfg(feature = "testing")] +use std::sync::atomic::AtomicUsize; +use std::time::Duration; + use anyhow::Context; use bytes::Bytes; use pageserver_api::record::NeonWalRecord; -use pageserver_api::{reltag::RelTag, shard::TenantShardId}; +use pageserver_api::reltag::RelTag; +use pageserver_api::shard::TenantShardId; use postgres_ffi::BLCKSZ; -#[cfg(feature = "testing")] -use std::sync::atomic::AtomicUsize; -use std::{ - collections::VecDeque, - process::{Command, Stdio}, - time::Duration, -}; use tokio::io::{AsyncReadExt, AsyncWriteExt}; -use tracing::{debug, error, instrument, Instrument}; -use utils::{lsn::Lsn, poison::Poison}; +use tracing::{Instrument, debug, error, instrument}; +use utils::lsn::Lsn; +use utils::poison::Poison; + +use self::no_leak_child::NoLeakChild; +use crate::config::PageServerConf; +use crate::metrics::{WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER, WalRedoKillCause}; +use crate::page_cache::PAGE_SZ; +use crate::span::debug_assert_current_span_has_tenant_id; pub struct WalRedoProcess { #[allow(dead_code)] diff --git a/pageserver/src/walredo/process/no_leak_child.rs b/pageserver/src/walredo/process/no_leak_child.rs index 1a0d7039df..9939fc4b36 100644 --- a/pageserver/src/walredo/process/no_leak_child.rs +++ b/pageserver/src/walredo/process/no_leak_child.rs @@ -1,19 +1,11 @@ -use tracing::instrument; -use tracing::{error, info}; - -use crate::metrics::WalRedoKillCause; -use crate::metrics::WAL_REDO_PROCESS_COUNTERS; - use std::io; -use std::process::Command; - -use std::ops::DerefMut; - -use std::ops::Deref; - -use std::process::Child; +use std::ops::{Deref, DerefMut}; +use std::process::{Child, Command}; use pageserver_api::shard::TenantShardId; +use tracing::{error, info, instrument}; + +use crate::metrics::{WAL_REDO_PROCESS_COUNTERS, WalRedoKillCause}; /// Wrapper type around `std::process::Child` which guarantees that the child /// will be killed and waited-for by this process before being dropped. diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index c86ac576ad..bb937ad56a 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "safekeeper" version = "0.1.0" -edition.workspace = true +edition = "2024" license.workspace = true [features] diff --git a/safekeeper/benches/receive_wal.rs b/safekeeper/benches/receive_wal.rs index 1c0ae66f01..122630d953 100644 --- a/safekeeper/benches/receive_wal.rs +++ b/safekeeper/benches/receive_wal.rs @@ -4,7 +4,7 @@ use std::io::Write as _; use bytes::BytesMut; use camino_tempfile::tempfile; -use criterion::{criterion_group, criterion_main, BatchSize, Bencher, Criterion}; +use criterion::{BatchSize, Bencher, Criterion, criterion_group, criterion_main}; use itertools::Itertools as _; use postgres_ffi::v17::wal_generator::{LogicalMessageGenerator, WalGenerator}; use pprof::criterion::{Output, PProfProfiler}; @@ -27,7 +27,7 @@ const GB: usize = 1024 * MB; static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; #[allow(non_upper_case_globals)] -#[export_name = "malloc_conf"] +#[unsafe(export_name = "malloc_conf")] pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0"; // Register benchmarks with Criterion. 
diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 6cc53e0d23..10fc4a4b59 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -1,52 +1,41 @@ // // Main entry point for the safekeeper executable // -use anyhow::{bail, Context, Result}; -use camino::{Utf8Path, Utf8PathBuf}; -use clap::{ArgAction, Parser}; -use futures::future::BoxFuture; -use futures::stream::FuturesUnordered; -use futures::{FutureExt, StreamExt}; -use remote_storage::RemoteStorageConfig; -use sd_notify::NotifyState; -use tokio::runtime::Handle; -use tokio::signal::unix::{signal, SignalKind}; -use tokio::task::JoinError; -use utils::logging::SecretString; - -use std::env::{var, VarError}; +use std::env::{VarError, var}; use std::fs::{self, File}; use std::io::{ErrorKind, Write}; use std::str::FromStr; use std::sync::Arc; use std::time::{Duration, Instant}; -use storage_broker::Uri; - -use tracing::*; -use utils::pid_file; +use anyhow::{Context, Result, bail}; +use camino::{Utf8Path, Utf8PathBuf}; +use clap::{ArgAction, Parser}; +use futures::future::BoxFuture; +use futures::stream::FuturesUnordered; +use futures::{FutureExt, StreamExt}; use metrics::set_build_info_metric; +use remote_storage::RemoteStorageConfig; use safekeeper::defaults::{ DEFAULT_CONTROL_FILE_SAVE_INTERVAL, DEFAULT_EVICTION_MIN_RESIDENT, DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES, DEFAULT_PARTIAL_BACKUP_CONCURRENCY, DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR, }; -use safekeeper::http; -use safekeeper::wal_service; -use safekeeper::GlobalTimelines; -use safekeeper::SafeKeeperConf; -use safekeeper::{broker, WAL_SERVICE_RUNTIME}; -use safekeeper::{control_file, BROKER_RUNTIME}; -use safekeeper::{wal_backup, HTTP_RUNTIME}; -use storage_broker::DEFAULT_ENDPOINT; -use utils::auth::{JwtAuth, Scope, SwappableJwtAuth}; -use utils::{ - id::NodeId, - logging::{self, LogFormat}, - project_build_tag, project_git_version, - sentry_init::init_sentry, - tcp_listener, +use safekeeper::{ + BROKER_RUNTIME, GlobalTimelines, HTTP_RUNTIME, SafeKeeperConf, WAL_SERVICE_RUNTIME, broker, + control_file, http, wal_backup, wal_service, }; +use sd_notify::NotifyState; +use storage_broker::{DEFAULT_ENDPOINT, Uri}; +use tokio::runtime::Handle; +use tokio::signal::unix::{SignalKind, signal}; +use tokio::task::JoinError; +use tracing::*; +use utils::auth::{JwtAuth, Scope, SwappableJwtAuth}; +use utils::id::NodeId; +use utils::logging::{self, LogFormat, SecretString}; +use utils::sentry_init::init_sentry; +use utils::{pid_file, project_build_tag, project_git_version, tcp_listener}; #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; @@ -55,7 +44,7 @@ static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; /// This adds roughly 3% overhead for allocations on average, which is acceptable considering /// performance-sensitive code will avoid allocations as far as possible anyway. #[allow(non_upper_case_globals)] -#[export_name = "malloc_conf"] +#[unsafe(export_name = "malloc_conf")] pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0"; const PID_FILE_NAME: &str = "safekeeper.pid"; diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index 4b091e2c29..de6e275124 100644 --- a/safekeeper/src/broker.rs +++ b/safekeeper/src/broker.rs @@ -1,39 +1,25 @@ //! Communication with the broker, providing safekeeper peers and pageserver coordination. 
-use anyhow::anyhow; -use anyhow::bail; -use anyhow::Context; - -use anyhow::Error; -use anyhow::Result; - -use storage_broker::parse_proto_ttid; - -use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey as ProtoSubscriptionKey; -use storage_broker::proto::FilterTenantTimelineId; -use storage_broker::proto::MessageType; -use storage_broker::proto::SafekeeperDiscoveryResponse; -use storage_broker::proto::SubscribeByFilterRequest; -use storage_broker::proto::SubscribeSafekeeperInfoRequest; -use storage_broker::proto::TypeSubscription; -use storage_broker::proto::TypedMessage; -use storage_broker::Request; - -use std::sync::atomic::AtomicU64; use std::sync::Arc; -use std::time::Duration; -use std::time::Instant; -use std::time::UNIX_EPOCH; +use std::sync::atomic::AtomicU64; +use std::time::{Duration, Instant, UNIX_EPOCH}; + +use anyhow::{Context, Error, Result, anyhow, bail}; +use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey as ProtoSubscriptionKey; +use storage_broker::proto::{ + FilterTenantTimelineId, MessageType, SafekeeperDiscoveryResponse, SubscribeByFilterRequest, + SubscribeSafekeeperInfoRequest, TypeSubscription, TypedMessage, +}; +use storage_broker::{Request, parse_proto_ttid}; use tokio::task::JoinHandle; use tokio::time::sleep; use tracing::*; -use crate::metrics::BROKER_ITERATION_TIMELINES; -use crate::metrics::BROKER_PULLED_UPDATES; -use crate::metrics::BROKER_PUSHED_UPDATES; -use crate::metrics::BROKER_PUSH_ALL_UPDATES_SECONDS; -use crate::GlobalTimelines; -use crate::SafeKeeperConf; +use crate::metrics::{ + BROKER_ITERATION_TIMELINES, BROKER_PULLED_UPDATES, BROKER_PUSH_ALL_UPDATES_SECONDS, + BROKER_PUSHED_UPDATES, +}; +use crate::{GlobalTimelines, SafeKeeperConf}; const RETRY_INTERVAL_MSEC: u64 = 1000; const PUSH_INTERVAL_MSEC: u64 = 1000; diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index 35aebfd8ad..1bf3e4cac1 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -1,24 +1,23 @@ //! Control file serialization, deserialization and persistence. 
-use anyhow::{bail, ensure, Context, Result}; -use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; -use camino::{Utf8Path, Utf8PathBuf}; -use safekeeper_api::membership::INVALID_GENERATION; -use tokio::fs::File; -use tokio::io::AsyncWriteExt; -use utils::crashsafe::durable_rename; - use std::future::Future; use std::io::Read; use std::ops::Deref; use std::path::Path; use std::time::Instant; -use crate::control_file_upgrade::downgrade_v10_to_v9; -use crate::control_file_upgrade::upgrade_control_file; +use anyhow::{Context, Result, bail, ensure}; +use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; +use camino::{Utf8Path, Utf8PathBuf}; +use safekeeper_api::membership::INVALID_GENERATION; +use tokio::fs::File; +use tokio::io::AsyncWriteExt; +use utils::bin_ser::LeSer; +use utils::crashsafe::durable_rename; + +use crate::control_file_upgrade::{downgrade_v10_to_v9, upgrade_control_file}; use crate::metrics::PERSIST_CONTROL_FILE_SECONDS; use crate::state::{EvictionState, TimelinePersistentState}; -use utils::bin_ser::LeSer; pub const SK_MAGIC: u32 = 0xcafeceefu32; pub const SK_FORMAT_VERSION: u32 = 10; @@ -234,11 +233,12 @@ impl Storage for FileStorage { #[cfg(test)] mod test { - use super::*; use safekeeper_api::membership::{Configuration, MemberSet, SafekeeperGeneration}; use tokio::fs; use utils::lsn::Lsn; + use super::*; + const NO_SYNC: bool = true; #[tokio::test] diff --git a/safekeeper/src/control_file_upgrade.rs b/safekeeper/src/control_file_upgrade.rs index 904e79f976..1ad9e62f9b 100644 --- a/safekeeper/src/control_file_upgrade.rs +++ b/safekeeper/src/control_file_upgrade.rs @@ -1,24 +1,19 @@ //! Code to deal with safekeeper control file upgrades use std::vec; -use crate::{ - safekeeper::{AcceptorState, PgUuid, TermHistory, TermLsn}, - state::{EvictionState, TimelinePersistentState}, - wal_backup_partial, -}; -use anyhow::{bail, Result}; +use anyhow::{Result, bail}; use pq_proto::SystemId; -use safekeeper_api::{ - membership::{Configuration, INVALID_GENERATION}, - ServerInfo, Term, -}; +use safekeeper_api::membership::{Configuration, INVALID_GENERATION}; +use safekeeper_api::{ServerInfo, Term}; use serde::{Deserialize, Serialize}; use tracing::*; -use utils::{ - bin_ser::LeSer, - id::{NodeId, TenantId, TimelineId}, - lsn::Lsn, -}; +use utils::bin_ser::LeSer; +use utils::id::{NodeId, TenantId, TimelineId}; +use utils::lsn::Lsn; + +use crate::safekeeper::{AcceptorState, PgUuid, TermHistory, TermLsn}; +use crate::state::{EvictionState, TimelinePersistentState}; +use crate::wal_backup_partial; /// Persistent consensus state of the acceptor. 
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] @@ -552,11 +547,11 @@ pub fn downgrade_v10_to_v9(state: &TimelinePersistentState) -> TimelinePersisten mod tests { use std::str::FromStr; - use utils::{id::NodeId, Hex}; - - use crate::control_file_upgrade::PersistedPeerInfo; + use utils::Hex; + use utils::id::NodeId; use super::*; + use crate::control_file_upgrade::PersistedPeerInfo; #[test] fn roundtrip_v1() { diff --git a/safekeeper/src/copy_timeline.rs b/safekeeper/src/copy_timeline.rs index 10a761e1f5..11daff22cb 100644 --- a/safekeeper/src/copy_timeline.rs +++ b/safekeeper/src/copy_timeline.rs @@ -1,24 +1,22 @@ -use anyhow::{bail, Result}; +use std::sync::Arc; + +use anyhow::{Result, bail}; use camino::Utf8PathBuf; use postgres_ffi::{MAX_SEND_SIZE, WAL_SEGMENT_SIZE}; use safekeeper_api::membership::Configuration; -use std::sync::Arc; -use tokio::{ - fs::OpenOptions, - io::{AsyncSeekExt, AsyncWriteExt}, -}; +use tokio::fs::OpenOptions; +use tokio::io::{AsyncSeekExt, AsyncWriteExt}; use tracing::{info, warn}; -use utils::{id::TenantTimelineId, lsn::Lsn}; +use utils::id::TenantTimelineId; +use utils::lsn::Lsn; -use crate::{ - control_file::FileStorage, - state::TimelinePersistentState, - timeline::{TimelineError, WalResidentTimeline}, - timelines_global_map::{create_temp_timeline_dir, validate_temp_timeline}, - wal_backup::copy_s3_segments, - wal_storage::{wal_file_paths, WalReader}, - GlobalTimelines, -}; +use crate::GlobalTimelines; +use crate::control_file::FileStorage; +use crate::state::TimelinePersistentState; +use crate::timeline::{TimelineError, WalResidentTimeline}; +use crate::timelines_global_map::{create_temp_timeline_dir, validate_temp_timeline}; +use crate::wal_backup::copy_s3_segments; +use crate::wal_storage::{WalReader, wal_file_paths}; // we don't want to have more than 10 segments on disk after copy, because they take space const MAX_BACKUP_LAG: u64 = 10 * WAL_SEGMENT_SIZE as u64; diff --git a/safekeeper/src/debug_dump.rs b/safekeeper/src/debug_dump.rs index 19362a0992..68a38e1498 100644 --- a/safekeeper/src/debug_dump.rs +++ b/safekeeper/src/debug_dump.rs @@ -2,37 +2,25 @@ use std::fs; use std::fs::DirEntry; -use std::io::BufReader; -use std::io::Read; +use std::io::{BufReader, Read}; use std::path::PathBuf; use std::sync::Arc; -use anyhow::bail; -use anyhow::Result; -use camino::Utf8Path; -use camino::Utf8PathBuf; +use anyhow::{Result, bail}; +use camino::{Utf8Path, Utf8PathBuf}; use chrono::{DateTime, Utc}; -use postgres_ffi::XLogSegNo; -use postgres_ffi::MAX_SEND_SIZE; -use safekeeper_api::models::WalSenderState; -use serde::Deserialize; -use serde::Serialize; - use postgres_ffi::v14::xlog_utils::{IsPartialXLogFileName, IsXLogFileName}; +use postgres_ffi::{MAX_SEND_SIZE, XLogSegNo}; +use safekeeper_api::models::WalSenderState; +use serde::{Deserialize, Serialize}; use sha2::{Digest, Sha256}; -use utils::id::NodeId; -use utils::id::TenantTimelineId; -use utils::id::{TenantId, TimelineId}; +use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId}; use utils::lsn::Lsn; use crate::safekeeper::TermHistory; -use crate::state::TimelineMemState; -use crate::state::TimelinePersistentState; -use crate::timeline::get_timeline_dir; -use crate::timeline::WalResidentTimeline; -use crate::timeline_manager; -use crate::GlobalTimelines; -use crate::SafeKeeperConf; +use crate::state::{TimelineMemState, TimelinePersistentState}; +use crate::timeline::{WalResidentTimeline, get_timeline_dir}; +use crate::{GlobalTimelines, SafeKeeperConf, timeline_manager}; /// Various 
filters that influence the resulting JSON output. #[derive(Debug, Serialize, Deserialize, Clone)] diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index e77eeb4130..dd7008c87d 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -1,35 +1,32 @@ //! Part of Safekeeper pretending to be Postgres, i.e. handling Postgres //! protocol commands. +use std::future::Future; +use std::str::{self, FromStr}; +use std::sync::Arc; + use anyhow::Context; use pageserver_api::models::ShardParameters; use pageserver_api::shard::{ShardIdentity, ShardStripeSize}; -use safekeeper_api::models::ConnectionId; +use postgres_backend::{PostgresBackend, QueryError}; +use postgres_ffi::PG_TLI; +use pq_proto::{BeMessage, FeStartupPacket, INT4_OID, RowDescriptor, TEXT_OID}; +use regex::Regex; use safekeeper_api::Term; -use std::future::Future; -use std::str::{self, FromStr}; -use std::sync::Arc; +use safekeeper_api::models::ConnectionId; use tokio::io::{AsyncRead, AsyncWrite}; -use tracing::{debug, info, info_span, Instrument}; +use tracing::{Instrument, debug, info, info_span}; +use utils::auth::{Claims, JwtAuth, Scope}; +use utils::id::{TenantId, TenantTimelineId, TimelineId}; +use utils::lsn::Lsn; use utils::postgres_client::PostgresClientProtocol; use utils::shard::{ShardCount, ShardNumber}; use crate::auth::check_permission; -use crate::json_ctrl::{handle_json_ctrl, AppendLogicalMessage}; - -use crate::metrics::{TrafficMetrics, PG_QUERIES_GAUGE}; +use crate::json_ctrl::{AppendLogicalMessage, handle_json_ctrl}; +use crate::metrics::{PG_QUERIES_GAUGE, TrafficMetrics}; use crate::timeline::TimelineError; use crate::{GlobalTimelines, SafeKeeperConf}; -use postgres_backend::PostgresBackend; -use postgres_backend::QueryError; -use postgres_ffi::PG_TLI; -use pq_proto::{BeMessage, FeStartupPacket, RowDescriptor, INT4_OID, TEXT_OID}; -use regex::Regex; -use utils::auth::{Claims, JwtAuth, Scope}; -use utils::{ - id::{TenantId, TenantTimelineId, TimelineId}, - lsn::Lsn, -}; /// Safekeeper handler of postgres commands pub struct SafekeeperPostgresHandler { diff --git a/safekeeper/src/http/mod.rs b/safekeeper/src/http/mod.rs index 6e160b7a5e..f162985ef7 100644 --- a/safekeeper/src/http/mod.rs +++ b/safekeeper/src/http/mod.rs @@ -1,9 +1,9 @@ pub mod routes; -pub use routes::make_router; - -pub use safekeeper_api::models; use std::sync::Arc; +pub use routes::make_router; +pub use safekeeper_api::models; + use crate::{GlobalTimelines, SafeKeeperConf}; pub async fn task_main( diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index cd2ac5f44c..3b3bc71ac4 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -1,51 +1,41 @@ -use http_utils::failpoints::failpoints_handler; -use hyper::{Body, Request, Response, StatusCode}; -use safekeeper_api::models; -use safekeeper_api::models::AcceptorStateStatus; -use safekeeper_api::models::PullTimelineRequest; -use safekeeper_api::models::SafekeeperStatus; -use safekeeper_api::models::TermSwitchApiEntry; -use safekeeper_api::models::TimelineStatus; -use safekeeper_api::ServerInfo; use std::collections::HashMap; use std::fmt; use std::io::Write as _; use std::str::FromStr; use std::sync::Arc; -use storage_broker::proto::SafekeeperTimelineInfo; -use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; + +use http_utils::endpoint::{ + self, ChannelWriter, auth_middleware, check_permission_with, profile_cpu_handler, + profile_heap_handler, prometheus_metrics_handler, request_span, +}; +use 
http_utils::error::ApiError; +use http_utils::failpoints::failpoints_handler; +use http_utils::json::{json_request, json_response}; +use http_utils::request::{ensure_no_body, parse_query_param, parse_request_param}; +use http_utils::{RequestExt, RouterBuilder}; +use hyper::{Body, Request, Response, StatusCode}; +use postgres_ffi::WAL_SEGMENT_SIZE; +use safekeeper_api::models::{ + AcceptorStateStatus, PullTimelineRequest, SafekeeperStatus, SkTimelineInfo, TermSwitchApiEntry, + TimelineCopyRequest, TimelineCreateRequest, TimelineStatus, TimelineTermBumpRequest, +}; +use safekeeper_api::{ServerInfo, models}; +use storage_broker::proto::{SafekeeperTimelineInfo, TenantTimelineId as ProtoTenantTimelineId}; use tokio::sync::mpsc; use tokio::task; use tokio_stream::wrappers::ReceiverStream; use tokio_util::sync::CancellationToken; -use tracing::{info_span, Instrument}; - -use http_utils::endpoint::{ - profile_cpu_handler, profile_heap_handler, prometheus_metrics_handler, request_span, -}; -use http_utils::{ - endpoint::{self, auth_middleware, check_permission_with, ChannelWriter}, - error::ApiError, - json::{json_request, json_response}, - request::{ensure_no_body, parse_query_param, parse_request_param}, - RequestExt, RouterBuilder, -}; - -use postgres_ffi::WAL_SEGMENT_SIZE; -use safekeeper_api::models::{SkTimelineInfo, TimelineCopyRequest}; -use safekeeper_api::models::{TimelineCreateRequest, TimelineTermBumpRequest}; -use utils::{ - auth::SwappableJwtAuth, - id::{TenantId, TenantTimelineId, TimelineId}, - lsn::Lsn, -}; +use tracing::{Instrument, info_span}; +use utils::auth::SwappableJwtAuth; +use utils::id::{TenantId, TenantTimelineId, TimelineId}; +use utils::lsn::Lsn; use crate::debug_dump::TimelineDigestRequest; use crate::safekeeper::TermLsn; use crate::timelines_global_map::TimelineDeleteForceResult; -use crate::GlobalTimelines; -use crate::SafeKeeperConf; -use crate::{copy_timeline, debug_dump, patch_control_file, pull_timeline}; +use crate::{ + GlobalTimelines, SafeKeeperConf, copy_timeline, debug_dump, patch_control_file, pull_timeline, +}; /// Healthcheck handler. async fn status_handler(request: Request) -> Result, ApiError> { diff --git a/safekeeper/src/json_ctrl.rs b/safekeeper/src/json_ctrl.rs index 8d7c1109ad..793ea9c3e9 100644 --- a/safekeeper/src/json_ctrl.rs +++ b/safekeeper/src/json_ctrl.rs @@ -7,26 +7,23 @@ //! 
use anyhow::Context; -use postgres_backend::QueryError; +use postgres_backend::{PostgresBackend, QueryError}; +use postgres_ffi::{WAL_SEGMENT_SIZE, encode_logical_message}; +use pq_proto::{BeMessage, RowDescriptor, TEXT_OID}; use safekeeper_api::membership::{Configuration, INVALID_GENERATION}; use safekeeper_api::{ServerInfo, Term}; use serde::{Deserialize, Serialize}; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::*; +use utils::lsn::Lsn; use crate::handler::SafekeeperPostgresHandler; -use crate::safekeeper::{AcceptorProposerMessage, AppendResponse}; use crate::safekeeper::{ - AppendRequest, AppendRequestHeader, ProposerAcceptorMessage, ProposerElected, + AcceptorProposerMessage, AppendRequest, AppendRequestHeader, AppendResponse, + ProposerAcceptorMessage, ProposerElected, TermHistory, TermLsn, }; -use crate::safekeeper::{TermHistory, TermLsn}; use crate::state::TimelinePersistentState; use crate::timeline::WalResidentTimeline; -use postgres_backend::PostgresBackend; -use postgres_ffi::encode_logical_message; -use postgres_ffi::WAL_SEGMENT_SIZE; -use pq_proto::{BeMessage, RowDescriptor, TEXT_OID}; -use utils::lsn::Lsn; #[derive(Serialize, Deserialize, Debug)] pub struct AppendLogicalMessage { diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index e0090c638a..c52b097066 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -2,15 +2,16 @@ extern crate hyper0 as hyper; +use std::time::Duration; + use camino::Utf8PathBuf; use once_cell::sync::Lazy; use remote_storage::RemoteStorageConfig; -use tokio::runtime::Runtime; - -use std::time::Duration; use storage_broker::Uri; - -use utils::{auth::SwappableJwtAuth, id::NodeId, logging::SecretString}; +use tokio::runtime::Runtime; +use utils::auth::SwappableJwtAuth; +use utils::id::NodeId; +use utils::logging::SecretString; mod auth; pub mod broker; @@ -48,6 +49,7 @@ pub mod test_utils; mod timelines_global_map; use std::sync::Arc; + pub use timelines_global_map::GlobalTimelines; use utils::auth::JwtAuth; diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index 3ea9e3d674..cb21a5f6d2 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -1,30 +1,28 @@ //! Global safekeeper mertics and per-timeline safekeeper metrics. 
-use std::{ - sync::{Arc, RwLock}, - time::{Instant, SystemTime}, -}; +use std::sync::{Arc, RwLock}; +use std::time::{Instant, SystemTime}; use anyhow::Result; use futures::Future; +use metrics::core::{AtomicU64, Collector, Desc, GenericCounter, GenericGaugeVec, Opts}; +use metrics::proto::MetricFamily; use metrics::{ - core::{AtomicU64, Collector, Desc, GenericCounter, GenericGaugeVec, Opts}, - pow2_buckets, - proto::MetricFamily, + DISK_FSYNC_SECONDS_BUCKETS, Gauge, GaugeVec, Histogram, HistogramVec, IntCounter, + IntCounterPair, IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, pow2_buckets, register_histogram, register_histogram_vec, register_int_counter, register_int_counter_pair, register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge, - register_int_gauge_vec, Gauge, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair, - IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, DISK_FSYNC_SECONDS_BUCKETS, + register_int_gauge_vec, }; use once_cell::sync::Lazy; use postgres_ffi::XLogSegNo; -use utils::{id::TenantTimelineId, lsn::Lsn, pageserver_feedback::PageserverFeedback}; +use utils::id::TenantTimelineId; +use utils::lsn::Lsn; +use utils::pageserver_feedback::PageserverFeedback; -use crate::{ - receive_wal::MSG_QUEUE_SIZE, - state::{TimelineMemState, TimelinePersistentState}, - GlobalTimelines, -}; +use crate::GlobalTimelines; +use crate::receive_wal::MSG_QUEUE_SIZE; +use crate::state::{TimelineMemState, TimelinePersistentState}; // Global metrics across all timelines. pub static WRITE_WAL_BYTES: Lazy = Lazy::new(|| { diff --git a/safekeeper/src/patch_control_file.rs b/safekeeper/src/patch_control_file.rs index 2136d1b5f7..efdbd9b3d7 100644 --- a/safekeeper/src/patch_control_file.rs +++ b/safekeeper/src/patch_control_file.rs @@ -4,7 +4,8 @@ use serde::{Deserialize, Serialize}; use serde_json::Value; use tracing::info; -use crate::{state::TimelinePersistentState, timeline::Timeline}; +use crate::state::TimelinePersistentState; +use crate::timeline::Timeline; #[derive(Deserialize, Debug, Clone)] pub struct Request { diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index 4827b73074..fc58b8509a 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -1,46 +1,38 @@ -use anyhow::{anyhow, bail, Context, Result}; +use std::cmp::min; +use std::io::{self, ErrorKind}; +use std::sync::Arc; + +use anyhow::{Context, Result, anyhow, bail}; use bytes::Bytes; use camino::Utf8PathBuf; use chrono::{DateTime, Utc}; use futures::{SinkExt, StreamExt, TryStreamExt}; -use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI}; -use safekeeper_api::{ - models::{PullTimelineRequest, PullTimelineResponse, TimelineStatus}, - Term, -}; +use postgres_ffi::{PG_TLI, XLogFileName, XLogSegNo}; +use safekeeper_api::Term; +use safekeeper_api::models::{PullTimelineRequest, PullTimelineResponse, TimelineStatus}; use safekeeper_client::mgmt_api; use safekeeper_client::mgmt_api::Client; use serde::Deserialize; -use std::{ - cmp::min, - io::{self, ErrorKind}, - sync::Arc, -}; -use tokio::{fs::OpenOptions, io::AsyncWrite, sync::mpsc, task}; +use tokio::fs::OpenOptions; +use tokio::io::AsyncWrite; +use tokio::sync::mpsc; +use tokio::task; use tokio_tar::{Archive, Builder, Header}; -use tokio_util::{ - io::{CopyToBytes, SinkWriter}, - sync::PollSender, -}; +use tokio_util::io::{CopyToBytes, SinkWriter}; +use tokio_util::sync::PollSender; use tracing::{error, info, instrument}; +use utils::crashsafe::fsync_async_opt; +use 
utils::id::{NodeId, TenantTimelineId}; +use utils::logging::SecretString; +use utils::lsn::Lsn; +use utils::pausable_failpoint; -use crate::{ - control_file::CONTROL_FILE_NAME, - debug_dump, - state::{EvictionState, TimelinePersistentState}, - timeline::{Timeline, WalResidentTimeline}, - timelines_global_map::{create_temp_timeline_dir, validate_temp_timeline}, - wal_backup, - wal_storage::open_wal_file, - GlobalTimelines, -}; -use utils::{ - crashsafe::fsync_async_opt, - id::{NodeId, TenantTimelineId}, - logging::SecretString, - lsn::Lsn, - pausable_failpoint, -}; +use crate::control_file::CONTROL_FILE_NAME; +use crate::state::{EvictionState, TimelinePersistentState}; +use crate::timeline::{Timeline, WalResidentTimeline}; +use crate::timelines_global_map::{create_temp_timeline_dir, validate_temp_timeline}; +use crate::wal_storage::open_wal_file; +use crate::{GlobalTimelines, debug_dump, wal_backup}; /// Stream tar archive of timeline to tx. #[instrument(name = "snapshot", skip_all, fields(ttid = %tli.ttid))] @@ -374,8 +366,13 @@ impl WalResidentTimeline { // change, but as long as older history is strictly part of new that's // fine), but there is no need to do it. if bctx.term != term || bctx.last_log_term != last_log_term { - bail!("term(s) changed during snapshot: were term={}, last_log_term={}, now term={}, last_log_term={}", - bctx.term, bctx.last_log_term, term, last_log_term); + bail!( + "term(s) changed during snapshot: were term={}, last_log_term={}, now term={}, last_log_term={}", + bctx.term, + bctx.last_log_term, + term, + last_log_term + ); } Ok(()) } diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index a94e6930e1..7967acde3f 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -2,35 +2,21 @@ //! Gets messages from the network, passes them down to consensus module and //! sends replies back. 
-use crate::handler::SafekeeperPostgresHandler; -use crate::metrics::{ - WAL_RECEIVERS, WAL_RECEIVER_QUEUE_DEPTH, WAL_RECEIVER_QUEUE_DEPTH_TOTAL, - WAL_RECEIVER_QUEUE_SIZE_TOTAL, -}; -use crate::safekeeper::AcceptorProposerMessage; -use crate::safekeeper::ProposerAcceptorMessage; -use crate::timeline::WalResidentTimeline; -use crate::GlobalTimelines; -use anyhow::{anyhow, Context}; -use bytes::BytesMut; -use parking_lot::MappedMutexGuard; -use parking_lot::Mutex; -use parking_lot::MutexGuard; -use postgres_backend::CopyStreamHandlerEnd; -use postgres_backend::PostgresBackend; -use postgres_backend::PostgresBackendReader; -use postgres_backend::QueryError; -use pq_proto::BeMessage; -use safekeeper_api::membership::Configuration; -use safekeeper_api::models::{ConnectionId, WalReceiverState, WalReceiverStatus}; -use safekeeper_api::ServerInfo; use std::future; use std::net::SocketAddr; use std::sync::Arc; -use tokio::io::AsyncRead; -use tokio::io::AsyncWrite; + +use anyhow::{Context, anyhow}; +use bytes::BytesMut; +use parking_lot::{MappedMutexGuard, Mutex, MutexGuard}; +use postgres_backend::{CopyStreamHandlerEnd, PostgresBackend, PostgresBackendReader, QueryError}; +use pq_proto::BeMessage; +use safekeeper_api::ServerInfo; +use safekeeper_api::membership::Configuration; +use safekeeper_api::models::{ConnectionId, WalReceiverState, WalReceiverStatus}; +use tokio::io::{AsyncRead, AsyncWrite}; use tokio::sync::mpsc::error::SendTimeoutError; -use tokio::sync::mpsc::{channel, Receiver, Sender}; +use tokio::sync::mpsc::{Receiver, Sender, channel}; use tokio::task; use tokio::task::JoinHandle; use tokio::time::{Duration, Instant, MissedTickBehavior}; @@ -39,6 +25,15 @@ use utils::id::TenantTimelineId; use utils::lsn::Lsn; use utils::pageserver_feedback::PageserverFeedback; +use crate::GlobalTimelines; +use crate::handler::SafekeeperPostgresHandler; +use crate::metrics::{ + WAL_RECEIVER_QUEUE_DEPTH, WAL_RECEIVER_QUEUE_DEPTH_TOTAL, WAL_RECEIVER_QUEUE_SIZE_TOTAL, + WAL_RECEIVERS, +}; +use crate::safekeeper::{AcceptorProposerMessage, ProposerAcceptorMessage}; +use crate::timeline::WalResidentTimeline; + const DEFAULT_FEEDBACK_CAPACITY: usize = 8; /// Registry of WalReceivers (compute connections). Timeline holds it (wrapped @@ -371,7 +366,7 @@ impl NetworkReader<'_, IO> { _ => { return Err(CopyStreamHandlerEnd::Other(anyhow::anyhow!( "unexpected message {next_msg:?} instead of greeting" - ))) + ))); } }; Ok((tli, next_msg)) diff --git a/safekeeper/src/recovery.rs b/safekeeper/src/recovery.rs index 3e9080ebbe..c2760792b8 100644 --- a/safekeeper/src/recovery.rs +++ b/safekeeper/src/recovery.rs @@ -1,40 +1,36 @@ //! This module implements pulling WAL from peer safekeepers if compute can't //! provide it, i.e. safekeeper lags too much. 
+use std::fmt; +use std::pin::pin; use std::time::SystemTime; -use std::{fmt, pin::pin}; -use anyhow::{bail, Context}; +use anyhow::{Context, bail}; use futures::StreamExt; use postgres_protocol::message::backend::ReplicationMessage; +use safekeeper_api::Term; use safekeeper_api::membership::INVALID_GENERATION; use safekeeper_api::models::{PeerInfo, TimelineStatus}; -use safekeeper_api::Term; -use tokio::sync::mpsc::{channel, Receiver, Sender}; -use tokio::time::timeout; -use tokio::{ - select, - time::sleep, - time::{self, Duration}, -}; +use tokio::select; +use tokio::sync::mpsc::{Receiver, Sender, channel}; +use tokio::time::{self, Duration, sleep, timeout}; use tokio_postgres::replication::ReplicationStream; use tokio_postgres::types::PgLsn; use tracing::*; -use utils::postgres_client::{ConnectionConfigArgs, PostgresClientProtocol}; -use utils::{id::NodeId, lsn::Lsn, postgres_client::wal_stream_connection_config}; - -use crate::receive_wal::{WalAcceptor, REPLY_QUEUE_SIZE}; -use crate::safekeeper::{AppendRequest, AppendRequestHeader}; -use crate::timeline::WalResidentTimeline; -use crate::{ - receive_wal::MSG_QUEUE_SIZE, - safekeeper::{ - AcceptorProposerMessage, ProposerAcceptorMessage, ProposerElected, TermHistory, TermLsn, - VoteRequest, - }, - SafeKeeperConf, +use utils::id::NodeId; +use utils::lsn::Lsn; +use utils::postgres_client::{ + ConnectionConfigArgs, PostgresClientProtocol, wal_stream_connection_config, }; +use crate::SafeKeeperConf; +use crate::receive_wal::{MSG_QUEUE_SIZE, REPLY_QUEUE_SIZE, WalAcceptor}; +use crate::safekeeper::{ + AcceptorProposerMessage, AppendRequest, AppendRequestHeader, ProposerAcceptorMessage, + ProposerElected, TermHistory, TermLsn, VoteRequest, +}; +use crate::timeline::WalResidentTimeline; + /// Entrypoint for per timeline task which always runs, checking whether /// recovery for this safekeeper is needed and starting it if so. #[instrument(name = "recovery", skip_all, fields(ttid = %tli.ttid))] @@ -355,7 +351,9 @@ async fn recovery_stream( { Ok(client_and_conn) => client_and_conn?, Err(_elapsed) => { - bail!("timed out while waiting {connect_timeout:?} for connection to peer safekeeper to open"); + bail!( + "timed out while waiting {connect_timeout:?} for connection to peer safekeeper to open" + ); } }; trace!("connected to {:?}", donor); diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index f429cafed2..0edac04b97 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -1,39 +1,31 @@ //! Acceptor part of proposer-acceptor consensus algorithm. 
-use anyhow::{bail, Context, Result}; -use byteorder::{LittleEndian, ReadBytesExt}; -use bytes::{Buf, BufMut, Bytes, BytesMut}; - -use postgres_ffi::{TimeLineID, MAX_SEND_SIZE}; -use safekeeper_api::membership; -use safekeeper_api::membership::MemberSet; -use safekeeper_api::membership::SafekeeperGeneration as Generation; -use safekeeper_api::membership::SafekeeperId; -use safekeeper_api::membership::INVALID_GENERATION; -use safekeeper_api::models::HotStandbyFeedback; -use safekeeper_api::Term; -use serde::{Deserialize, Serialize}; -use std::cmp::max; -use std::cmp::min; +use std::cmp::{max, min}; use std::fmt; use std::io::Read; use std::str::FromStr; -use storage_broker::proto::SafekeeperTimelineInfo; -use tracing::*; - -use crate::control_file; -use crate::metrics::MISC_OPERATION_SECONDS; - -use crate::state::TimelineState; -use crate::wal_storage; +use anyhow::{Context, Result, bail}; +use byteorder::{LittleEndian, ReadBytesExt}; +use bytes::{Buf, BufMut, Bytes, BytesMut}; +use postgres_ffi::{MAX_SEND_SIZE, TimeLineID}; use pq_proto::SystemId; -use utils::pageserver_feedback::PageserverFeedback; -use utils::{ - bin_ser::LeSer, - id::{NodeId, TenantId, TimelineId}, - lsn::Lsn, +use safekeeper_api::membership::{ + INVALID_GENERATION, MemberSet, SafekeeperGeneration as Generation, SafekeeperId, }; +use safekeeper_api::models::HotStandbyFeedback; +use safekeeper_api::{Term, membership}; +use serde::{Deserialize, Serialize}; +use storage_broker::proto::SafekeeperTimelineInfo; +use tracing::*; +use utils::bin_ser::LeSer; +use utils::id::{NodeId, TenantId, TimelineId}; +use utils::lsn::Lsn; +use utils::pageserver_feedback::PageserverFeedback; + +use crate::metrics::MISC_OPERATION_SECONDS; +use crate::state::TimelineState; +use crate::{control_file, wal_storage}; pub const SK_PROTO_VERSION_2: u32 = 2; pub const SK_PROTO_VERSION_3: u32 = 3; @@ -1137,9 +1129,14 @@ where // and walproposer recalculates the streaming point. OTOH repeating // error indicates a serious bug. if last_common_point.lsn != msg.start_streaming_at { - bail!("refusing ProposerElected with unexpected truncation point: lcp={:?} start_streaming_at={}, term={}, sk_th={:?} flush_lsn={}, wp_th={:?}", - last_common_point, msg.start_streaming_at, - self.state.acceptor_state.term, sk_th, self.flush_lsn(), msg.term_history, + bail!( + "refusing ProposerElected with unexpected truncation point: lcp={:?} start_streaming_at={}, term={}, sk_th={:?} flush_lsn={}, wp_th={:?}", + last_common_point, + msg.start_streaming_at, + self.state.acceptor_state.term, + sk_th, + self.flush_lsn(), + msg.term_history, ); } @@ -1147,8 +1144,12 @@ where assert!( msg.start_streaming_at >= self.state.inmem.commit_lsn, "attempt to truncate committed data: start_streaming_at={}, commit_lsn={}, term={}, sk_th={:?} flush_lsn={}, wp_th={:?}", - msg.start_streaming_at, self.state.inmem.commit_lsn, - self.state.acceptor_state.term, sk_th, self.flush_lsn(), msg.term_history, + msg.start_streaming_at, + self.state.inmem.commit_lsn, + self.state.acceptor_state.term, + sk_th, + self.flush_lsn(), + msg.term_history, ); // Before first WAL write initialize its segment. 
It makes first segment @@ -1373,21 +1374,19 @@ where #[cfg(test)] mod tests { - use futures::future::BoxFuture; + use std::ops::Deref; + use std::str::FromStr; + use std::time::{Instant, UNIX_EPOCH}; - use postgres_ffi::{XLogSegNo, WAL_SEGMENT_SIZE}; - use safekeeper_api::{ - membership::{Configuration, MemberSet, SafekeeperGeneration, SafekeeperId}, - ServerInfo, + use futures::future::BoxFuture; + use postgres_ffi::{WAL_SEGMENT_SIZE, XLogSegNo}; + use safekeeper_api::ServerInfo; + use safekeeper_api::membership::{ + Configuration, MemberSet, SafekeeperGeneration, SafekeeperId, }; use super::*; use crate::state::{EvictionState, TimelinePersistentState}; - use std::{ - ops::Deref, - str::FromStr, - time::{Instant, UNIX_EPOCH}, - }; // fake storage for tests struct InMemoryState { diff --git a/safekeeper/src/send_interpreted_wal.rs b/safekeeper/src/send_interpreted_wal.rs index 0662bb9518..be0c849a5f 100644 --- a/safekeeper/src/send_interpreted_wal.rs +++ b/safekeeper/src/send_interpreted_wal.rs @@ -3,23 +3,22 @@ use std::fmt::Display; use std::sync::Arc; use std::time::Duration; -use anyhow::{anyhow, Context}; -use futures::future::Either; +use anyhow::{Context, anyhow}; use futures::StreamExt; +use futures::future::Either; use pageserver_api::shard::ShardIdentity; use postgres_backend::{CopyStreamHandlerEnd, PostgresBackend}; -use postgres_ffi::waldecoder::WalDecodeError; -use postgres_ffi::{get_current_timestamp, waldecoder::WalStreamDecoder}; +use postgres_ffi::get_current_timestamp; +use postgres_ffi::waldecoder::{WalDecodeError, WalStreamDecoder}; use pq_proto::{BeMessage, InterpretedWalRecordsBody, WalSndKeepAlive}; use tokio::io::{AsyncRead, AsyncWrite}; use tokio::sync::mpsc::error::SendError; use tokio::task::JoinHandle; use tokio::time::MissedTickBehavior; -use tracing::{error, info, info_span, Instrument}; +use tracing::{Instrument, error, info, info_span}; use utils::critical; use utils::lsn::Lsn; -use utils::postgres_client::Compression; -use utils::postgres_client::InterpretedFormat; +use utils::postgres_client::{Compression, InterpretedFormat}; use wal_decoder::models::{InterpretedWalRecord, InterpretedWalRecords}; use wal_decoder::wire_format::ToWireFormat; @@ -691,22 +690,20 @@ impl InterpretedWalSender<'_, IO> { } #[cfg(test)] mod tests { - use std::{collections::HashMap, str::FromStr, time::Duration}; + use std::collections::HashMap; + use std::str::FromStr; + use std::time::Duration; use pageserver_api::shard::{ShardIdentity, ShardStripeSize}; use postgres_ffi::MAX_SEND_SIZE; use tokio::sync::mpsc::error::TryRecvError; - use utils::{ - id::{NodeId, TenantTimelineId}, - lsn::Lsn, - shard::{ShardCount, ShardNumber}, - }; + use utils::id::{NodeId, TenantTimelineId}; + use utils::lsn::Lsn; + use utils::shard::{ShardCount, ShardNumber}; - use crate::{ - send_interpreted_wal::{AttachShardNotification, Batch, InterpretedWalReader}, - test_utils::Env, - wal_reader_stream::StreamingWalReader, - }; + use crate::send_interpreted_wal::{AttachShardNotification, Batch, InterpretedWalReader}; + use crate::test_utils::Env; + use crate::wal_reader_stream::StreamingWalReader; #[tokio::test] async fn test_interpreted_wal_reader_fanout() { @@ -808,9 +805,11 @@ mod tests { // This test uses logical messages. Those only go to shard 0. Check that the // filtering worked and shard 1 did not get any. 
- assert!(shard_1_interpreted_records - .iter() - .all(|recs| recs.records.is_empty())); + assert!( + shard_1_interpreted_records + .iter() + .all(|recs| recs.records.is_empty()) + ); // Shard 0 should not receive anything more since the reader is // going through wal that it has already processed. diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 72b1fd9fc3..33e3d0485c 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -1,6 +1,34 @@ //! This module implements the streaming side of replication protocol, starting //! with the "START_REPLICATION" message, and registry of walsenders. +use std::cmp::{max, min}; +use std::net::SocketAddr; +use std::sync::Arc; +use std::time::Duration; + +use anyhow::{Context as AnyhowContext, bail}; +use bytes::Bytes; +use futures::FutureExt; +use itertools::Itertools; +use parking_lot::Mutex; +use postgres_backend::{CopyStreamHandlerEnd, PostgresBackend, PostgresBackendReader, QueryError}; +use postgres_ffi::{MAX_SEND_SIZE, TimestampTz, get_current_timestamp}; +use pq_proto::{BeMessage, WalSndKeepAlive, XLogDataBody}; +use safekeeper_api::Term; +use safekeeper_api::models::{ + HotStandbyFeedback, INVALID_FULL_TRANSACTION_ID, ReplicationFeedback, StandbyFeedback, + StandbyReply, +}; +use tokio::io::{AsyncRead, AsyncWrite}; +use tokio::sync::watch::Receiver; +use tokio::time::timeout; +use tracing::*; +use utils::bin_ser::BeSer; +use utils::failpoint_support; +use utils::lsn::Lsn; +use utils::pageserver_feedback::PageserverFeedback; +use utils::postgres_client::PostgresClientProtocol; + use crate::handler::SafekeeperPostgresHandler; use crate::metrics::{RECEIVED_PS_FEEDBACKS, WAL_READERS}; use crate::receive_wal::WalReceivers; @@ -11,34 +39,6 @@ use crate::send_interpreted_wal::{ use crate::timeline::WalResidentTimeline; use crate::wal_reader_stream::StreamingWalReader; use crate::wal_storage::WalReader; -use anyhow::{bail, Context as AnyhowContext}; -use bytes::Bytes; -use futures::FutureExt; -use parking_lot::Mutex; -use postgres_backend::PostgresBackend; -use postgres_backend::{CopyStreamHandlerEnd, PostgresBackendReader, QueryError}; -use postgres_ffi::get_current_timestamp; -use postgres_ffi::{TimestampTz, MAX_SEND_SIZE}; -use pq_proto::{BeMessage, WalSndKeepAlive, XLogDataBody}; -use safekeeper_api::models::{ - HotStandbyFeedback, ReplicationFeedback, StandbyFeedback, StandbyReply, - INVALID_FULL_TRANSACTION_ID, -}; -use safekeeper_api::Term; -use tokio::io::{AsyncRead, AsyncWrite}; -use utils::failpoint_support; -use utils::pageserver_feedback::PageserverFeedback; -use utils::postgres_client::PostgresClientProtocol; - -use itertools::Itertools; -use std::cmp::{max, min}; -use std::net::SocketAddr; -use std::sync::Arc; -use std::time::Duration; -use tokio::sync::watch::Receiver; -use tokio::time::timeout; -use tracing::*; -use utils::{bin_ser::BeSer, lsn::Lsn}; // See: https://www.postgresql.org/docs/13/protocol-replication.html const HOT_STANDBY_FEEDBACK_TAG_BYTE: u8 = b'h'; @@ -906,9 +906,9 @@ impl WalSender<'_, IO> { // pageserver to identify WalReceiverError::SuccessfulCompletion, // do not change this string without updating pageserver. 
return Err(CopyStreamHandlerEnd::ServerInitiated(format!( - "ending streaming to {:?} at {}, receiver is caughtup and there is no computes", - self.appname, self.start_pos, - ))); + "ending streaming to {:?} at {}, receiver is caughtup and there is no computes", + self.appname, self.start_pos, + ))); } } } diff --git a/safekeeper/src/state.rs b/safekeeper/src/state.rs index 4d566b12a0..e437e6d2cd 100644 --- a/safekeeper/src/state.rs +++ b/safekeeper/src/state.rs @@ -1,28 +1,24 @@ //! Defines per timeline data stored persistently (SafeKeeperPersistentState) //! and its wrapper with in memory layer (SafekeeperState). -use std::{cmp::max, ops::Deref, time::SystemTime}; +use std::cmp::max; +use std::ops::Deref; +use std::time::SystemTime; -use anyhow::{bail, Result}; +use anyhow::{Result, bail}; use postgres_ffi::WAL_SEGMENT_SIZE; -use safekeeper_api::{ - membership::Configuration, - models::{TimelineMembershipSwitchResponse, TimelineTermBumpResponse}, - ServerInfo, Term, INITIAL_TERM, -}; +use safekeeper_api::membership::Configuration; +use safekeeper_api::models::{TimelineMembershipSwitchResponse, TimelineTermBumpResponse}; +use safekeeper_api::{INITIAL_TERM, ServerInfo, Term}; use serde::{Deserialize, Serialize}; use tracing::info; -use utils::{ - id::{TenantId, TenantTimelineId, TimelineId}, - lsn::Lsn, -}; +use utils::id::{TenantId, TenantTimelineId, TimelineId}; +use utils::lsn::Lsn; -use crate::{ - control_file, - safekeeper::{AcceptorState, PgUuid, TermHistory, TermLsn, UNKNOWN_SERVER_VERSION}, - timeline::TimelineError, - wal_backup_partial::{self}, -}; +use crate::control_file; +use crate::safekeeper::{AcceptorState, PgUuid, TermHistory, TermLsn, UNKNOWN_SERVER_VERSION}; +use crate::timeline::TimelineError; +use crate::wal_backup_partial::{self}; /// Persistent information stored on safekeeper node about timeline. /// On disk data is prefixed by magic and format version and followed by checksum. diff --git a/safekeeper/src/test_utils.rs b/safekeeper/src/test_utils.rs index 32af4537d3..e6f74185c1 100644 --- a/safekeeper/src/test_utils.rs +++ b/safekeeper/src/test_utils.rs @@ -1,5 +1,12 @@ use std::sync::Arc; +use camino_tempfile::Utf8TempDir; +use postgres_ffi::v17::wal_generator::{LogicalMessageGenerator, WalGenerator}; +use safekeeper_api::membership::SafekeeperGeneration as Generation; +use tokio::fs::create_dir_all; +use utils::id::{NodeId, TenantTimelineId}; +use utils::lsn::Lsn; + use crate::rate_limit::RateLimiter; use crate::receive_wal::WalAcceptor; use crate::safekeeper::{ @@ -8,16 +15,10 @@ use crate::safekeeper::{ }; use crate::send_wal::EndWatch; use crate::state::{TimelinePersistentState, TimelineState}; -use crate::timeline::{get_timeline_dir, SharedState, StateSK, Timeline}; +use crate::timeline::{SharedState, StateSK, Timeline, get_timeline_dir}; use crate::timelines_set::TimelinesSet; use crate::wal_backup::remote_timeline_path; -use crate::{control_file, receive_wal, wal_storage, SafeKeeperConf}; -use camino_tempfile::Utf8TempDir; -use postgres_ffi::v17::wal_generator::{LogicalMessageGenerator, WalGenerator}; -use safekeeper_api::membership::SafekeeperGeneration as Generation; -use tokio::fs::create_dir_all; -use utils::id::{NodeId, TenantTimelineId}; -use utils::lsn::Lsn; +use crate::{SafeKeeperConf, control_file, receive_wal, wal_storage}; /// A Safekeeper testing or benchmarking environment. Uses a tempdir for storage, removed on drop. 
pub struct Env { diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 4341f13824..c140f16ced 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -1,37 +1,32 @@ //! This module implements Timeline lifecycle management and has all necessary code //! to glue together SafeKeeper and all other background services. -use anyhow::{anyhow, bail, Result}; +use std::cmp::max; +use std::ops::{Deref, DerefMut}; +use std::sync::Arc; +use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; +use std::time::Duration; + +use anyhow::{Result, anyhow, bail}; use camino::{Utf8Path, Utf8PathBuf}; +use http_utils::error::ApiError; use remote_storage::RemotePath; +use safekeeper_api::Term; use safekeeper_api::membership::Configuration; use safekeeper_api::models::{ PeerInfo, TimelineMembershipSwitchResponse, TimelineTermBumpResponse, }; -use safekeeper_api::Term; +use storage_broker::proto::{SafekeeperTimelineInfo, TenantTimelineId as ProtoTenantTimelineId}; use tokio::fs::{self}; +use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard, watch}; +use tokio::time::Instant; use tokio_util::sync::CancellationToken; -use utils::id::TenantId; +use tracing::*; +use utils::id::{NodeId, TenantId, TenantTimelineId}; +use utils::lsn::Lsn; use utils::sync::gate::Gate; -use http_utils::error::ApiError; -use std::cmp::max; -use std::ops::{Deref, DerefMut}; -use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; -use std::sync::Arc; -use std::time::Duration; -use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}; -use tokio::{sync::watch, time::Instant}; -use tracing::*; -use utils::{ - id::{NodeId, TenantTimelineId}, - lsn::Lsn, -}; - -use storage_broker::proto::SafekeeperTimelineInfo; -use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; - -use crate::control_file; +use crate::metrics::{FullTimelineInfo, MISC_OPERATION_SECONDS, WalStorageMetrics}; use crate::rate_limit::RateLimiter; use crate::receive_wal::WalReceivers; use crate::safekeeper::{AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, TermLsn}; @@ -42,11 +37,8 @@ use crate::timeline_manager::{AtomicStatus, ManagerCtl}; use crate::timelines_set::TimelinesSet; use crate::wal_backup::{self, remote_timeline_path}; use crate::wal_backup_partial::PartialRemoteSegment; - -use crate::metrics::{FullTimelineInfo, WalStorageMetrics, MISC_OPERATION_SECONDS}; use crate::wal_storage::{Storage as wal_storage_iface, WalReader}; -use crate::SafeKeeperConf; -use crate::{debug_dump, timeline_manager, wal_storage}; +use crate::{SafeKeeperConf, control_file, debug_dump, timeline_manager, wal_storage}; fn peer_info_from_sk_info(sk_info: &SafekeeperTimelineInfo, ts: Instant) -> PeerInfo { PeerInfo { @@ -168,7 +160,7 @@ impl StateSK { pub fn state(&self) -> &TimelineState { match self { StateSK::Loaded(sk) => &sk.state, - StateSK::Offloaded(ref s) => s, + StateSK::Offloaded(s) => s, StateSK::Empty => unreachable!(), } } @@ -176,7 +168,7 @@ impl StateSK { pub fn state_mut(&mut self) -> &mut TimelineState { match self { StateSK::Loaded(sk) => &mut sk.state, - StateSK::Offloaded(ref mut s) => s, + StateSK::Offloaded(s) => s, StateSK::Empty => unreachable!(), } } diff --git a/safekeeper/src/timeline_eviction.rs b/safekeeper/src/timeline_eviction.rs index 303421c837..06ccb32d03 100644 --- a/safekeeper/src/timeline_eviction.rs +++ b/safekeeper/src/timeline_eviction.rs @@ -7,23 +7,19 @@ use anyhow::Context; use camino::Utf8PathBuf; use remote_storage::RemotePath; -use tokio::{ - fs::File, - io::{AsyncRead, 
AsyncWriteExt}, -}; +use tokio::fs::File; +use tokio::io::{AsyncRead, AsyncWriteExt}; use tracing::{debug, info, instrument, warn}; use utils::crashsafe::durable_rename; -use crate::{ - metrics::{ - EvictionEvent, EVICTION_EVENTS_COMPLETED, EVICTION_EVENTS_STARTED, NUM_EVICTED_TIMELINES, - }, - rate_limit::rand_duration, - timeline_manager::{Manager, StateSnapshot}, - wal_backup, - wal_backup_partial::{self, PartialRemoteSegment}, - wal_storage::wal_file_paths, +use crate::metrics::{ + EVICTION_EVENTS_COMPLETED, EVICTION_EVENTS_STARTED, EvictionEvent, NUM_EVICTED_TIMELINES, }; +use crate::rate_limit::rand_duration; +use crate::timeline_manager::{Manager, StateSnapshot}; +use crate::wal_backup; +use crate::wal_backup_partial::{self, PartialRemoteSegment}; +use crate::wal_storage::wal_file_paths; impl Manager { /// Returns true if the timeline is ready for eviction. diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs index a33994dcab..71e99a4de7 100644 --- a/safekeeper/src/timeline_manager.rs +++ b/safekeeper/src/timeline_manager.rs @@ -7,41 +7,36 @@ //! Be aware that you need to be extra careful with manager code, because it is not respawned on panic. //! Also, if it will stuck in some branch, it will prevent any further progress in the timeline. -use std::{ - sync::{atomic::AtomicUsize, Arc}, - time::Duration, -}; +use std::sync::Arc; +use std::sync::atomic::AtomicUsize; +use std::time::Duration; use futures::channel::oneshot; use postgres_ffi::XLogSegNo; -use safekeeper_api::{models::PeerInfo, Term}; +use safekeeper_api::Term; +use safekeeper_api::models::PeerInfo; use serde::{Deserialize, Serialize}; -use tokio::{ - task::{JoinError, JoinHandle}, - time::Instant, -}; +use tokio::task::{JoinError, JoinHandle}; +use tokio::time::Instant; use tokio_util::sync::CancellationToken; -use tracing::{debug, info, info_span, instrument, warn, Instrument}; +use tracing::{Instrument, debug, info, info_span, instrument, warn}; use utils::lsn::Lsn; -use crate::{ - control_file::{FileStorage, Storage}, - metrics::{ - MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL, MISC_OPERATION_SECONDS, - NUM_EVICTED_TIMELINES, - }, - rate_limit::{rand_duration, RateLimiter}, - recovery::recovery_main, - remove_wal::calc_horizon_lsn, - send_wal::WalSenders, - state::TimelineState, - timeline::{ManagerTimeline, ReadGuardSharedState, StateSK, WalResidentTimeline}, - timeline_guard::{AccessService, GuardId, ResidenceGuard}, - timelines_set::{TimelineSetGuard, TimelinesSet}, - wal_backup::{self, WalBackupTaskHandle}, - wal_backup_partial::{self, PartialBackup, PartialRemoteSegment}, - SafeKeeperConf, +use crate::SafeKeeperConf; +use crate::control_file::{FileStorage, Storage}; +use crate::metrics::{ + MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL, MISC_OPERATION_SECONDS, NUM_EVICTED_TIMELINES, }; +use crate::rate_limit::{RateLimiter, rand_duration}; +use crate::recovery::recovery_main; +use crate::remove_wal::calc_horizon_lsn; +use crate::send_wal::WalSenders; +use crate::state::TimelineState; +use crate::timeline::{ManagerTimeline, ReadGuardSharedState, StateSK, WalResidentTimeline}; +use crate::timeline_guard::{AccessService, GuardId, ResidenceGuard}; +use crate::timelines_set::{TimelineSetGuard, TimelinesSet}; +use crate::wal_backup::{self, WalBackupTaskHandle}; +use crate::wal_backup_partial::{self, PartialBackup, PartialRemoteSegment}; pub(crate) struct StateSnapshot { // inmem values diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs 
index 1ff6a72bce..1d29030711 100644 --- a/safekeeper/src/timelines_global_map.rs +++ b/safekeeper/src/timelines_global_map.rs @@ -2,31 +2,33 @@ //! All timelines should always be present in this map, this is done by loading them //! all from the disk on startup and keeping them in memory. -use crate::defaults::DEFAULT_EVICTION_CONCURRENCY; -use crate::rate_limit::RateLimiter; -use crate::state::TimelinePersistentState; -use crate::timeline::{get_tenant_dir, get_timeline_dir, Timeline, TimelineError}; -use crate::timelines_set::TimelinesSet; -use crate::wal_storage::Storage; -use crate::{control_file, wal_storage, SafeKeeperConf}; -use anyhow::{bail, Context, Result}; -use camino::Utf8PathBuf; -use camino_tempfile::Utf8TempDir; -use safekeeper_api::membership::Configuration; -use safekeeper_api::models::SafekeeperUtilization; -use safekeeper_api::ServerInfo; -use serde::Serialize; use std::collections::HashMap; use std::str::FromStr; use std::sync::atomic::Ordering; use std::sync::{Arc, Mutex}; use std::time::{Duration, Instant}; + +use anyhow::{Context, Result, bail}; +use camino::Utf8PathBuf; +use camino_tempfile::Utf8TempDir; +use safekeeper_api::ServerInfo; +use safekeeper_api::membership::Configuration; +use safekeeper_api::models::SafekeeperUtilization; +use serde::Serialize; use tokio::fs; use tracing::*; use utils::crashsafe::{durable_rename, fsync_async_opt}; use utils::id::{TenantId, TenantTimelineId, TimelineId}; use utils::lsn::Lsn; +use crate::defaults::DEFAULT_EVICTION_CONCURRENCY; +use crate::rate_limit::RateLimiter; +use crate::state::TimelinePersistentState; +use crate::timeline::{Timeline, TimelineError, get_tenant_dir, get_timeline_dir}; +use crate::timelines_set::TimelinesSet; +use crate::wal_storage::Storage; +use crate::{SafeKeeperConf, control_file, wal_storage}; + // Timeline entry in the global map: either a ready timeline, or mark that it is // being created. 
#[derive(Clone)] diff --git a/safekeeper/src/timelines_set.rs b/safekeeper/src/timelines_set.rs index 096e348295..1d1abc530f 100644 --- a/safekeeper/src/timelines_set.rs +++ b/safekeeper/src/timelines_set.rs @@ -1,4 +1,5 @@ -use std::{collections::HashMap, sync::Arc}; +use std::collections::HashMap; +use std::sync::Arc; use utils::id::TenantTimelineId; diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 2f6b91cf47..6176e64698 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -1,34 +1,29 @@ -use anyhow::{Context, Result}; - -use camino::{Utf8Path, Utf8PathBuf}; -use futures::stream::FuturesOrdered; -use futures::StreamExt; -use safekeeper_api::models::PeerInfo; -use tokio::task::JoinHandle; -use tokio_util::sync::CancellationToken; -use utils::backoff; -use utils::id::NodeId; - use std::cmp::min; use std::collections::HashSet; use std::num::NonZeroU32; use std::pin::Pin; use std::time::Duration; +use anyhow::{Context, Result}; +use camino::{Utf8Path, Utf8PathBuf}; +use futures::StreamExt; +use futures::stream::FuturesOrdered; use postgres_ffi::v14::xlog_utils::XLogSegNoOffsetToRecPtr; -use postgres_ffi::XLogFileName; -use postgres_ffi::{XLogSegNo, PG_TLI}; +use postgres_ffi::{PG_TLI, XLogFileName, XLogSegNo}; use remote_storage::{ DownloadOpts, GenericRemoteStorage, ListingMode, RemotePath, StorageMetadata, }; +use safekeeper_api::models::PeerInfo; use tokio::fs::File; - use tokio::select; use tokio::sync::mpsc::{self, Receiver, Sender}; -use tokio::sync::{watch, OnceCell}; +use tokio::sync::{OnceCell, watch}; +use tokio::task::JoinHandle; +use tokio_util::sync::CancellationToken; use tracing::*; - -use utils::{id::TenantTimelineId, lsn::Lsn}; +use utils::backoff; +use utils::id::{NodeId, TenantTimelineId}; +use utils::lsn::Lsn; use crate::metrics::{BACKED_UP_SEGMENTS, BACKUP_ERRORS, WAL_BACKUP_TASKS}; use crate::timeline::WalResidentTimeline; diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs index 5ecb23e8e0..049852a048 100644 --- a/safekeeper/src/wal_backup_partial.rs +++ b/safekeeper/src/wal_backup_partial.rs @@ -20,23 +20,23 @@ //! This way control file stores information about all potentially existing //! remote partial segments and can clean them up after uploading a newer version. 
use camino::Utf8PathBuf; -use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI}; +use postgres_ffi::{PG_TLI, XLogFileName, XLogSegNo}; use remote_storage::RemotePath; use safekeeper_api::Term; use serde::{Deserialize, Serialize}; - use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, instrument, warn}; -use utils::{id::NodeId, lsn::Lsn}; +use utils::id::NodeId; +use utils::lsn::Lsn; -use crate::{ - metrics::{MISC_OPERATION_SECONDS, PARTIAL_BACKUP_UPLOADED_BYTES, PARTIAL_BACKUP_UPLOADS}, - rate_limit::{rand_duration, RateLimiter}, - timeline::WalResidentTimeline, - timeline_manager::StateSnapshot, - wal_backup::{self}, - SafeKeeperConf, +use crate::SafeKeeperConf; +use crate::metrics::{ + MISC_OPERATION_SECONDS, PARTIAL_BACKUP_UPLOADED_BYTES, PARTIAL_BACKUP_UPLOADS, }; +use crate::rate_limit::{RateLimiter, rand_duration}; +use crate::timeline::WalResidentTimeline; +use crate::timeline_manager::StateSnapshot; +use crate::wal_backup::{self}; #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] pub enum UploadStatus { diff --git a/safekeeper/src/wal_reader_stream.rs b/safekeeper/src/wal_reader_stream.rs index a0dd571a34..cc9d4e6e3b 100644 --- a/safekeeper/src/wal_reader_stream.rs +++ b/safekeeper/src/wal_reader_stream.rs @@ -1,14 +1,15 @@ -use std::{ - pin::Pin, - task::{Context, Poll}, -}; +use std::pin::Pin; +use std::task::{Context, Poll}; use bytes::Bytes; -use futures::{stream::BoxStream, Stream, StreamExt}; +use futures::stream::BoxStream; +use futures::{Stream, StreamExt}; +use safekeeper_api::Term; use utils::lsn::Lsn; -use crate::{send_wal::EndWatch, timeline::WalResidentTimeline, wal_storage::WalReader}; -use safekeeper_api::Term; +use crate::send_wal::EndWatch; +use crate::timeline::WalResidentTimeline; +use crate::wal_storage::WalReader; #[derive(PartialEq, Eq, Debug)] pub(crate) struct WalBytes { @@ -224,12 +225,11 @@ mod tests { use futures::StreamExt; use postgres_ffi::MAX_SEND_SIZE; - use utils::{ - id::{NodeId, TenantTimelineId}, - lsn::Lsn, - }; + use utils::id::{NodeId, TenantTimelineId}; + use utils::lsn::Lsn; - use crate::{test_utils::Env, wal_reader_stream::StreamingWalReader}; + use crate::test_utils::Env; + use crate::wal_reader_stream::StreamingWalReader; #[tokio::test] async fn test_streaming_wal_reader_reset() { diff --git a/safekeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs index e5ccbb3230..045fa88cb0 100644 --- a/safekeeper/src/wal_service.rs +++ b/safekeeper/src/wal_service.rs @@ -2,23 +2,23 @@ //! WAL service listens for client connections and //! receive WAL from wal_proposer and send it to WAL receivers //! 
-use anyhow::{Context, Result}; -use postgres_backend::QueryError; -use safekeeper_api::models::ConnectionId; +use std::os::fd::AsRawFd; use std::sync::Arc; use std::time::Duration; + +use anyhow::{Context, Result}; +use postgres_backend::{AuthType, PostgresBackend, QueryError}; +use safekeeper_api::models::ConnectionId; use tokio::net::TcpStream; use tokio_io_timeout::TimeoutReader; use tokio_util::sync::CancellationToken; use tracing::*; -use utils::{auth::Scope, measured_stream::MeasuredStream}; - -use std::os::fd::AsRawFd; +use utils::auth::Scope; +use utils::measured_stream::MeasuredStream; +use crate::handler::SafekeeperPostgresHandler; use crate::metrics::TrafficMetrics; -use crate::SafeKeeperConf; -use crate::{handler::SafekeeperPostgresHandler, GlobalTimelines}; -use postgres_backend::{AuthType, PostgresBackend}; +use crate::{GlobalTimelines, SafeKeeperConf}; /// Accept incoming TCP connections and spawn them into a background thread. /// diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index e338d70731..ed197a3f83 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -7,32 +7,32 @@ //! //! Note that last file has `.partial` suffix, that's different from postgres. -use anyhow::{bail, Context, Result}; -use bytes::Bytes; -use camino::{Utf8Path, Utf8PathBuf}; -use futures::future::BoxFuture; -use postgres_ffi::v14::xlog_utils::{IsPartialXLogFileName, IsXLogFileName, XLogFromFileName}; -use postgres_ffi::{dispatch_pgversion, XLogSegNo, PG_TLI}; -use remote_storage::RemotePath; use std::cmp::{max, min}; use std::future::Future; use std::io::{self, SeekFrom}; use std::pin::Pin; -use tokio::fs::{self, remove_file, File, OpenOptions}; -use tokio::io::{AsyncRead, AsyncWriteExt}; -use tokio::io::{AsyncReadExt, AsyncSeekExt}; + +use anyhow::{Context, Result, bail}; +use bytes::Bytes; +use camino::{Utf8Path, Utf8PathBuf}; +use futures::future::BoxFuture; +use postgres_ffi::v14::xlog_utils::{IsPartialXLogFileName, IsXLogFileName, XLogFromFileName}; +use postgres_ffi::waldecoder::WalStreamDecoder; +use postgres_ffi::{PG_TLI, XLogFileName, XLogSegNo, dispatch_pgversion}; +use pq_proto::SystemId; +use remote_storage::RemotePath; +use tokio::fs::{self, File, OpenOptions, remove_file}; +use tokio::io::{AsyncRead, AsyncReadExt, AsyncSeekExt, AsyncWriteExt}; use tracing::*; use utils::crashsafe::durable_rename; +use utils::id::TenantTimelineId; +use utils::lsn::Lsn; use crate::metrics::{ - time_io_closure, WalStorageMetrics, REMOVED_WAL_SEGMENTS, WAL_STORAGE_OPERATION_SECONDS, + REMOVED_WAL_SEGMENTS, WAL_STORAGE_OPERATION_SECONDS, WalStorageMetrics, time_io_closure, }; use crate::state::TimelinePersistentState; use crate::wal_backup::{read_object, remote_timeline_path}; -use postgres_ffi::waldecoder::WalStreamDecoder; -use postgres_ffi::XLogFileName; -use pq_proto::SystemId; -use utils::{id::TenantTimelineId, lsn::Lsn}; pub trait Storage { // Last written LSN. 
@@ -200,7 +200,12 @@ impl PhysicalStorage { ttid.timeline_id, flush_lsn, state.commit_lsn, state.peer_horizon_lsn, ); if flush_lsn < state.commit_lsn { - bail!("timeline {} potential data loss: flush_lsn {} by find_end_of_wal is less than commit_lsn {} from control file", ttid.timeline_id, flush_lsn, state.commit_lsn); + bail!( + "timeline {} potential data loss: flush_lsn {} by find_end_of_wal is less than commit_lsn {} from control file", + ttid.timeline_id, + flush_lsn, + state.commit_lsn + ); } if flush_lsn < state.peer_horizon_lsn { warn!( diff --git a/safekeeper/tests/misc_test.rs b/safekeeper/tests/misc_test.rs index 8e5b17a143..8e54d2bb86 100644 --- a/safekeeper/tests/misc_test.rs +++ b/safekeeper/tests/misc_test.rs @@ -3,9 +3,9 @@ use std::sync::Arc; use tracing::{info, warn}; use utils::lsn::Lsn; -use crate::walproposer_sim::{ - log::{init_logger, init_tracing_logger}, - simulation::{generate_network_opts, generate_schedule, Schedule, TestAction, TestConfig}, +use crate::walproposer_sim::log::{init_logger, init_tracing_logger}; +use crate::walproposer_sim::simulation::{ + Schedule, TestAction, TestConfig, generate_network_opts, generate_schedule, }; pub mod walproposer_sim; diff --git a/safekeeper/tests/random_test.rs b/safekeeper/tests/random_test.rs index 1a932ef699..e29b58836a 100644 --- a/safekeeper/tests/random_test.rs +++ b/safekeeper/tests/random_test.rs @@ -1,11 +1,9 @@ use rand::Rng; use tracing::{info, warn}; -use crate::walproposer_sim::{ - log::{init_logger, init_tracing_logger}, - simulation::{generate_network_opts, generate_schedule, TestConfig}, - simulation_logs::validate_events, -}; +use crate::walproposer_sim::log::{init_logger, init_tracing_logger}; +use crate::walproposer_sim::simulation::{TestConfig, generate_network_opts, generate_schedule}; +use crate::walproposer_sim::simulation_logs::validate_events; pub mod walproposer_sim; @@ -18,7 +16,7 @@ fn test_random_schedules() -> anyhow::Result<()> { let mut config = TestConfig::new(Some(clock)); for _ in 0..500 { - let seed: u64 = rand::thread_rng().gen(); + let seed: u64 = rand::thread_rng().r#gen(); config.network = generate_network_opts(seed); let test = config.start(seed); diff --git a/safekeeper/tests/simple_test.rs b/safekeeper/tests/simple_test.rs index 0be9d0deef..f7b266e39c 100644 --- a/safekeeper/tests/simple_test.rs +++ b/safekeeper/tests/simple_test.rs @@ -1,7 +1,8 @@ use tracing::info; use utils::lsn::Lsn; -use crate::walproposer_sim::{log::init_logger, simulation::TestConfig}; +use crate::walproposer_sim::log::init_logger; +use crate::walproposer_sim::simulation::TestConfig; pub mod walproposer_sim; diff --git a/safekeeper/tests/walproposer_sim/log.rs b/safekeeper/tests/walproposer_sim/log.rs index 870f30de4f..e2ba3282ca 100644 --- a/safekeeper/tests/walproposer_sim/log.rs +++ b/safekeeper/tests/walproposer_sim/log.rs @@ -1,9 +1,11 @@ -use std::{fmt, sync::Arc}; +use std::fmt; +use std::sync::Arc; use desim::time::Timing; use once_cell::sync::OnceCell; use parking_lot::Mutex; -use tracing_subscriber::fmt::{format::Writer, time::FormatTime}; +use tracing_subscriber::fmt::format::Writer; +use tracing_subscriber::fmt::time::FormatTime; /// SimClock can be plugged into tracing logger to print simulation time. #[derive(Clone)] diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs index b9dfabe0d7..6ce1a9940e 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper.rs @@ -2,31 +2,30 @@ //! 
Gets messages from the network, passes them down to consensus module and //! sends replies back. -use std::{collections::HashMap, sync::Arc, time::Duration}; +use std::collections::HashMap; +use std::sync::Arc; +use std::time::Duration; -use anyhow::{bail, Result}; +use anyhow::{Result, bail}; use bytes::{Bytes, BytesMut}; use camino::Utf8PathBuf; -use desim::{ - executor::{self, PollSome}, - network::TCP, - node_os::NodeOs, - proto::{AnyMessage, NetEvent, NodeEvent}, -}; +use desim::executor::{self, PollSome}; +use desim::network::TCP; +use desim::node_os::NodeOs; +use desim::proto::{AnyMessage, NetEvent, NodeEvent}; use http::Uri; -use safekeeper::{ - safekeeper::{ProposerAcceptorMessage, SafeKeeper, SK_PROTO_VERSION_3, UNKNOWN_SERVER_VERSION}, - state::{TimelinePersistentState, TimelineState}, - timeline::TimelineError, - wal_storage::Storage, - SafeKeeperConf, +use safekeeper::SafeKeeperConf; +use safekeeper::safekeeper::{ + ProposerAcceptorMessage, SK_PROTO_VERSION_3, SafeKeeper, UNKNOWN_SERVER_VERSION, }; -use safekeeper_api::{membership::Configuration, ServerInfo}; +use safekeeper::state::{TimelinePersistentState, TimelineState}; +use safekeeper::timeline::TimelineError; +use safekeeper::wal_storage::Storage; +use safekeeper_api::ServerInfo; +use safekeeper_api::membership::Configuration; use tracing::{debug, info_span, warn}; -use utils::{ - id::{NodeId, TenantId, TenantTimelineId, TimelineId}, - lsn::Lsn, -}; +use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId}; +use utils::lsn::Lsn; use super::safekeeper_disk::{DiskStateStorage, DiskWALStorage, SafekeeperDisk, TimelineDisk}; diff --git a/safekeeper/tests/walproposer_sim/safekeeper_disk.rs b/safekeeper/tests/walproposer_sim/safekeeper_disk.rs index b854754ecf..94a849b5f0 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper_disk.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper_disk.rs @@ -1,22 +1,23 @@ use std::collections::HashMap; +use std::ops::Deref; use std::sync::Arc; - -use parking_lot::Mutex; -use safekeeper::state::TimelinePersistentState; -use utils::id::TenantTimelineId; - -use super::block_storage::BlockStorage; - -use std::{ops::Deref, time::Instant}; +use std::time::Instant; use anyhow::Result; use bytes::{Buf, BytesMut}; use futures::future::BoxFuture; -use postgres_ffi::{waldecoder::WalStreamDecoder, XLogSegNo}; -use safekeeper::{control_file, metrics::WalStorageMetrics, wal_storage}; +use parking_lot::Mutex; +use postgres_ffi::XLogSegNo; +use postgres_ffi::waldecoder::WalStreamDecoder; +use safekeeper::metrics::WalStorageMetrics; +use safekeeper::state::TimelinePersistentState; +use safekeeper::{control_file, wal_storage}; use tracing::{debug, info}; +use utils::id::TenantTimelineId; use utils::lsn::Lsn; +use super::block_storage::BlockStorage; + /// All safekeeper state that is usually saved to disk. 
pub struct SafekeeperDisk { pub timelines: Mutex>>, diff --git a/safekeeper/tests/walproposer_sim/simulation.rs b/safekeeper/tests/walproposer_sim/simulation.rs index fabf450eef..f314143952 100644 --- a/safekeeper/tests/walproposer_sim/simulation.rs +++ b/safekeeper/tests/walproposer_sim/simulation.rs @@ -1,23 +1,24 @@ -use std::{cell::Cell, str::FromStr, sync::Arc}; +use std::cell::Cell; +use std::str::FromStr; +use std::sync::Arc; -use crate::walproposer_sim::{safekeeper::run_server, walproposer_api::SimulationApi}; -use desim::{ - executor::{self, ExternalHandle}, - node_os::NodeOs, - options::{Delay, NetworkOptions}, - proto::{AnyMessage, NodeEvent}, - world::Node, - world::World, -}; +use desim::executor::{self, ExternalHandle}; +use desim::node_os::NodeOs; +use desim::options::{Delay, NetworkOptions}; +use desim::proto::{AnyMessage, NodeEvent}; +use desim::world::{Node, World}; use rand::{Rng, SeedableRng}; use tracing::{debug, info_span, warn}; -use utils::{id::TenantTimelineId, lsn::Lsn}; +use utils::id::TenantTimelineId; +use utils::lsn::Lsn; use walproposer::walproposer::{Config, Wrapper}; -use super::{ - log::SimClock, safekeeper_disk::SafekeeperDisk, walproposer_api, - walproposer_disk::DiskWalProposer, -}; +use super::log::SimClock; +use super::safekeeper_disk::SafekeeperDisk; +use super::walproposer_api; +use super::walproposer_disk::DiskWalProposer; +use crate::walproposer_sim::safekeeper::run_server; +use crate::walproposer_sim::walproposer_api::SimulationApi; /// Simulated safekeeper node. pub struct SafekeeperNode { diff --git a/safekeeper/tests/walproposer_sim/walproposer_api.rs b/safekeeper/tests/walproposer_sim/walproposer_api.rs index 5578c94cf6..6451589e80 100644 --- a/safekeeper/tests/walproposer_sim/walproposer_api.rs +++ b/safekeeper/tests/walproposer_sim/walproposer_api.rs @@ -1,26 +1,20 @@ -use std::{ - cell::{RefCell, RefMut, UnsafeCell}, - ffi::CStr, - sync::Arc, -}; +use std::cell::{RefCell, RefMut, UnsafeCell}; +use std::ffi::CStr; +use std::sync::Arc; use bytes::Bytes; -use desim::{ - executor::{self, PollSome}, - network::TCP, - node_os::NodeOs, - proto::{AnyMessage, NetEvent, NodeEvent}, - world::NodeId, -}; +use desim::executor::{self, PollSome}; +use desim::network::TCP; +use desim::node_os::NodeOs; +use desim::proto::{AnyMessage, NetEvent, NodeEvent}; +use desim::world::NodeId; use tracing::debug; use utils::lsn::Lsn; -use walproposer::{ - api_bindings::Level, - bindings::{ - NeonWALReadResult, SafekeeperStateDesiredEvents, WL_SOCKET_READABLE, WL_SOCKET_WRITEABLE, - }, - walproposer::{ApiImpl, Config}, +use walproposer::api_bindings::Level; +use walproposer::bindings::{ + NeonWALReadResult, SafekeeperStateDesiredEvents, WL_SOCKET_READABLE, WL_SOCKET_WRITEABLE, }; +use walproposer::walproposer::{ApiImpl, Config}; use super::walproposer_disk::DiskWalProposer; @@ -578,7 +572,9 @@ impl ApiImpl for SimulationApi { let disk_lsn = disk.lock().flush_rec_ptr().0; debug!("start_streaming at {} (disk_lsn={})", startpos, disk_lsn); if startpos < disk_lsn { - debug!("startpos < disk_lsn, it means we wrote some transaction even before streaming started"); + debug!( + "startpos < disk_lsn, it means we wrote some transaction even before streaming started" + ); } assert!(startpos <= disk_lsn); let mut broadcasted = Lsn(startpos); diff --git a/safekeeper/tests/walproposer_sim/walproposer_disk.rs b/safekeeper/tests/walproposer_sim/walproposer_disk.rs index 7dc7f48548..fe3eee8a5a 100644 --- a/safekeeper/tests/walproposer_sim/walproposer_disk.rs +++ 
b/safekeeper/tests/walproposer_sim/walproposer_disk.rs @@ -1,4 +1,5 @@ -use std::{ffi::CStr, sync::Arc}; +use std::ffi::CStr; +use std::sync::Arc; use parking_lot::{Mutex, MutexGuard}; use postgres_ffi::v16::wal_generator::{LogicalMessageGenerator, WalGenerator}; diff --git a/storage_broker/Cargo.toml b/storage_broker/Cargo.toml index 17d4aed63b..e4db9a317d 100644 --- a/storage_broker/Cargo.toml +++ b/storage_broker/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "storage_broker" version = "0.1.0" -edition.workspace = true +edition = "2024" license.workspace = true [features] diff --git a/storage_broker/benches/rps.rs b/storage_broker/benches/rps.rs index 1a6fb7fedf..86f2dd9a6c 100644 --- a/storage_broker/benches/rps.rs +++ b/storage_broker/benches/rps.rs @@ -1,18 +1,14 @@ -use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::Arc; +use std::sync::atomic::{AtomicU64, Ordering}; use std::time::{Duration, Instant}; use clap::Parser; - -use storage_broker::proto::SafekeeperTimelineInfo; use storage_broker::proto::{ - FilterTenantTimelineId, MessageType, SubscribeByFilterRequest, + FilterTenantTimelineId, MessageType, SafekeeperTimelineInfo, SubscribeByFilterRequest, TenantTimelineId as ProtoTenantTimelineId, TypeSubscription, TypedMessage, }; - use storage_broker::{BrokerClientChannel, DEFAULT_ENDPOINT}; use tokio::time; - use tonic::Request; const ABOUT: &str = r#" diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index 9d4c22484c..cc33ec20ff 100644 --- a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -10,7 +10,14 @@ //! //! Only safekeeper message is supported, but it is not hard to add something //! else with generics. -use clap::{command, Parser}; +use std::collections::HashMap; +use std::convert::Infallible; +use std::net::SocketAddr; +use std::pin::Pin; +use std::sync::Arc; +use std::time::Duration; + +use clap::{Parser, command}; use futures_core::Stream; use futures_util::StreamExt; use http_body_util::Full; @@ -19,27 +26,10 @@ use hyper::header::CONTENT_TYPE; use hyper::service::service_fn; use hyper::{Method, StatusCode}; use hyper_util::rt::{TokioExecutor, TokioIo, TokioTimer}; -use parking_lot::RwLock; -use std::collections::HashMap; -use std::convert::Infallible; -use std::net::SocketAddr; -use std::pin::Pin; -use std::sync::Arc; -use std::time::Duration; -use tokio::net::TcpListener; -use tokio::sync::broadcast; -use tokio::sync::broadcast::error::RecvError; -use tokio::time; -use tonic::body::{self, empty_body, BoxBody}; -use tonic::codegen::Service; -use tonic::Code; -use tonic::{Request, Response, Status}; -use tracing::*; -use utils::signals::ShutdownSignals; - use metrics::{Encoder, TextEncoder}; +use parking_lot::RwLock; use storage_broker::metrics::{ - BROADCASTED_MESSAGES_TOTAL, BROADCAST_DROPPED_MESSAGES_TOTAL, NUM_PUBS, NUM_SUBS_ALL, + BROADCAST_DROPPED_MESSAGES_TOTAL, BROADCASTED_MESSAGES_TOTAL, NUM_PUBS, NUM_SUBS_ALL, NUM_SUBS_TIMELINE, PROCESSED_MESSAGES_TOTAL, PUBLISHED_ONEOFF_MESSAGES_TOTAL, }; use storage_broker::proto::broker_service_server::{BrokerService, BrokerServiceServer}; @@ -48,10 +38,19 @@ use storage_broker::proto::{ FilterTenantTimelineId, MessageType, SafekeeperDiscoveryRequest, SafekeeperDiscoveryResponse, SafekeeperTimelineInfo, SubscribeByFilterRequest, SubscribeSafekeeperInfoRequest, TypedMessage, }; -use storage_broker::{parse_proto_ttid, DEFAULT_KEEPALIVE_INTERVAL, DEFAULT_LISTEN_ADDR}; +use storage_broker::{DEFAULT_KEEPALIVE_INTERVAL, 
DEFAULT_LISTEN_ADDR, parse_proto_ttid}; +use tokio::net::TcpListener; +use tokio::sync::broadcast; +use tokio::sync::broadcast::error::RecvError; +use tokio::time; +use tonic::body::{self, BoxBody, empty_body}; +use tonic::codegen::Service; +use tonic::{Code, Request, Response, Status}; +use tracing::*; use utils::id::TenantTimelineId; use utils::logging::{self, LogFormat}; use utils::sentry_init::init_sentry; +use utils::signals::ShutdownSignals; use utils::{project_build_tag, project_git_version}; project_git_version!(GIT_VERSION); @@ -743,11 +742,12 @@ async fn main() -> Result<(), Box> { #[cfg(test)] mod tests { - use super::*; use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; use tokio::sync::broadcast::error::TryRecvError; use utils::id::{TenantId, TimelineId}; + use super::*; + fn msg(timeline_id: Vec) -> Message { Message::SafekeeperTimelineInfo(SafekeeperTimelineInfo { safekeeper_id: 1, diff --git a/storage_broker/src/lib.rs b/storage_broker/src/lib.rs index 3ac40f6e14..55d411f607 100644 --- a/storage_broker/src/lib.rs +++ b/storage_broker/src/lib.rs @@ -1,12 +1,11 @@ use std::time::Duration; -use tonic::codegen::StdError; -use tonic::transport::{ClientTlsConfig, Endpoint}; -use tonic::{transport::Channel, Status}; -use utils::id::{TenantId, TenantTimelineId, TimelineId}; -use proto::{ - broker_service_client::BrokerServiceClient, TenantTimelineId as ProtoTenantTimelineId, -}; +use proto::TenantTimelineId as ProtoTenantTimelineId; +use proto::broker_service_client::BrokerServiceClient; +use tonic::Status; +use tonic::codegen::StdError; +use tonic::transport::{Channel, ClientTlsConfig, Endpoint}; +use utils::id::{TenantId, TenantTimelineId, TimelineId}; // Code generated by protobuf. pub mod proto { @@ -20,11 +19,8 @@ pub mod proto { pub mod metrics; // Re-exports to avoid direct tonic dependency in user crates. -pub use tonic::Code; -pub use tonic::Request; -pub use tonic::Streaming; - pub use hyper::Uri; +pub use tonic::{Code, Request, Streaming}; pub const DEFAULT_LISTEN_ADDR: &str = "127.0.0.1:50051"; pub const DEFAULT_ENDPOINT: &str = const_format::formatcp!("http://{DEFAULT_LISTEN_ADDR}"); diff --git a/storage_broker/src/metrics.rs b/storage_broker/src/metrics.rs index 1fd3dd5ad6..ecfb594eba 100644 --- a/storage_broker/src/metrics.rs +++ b/storage_broker/src/metrics.rs @@ -1,6 +1,6 @@ //! Broker metrics. 
-use metrics::{register_int_counter, register_int_gauge, IntCounter, IntGauge}; +use metrics::{IntCounter, IntGauge, register_int_counter, register_int_gauge}; use once_cell::sync::Lazy; pub static NUM_PUBS: Lazy = Lazy::new(|| { diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index 8e82996db1..b63ba154da 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "storage_controller" version = "0.1.0" -edition.workspace = true +edition = "2024" license.workspace = true [[bin]] diff --git a/storage_controller/src/background_node_operations.rs b/storage_controller/src/background_node_operations.rs index 226d4942e7..a630316f46 100644 --- a/storage_controller/src/background_node_operations.rs +++ b/storage_controller/src/background_node_operations.rs @@ -1,4 +1,5 @@ -use std::{borrow::Cow, fmt::Debug, fmt::Display}; +use std::borrow::Cow; +use std::fmt::{Debug, Display}; use tokio_util::sync::CancellationToken; use utils::id::NodeId; diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs index 5bc3c81f02..b602af362d 100644 --- a/storage_controller/src/compute_hook.rs +++ b/storage_controller/src/compute_hook.rs @@ -1,7 +1,8 @@ use std::borrow::Cow; +use std::collections::HashMap; use std::error::Error as _; use std::sync::Arc; -use std::{collections::HashMap, time::Duration}; +use std::time::Duration; use control_plane::endpoint::{ComputeControlPlane, EndpointStatus}; use control_plane::local_env::LocalEnv; @@ -12,11 +13,9 @@ use pageserver_api::shard::{ShardCount, ShardNumber, ShardStripeSize, TenantShar use postgres_connection::parse_host_port; use serde::{Deserialize, Serialize}; use tokio_util::sync::CancellationToken; -use tracing::{info_span, Instrument}; -use utils::{ - backoff::{self}, - id::{NodeId, TenantId}, -}; +use tracing::{Instrument, info_span}; +use utils::backoff::{self}; +use utils::id::{NodeId, TenantId}; use crate::service::Config; diff --git a/storage_controller/src/drain_utils.rs b/storage_controller/src/drain_utils.rs index 8b7be88078..bd4b8ba38f 100644 --- a/storage_controller/src/drain_utils.rs +++ b/storage_controller/src/drain_utils.rs @@ -1,15 +1,14 @@ -use std::{ - collections::{BTreeMap, HashMap}, - sync::Arc, -}; +use std::collections::{BTreeMap, HashMap}; +use std::sync::Arc; use pageserver_api::controller_api::{NodeSchedulingPolicy, ShardSchedulingPolicy}; -use utils::{id::NodeId, shard::TenantShardId}; +use utils::id::NodeId; +use utils::shard::TenantShardId; -use crate::{ - background_node_operations::OperationError, node::Node, scheduler::Scheduler, - tenant_shard::TenantShard, -}; +use crate::background_node_operations::OperationError; +use crate::node::Node; +use crate::scheduler::Scheduler; +use crate::tenant_shard::TenantShard; pub(crate) struct TenantShardIterator { tenants_accessor: F, @@ -188,10 +187,8 @@ impl TenantShardDrain { mod tests { use std::sync::Arc; - use utils::{ - id::TenantId, - shard::{ShardCount, ShardNumber, TenantShardId}, - }; + use utils::id::TenantId; + use utils::shard::{ShardCount, ShardNumber, TenantShardId}; use super::TenantShardIterator; diff --git a/storage_controller/src/heartbeater.rs b/storage_controller/src/heartbeater.rs index 88ee7887d3..56a331becd 100644 --- a/storage_controller/src/heartbeater.rs +++ b/storage_controller/src/heartbeater.rs @@ -1,24 +1,22 @@ -use futures::{stream::FuturesUnordered, StreamExt}; +use std::collections::HashMap; +use std::fmt::Debug; +use std::future::Future; +use 
std::sync::Arc; +use std::time::{Duration, Instant}; + +use futures::StreamExt; +use futures::stream::FuturesUnordered; +use pageserver_api::controller_api::{NodeAvailability, SkSchedulingPolicy}; +use pageserver_api::models::PageserverUtilization; use safekeeper_api::models::SafekeeperUtilization; use safekeeper_client::mgmt_api; -use std::{ - collections::HashMap, - fmt::Debug, - future::Future, - sync::Arc, - time::{Duration, Instant}, -}; -use tokio_util::sync::CancellationToken; - -use pageserver_api::{ - controller_api::{NodeAvailability, SkSchedulingPolicy}, - models::PageserverUtilization, -}; - use thiserror::Error; -use utils::{id::NodeId, logging::SecretString}; +use tokio_util::sync::CancellationToken; +use utils::id::NodeId; +use utils::logging::SecretString; -use crate::{node::Node, safekeeper::Safekeeper}; +use crate::node::Node; +use crate::safekeeper::Safekeeper; struct HeartbeaterTask { receiver: tokio::sync::mpsc::UnboundedReceiver>, diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 33b3d88c25..5b5ae80eaf 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -1,32 +1,27 @@ -use crate::http; -use crate::metrics::{ - HttpRequestLatencyLabelGroup, HttpRequestStatusLabelGroup, PageserverRequestLabelGroup, - METRICS_REGISTRY, -}; -use crate::persistence::SafekeeperUpsert; -use crate::reconciler::ReconcileError; -use crate::service::{LeadershipStatus, Service, RECONCILE_TIMEOUT, STARTUP_RECONCILE_TIMEOUT}; +use std::str::FromStr; +use std::sync::Arc; +use std::time::{Duration, Instant}; + use anyhow::Context; +use control_plane::storage_controller::{AttachHookRequest, InspectRequest}; use futures::Future; -use http_utils::{ - endpoint::{ - self, auth_middleware, check_permission_with, profile_cpu_handler, profile_heap_handler, - request_span, - }, - error::ApiError, - failpoints::failpoints_handler, - json::{json_request, json_response}, - request::{must_get_query_param, parse_query_param, parse_request_param}, - RequestExt, RouterBuilder, +use http_utils::endpoint::{ + self, auth_middleware, check_permission_with, profile_cpu_handler, profile_heap_handler, + request_span, }; +use http_utils::error::ApiError; +use http_utils::failpoints::failpoints_handler; +use http_utils::json::{json_request, json_response}; +use http_utils::request::{must_get_query_param, parse_query_param, parse_request_param}; +use http_utils::{RequestExt, RouterBuilder}; use hyper::header::CONTENT_TYPE; -use hyper::{Body, Request, Response}; -use hyper::{StatusCode, Uri}; +use hyper::{Body, Request, Response, StatusCode, Uri}; use metrics::{BuildInfo, NeonMetrics}; use pageserver_api::controller_api::{ MetadataHealthListOutdatedRequest, MetadataHealthListOutdatedResponse, MetadataHealthListUnhealthyResponse, MetadataHealthUpdateRequest, MetadataHealthUpdateResponse, - SafekeeperSchedulingPolicyRequest, ShardsPreferredAzsRequest, TenantCreateRequest, + NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, SafekeeperSchedulingPolicyRequest, + ShardsPreferredAzsRequest, TenantCreateRequest, TenantPolicyRequest, TenantShardMigrateRequest, }; use pageserver_api::models::{ TenantConfigPatchRequest, TenantConfigRequest, TenantLocationConfigRequest, @@ -34,23 +29,21 @@ use pageserver_api::models::{ TimelineCreateRequest, }; use pageserver_api::shard::TenantShardId; -use pageserver_client::{mgmt_api, BlockUnblock}; -use std::str::FromStr; -use std::sync::Arc; -use std::time::{Duration, Instant}; +use 
pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest}; +use pageserver_client::{BlockUnblock, mgmt_api}; +use routerify::Middleware; use tokio_util::sync::CancellationToken; use utils::auth::{Scope, SwappableJwtAuth}; use utils::id::{NodeId, TenantId, TimelineId}; -use pageserver_api::controller_api::{ - NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, TenantPolicyRequest, - TenantShardMigrateRequest, +use crate::http; +use crate::metrics::{ + HttpRequestLatencyLabelGroup, HttpRequestStatusLabelGroup, METRICS_REGISTRY, + PageserverRequestLabelGroup, }; -use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest}; - -use control_plane::storage_controller::{AttachHookRequest, InspectRequest}; - -use routerify::Middleware; +use crate::persistence::SafekeeperUpsert; +use crate::reconciler::ReconcileError; +use crate::service::{LeadershipStatus, RECONCILE_TIMEOUT, STARTUP_RECONCILE_TIMEOUT, Service}; /// State available to HTTP request handlers pub struct HttpState { @@ -1455,8 +1448,8 @@ pub fn prologue_leadership_status_check_middleware< }) } -fn prologue_metrics_middleware( -) -> Middleware { +fn prologue_metrics_middleware() +-> Middleware { Middleware::pre(move |req| async move { let meta = RequestMeta { method: req.method().clone(), @@ -1469,8 +1462,8 @@ fn prologue_metrics_middleware }) } -fn epilogue_metrics_middleware( -) -> Middleware { +fn epilogue_metrics_middleware() +-> Middleware { Middleware::post_with_info(move |resp, req_info| async move { let request_name = match req_info.context::() { Some(name) => name, @@ -1621,8 +1614,8 @@ async fn maybe_forward(req: Request) -> ForwardOutcome { Err(err) => { return ForwardOutcome::Forwarded(Err(ApiError::InternalServerError( anyhow::anyhow!( - "Failed to parse leader uri for forwarding while in stepped down state: {err}" - ), + "Failed to parse leader uri for forwarding while in stepped down state: {err}" + ), ))); } }; @@ -2155,8 +2148,23 @@ mod test { #[test] fn test_path_without_ids() { - assert_eq!(path_without_ids("/v1/tenant/1a2b3344556677881122334455667788/timeline/AA223344556677881122334455667788"), "/v1/tenant//timeline/"); - assert_eq!(path_without_ids("/v1/tenant/1a2b3344556677881122334455667788-0108/timeline/AA223344556677881122334455667788"), "/v1/tenant//timeline/"); - assert_eq!(path_without_ids("/v1/tenant/1a2b3344556677881122334455667788-0108/timeline/AA223344556677881122334455667788?parameter=foo"), "/v1/tenant//timeline/"); + assert_eq!( + path_without_ids( + "/v1/tenant/1a2b3344556677881122334455667788/timeline/AA223344556677881122334455667788" + ), + "/v1/tenant//timeline/" + ); + assert_eq!( + path_without_ids( + "/v1/tenant/1a2b3344556677881122334455667788-0108/timeline/AA223344556677881122334455667788" + ), + "/v1/tenant//timeline/" + ); + assert_eq!( + path_without_ids( + "/v1/tenant/1a2b3344556677881122334455667788-0108/timeline/AA223344556677881122334455667788?parameter=foo" + ), + "/v1/tenant//timeline/" + ); } } diff --git a/storage_controller/src/id_lock_map.rs b/storage_controller/src/id_lock_map.rs index 2d8b674f86..6b0c16f0be 100644 --- a/storage_controller/src/id_lock_map.rs +++ b/storage_controller/src/id_lock_map.rs @@ -1,8 +1,7 @@ +use std::collections::HashMap; use std::fmt::Display; -use std::time::Instant; -use std::{collections::HashMap, sync::Arc}; - -use std::time::Duration; +use std::sync::Arc; +use std::time::{Duration, Instant}; use crate::service::RECONCILE_TIMEOUT; diff --git a/storage_controller/src/leadership.rs b/storage_controller/src/leadership.rs index 
5fae8991ec..5e1d6f3ec9 100644 --- a/storage_controller/src/leadership.rs +++ b/storage_controller/src/leadership.rs @@ -3,11 +3,9 @@ use std::sync::Arc; use hyper::Uri; use tokio_util::sync::CancellationToken; -use crate::{ - peer_client::{GlobalObservedState, PeerClient}, - persistence::{ControllerPersistence, DatabaseError, DatabaseResult, Persistence}, - service::Config, -}; +use crate::peer_client::{GlobalObservedState, PeerClient}; +use crate::persistence::{ControllerPersistence, DatabaseError, DatabaseResult, Persistence}; +use crate::service::Config; /// Helper for storage controller leadership acquisition pub(crate) struct Leadership { @@ -91,7 +89,9 @@ impl Leadership { // Special case: if this is a brand new storage controller, migrations will not // have run at this point yet, and, hence, the controllers table does not exist. // Detect this case via the error string (diesel doesn't type it) and allow it. - tracing::info!("Detected first storage controller start-up. Allowing missing controllers table ..."); + tracing::info!( + "Detected first storage controller start-up. Allowing missing controllers table ..." + ); return Ok(None); } } diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 4152e40a76..04dd3bb3f6 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -1,26 +1,26 @@ -use anyhow::{anyhow, Context}; -use clap::Parser; -use hyper0::Uri; -use metrics::launch_timestamp::LaunchTimestamp; -use metrics::BuildInfo; use std::path::PathBuf; use std::sync::Arc; use std::time::Duration; + +use anyhow::{Context, anyhow}; +use clap::Parser; +use hyper0::Uri; +use metrics::BuildInfo; +use metrics::launch_timestamp::LaunchTimestamp; use storage_controller::http::make_router; use storage_controller::metrics::preinitialize_metrics; use storage_controller::persistence::Persistence; use storage_controller::service::chaos_injector::ChaosInjector; use storage_controller::service::{ - Config, Service, HEARTBEAT_INTERVAL_DEFAULT, LONG_RECONCILE_THRESHOLD_DEFAULT, + Config, HEARTBEAT_INTERVAL_DEFAULT, LONG_RECONCILE_THRESHOLD_DEFAULT, MAX_OFFLINE_INTERVAL_DEFAULT, MAX_WARMING_UP_INTERVAL_DEFAULT, - PRIORITY_RECONCILER_CONCURRENCY_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT, + PRIORITY_RECONCILER_CONCURRENCY_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT, Service, }; use tokio::signal::unix::SignalKind; use tokio_util::sync::CancellationToken; use tracing::Instrument; use utils::auth::{JwtAuth, SwappableJwtAuth}; use utils::logging::{self, LogFormat}; - use utils::sentry_init::init_sentry; use utils::{project_build_tag, project_git_version, tcp_listener}; @@ -34,7 +34,7 @@ static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; /// This adds roughly 3% overhead for allocations on average, which is acceptable considering /// performance-sensitive code will avoid allocations as far as possible anyway. #[allow(non_upper_case_globals)] -#[export_name = "malloc_conf"] +#[unsafe(export_name = "malloc_conf")] pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0"; #[derive(Parser)] @@ -297,8 +297,8 @@ async fn async_main() -> anyhow::Result<()> { // Production systems should always have secrets configured: if public_key was not set // then we would implicitly disable auth. anyhow::bail!( - "Insecure config! One or more secrets is not set. This is only permitted in `--dev` mode" - ); + "Insecure config! One or more secrets is not set. 
This is only permitted in `--dev` mode" + ); } StrictMode::Strict if args.compute_hook_url.is_none() => { // Production systems should always have a compute hook set, to prevent falling diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs index 6d67e0d130..f490edb68f 100644 --- a/storage_controller/src/metrics.rs +++ b/storage_controller/src/metrics.rs @@ -7,17 +7,18 @@ //! //! The rest of the code defines label group types and deals with converting outer types to labels. //! +use std::sync::Mutex; + use bytes::Bytes; -use measured::{label::LabelValue, metric::histogram, FixedCardinalityLabel, MetricGroup}; +use measured::label::LabelValue; +use measured::metric::histogram; +use measured::{FixedCardinalityLabel, MetricGroup}; use metrics::NeonMetrics; use once_cell::sync::Lazy; -use std::sync::Mutex; use strum::IntoEnumIterator; -use crate::{ - persistence::{DatabaseError, DatabaseOperation}, - service::LeadershipStatus, -}; +use crate::persistence::{DatabaseError, DatabaseOperation}; +use crate::service::LeadershipStatus; pub(crate) static METRICS_REGISTRY: Lazy = Lazy::new(StorageControllerMetrics::default); diff --git a/storage_controller/src/node.rs b/storage_controller/src/node.rs index 3762d13c10..bc7fe8802a 100644 --- a/storage_controller/src/node.rs +++ b/storage_controller/src/node.rs @@ -1,22 +1,22 @@ -use std::{str::FromStr, time::Duration}; +use std::str::FromStr; +use std::time::Duration; use anyhow::anyhow; -use pageserver_api::{ - controller_api::{ - AvailabilityZone, NodeAvailability, NodeDescribeResponse, NodeRegisterRequest, - NodeSchedulingPolicy, TenantLocateResponseShard, - }, - shard::TenantShardId, +use pageserver_api::controller_api::{ + AvailabilityZone, NodeAvailability, NodeDescribeResponse, NodeRegisterRequest, + NodeSchedulingPolicy, TenantLocateResponseShard, }; +use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api; use reqwest::StatusCode; use serde::Serialize; use tokio_util::sync::CancellationToken; -use utils::{backoff, id::NodeId}; +use utils::backoff; +use utils::id::NodeId; -use crate::{ - pageserver_client::PageserverClient, persistence::NodePersistence, scheduler::MaySchedule, -}; +use crate::pageserver_client::PageserverClient; +use crate::persistence::NodePersistence; +use crate::scheduler::MaySchedule; /// Represents the in-memory description of a Node. 
/// diff --git a/storage_controller/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs index 645cbdfce1..e9c54414a3 100644 --- a/storage_controller/src/pageserver_client.rs +++ b/storage_controller/src/pageserver_client.rs @@ -1,17 +1,13 @@ -use pageserver_api::{ - models::{ - detach_ancestor::AncestorDetached, LocationConfig, LocationConfigListResponse, - PageserverUtilization, SecondaryProgress, TenantScanRemoteStorageResponse, - TenantShardSplitRequest, TenantShardSplitResponse, TenantWaitLsnRequest, - TimelineArchivalConfigRequest, TimelineCreateRequest, TimelineInfo, TopTenantShardsRequest, - TopTenantShardsResponse, - }, - shard::TenantShardId, -}; -use pageserver_client::{ - mgmt_api::{Client, Result}, - BlockUnblock, +use pageserver_api::models::detach_ancestor::AncestorDetached; +use pageserver_api::models::{ + LocationConfig, LocationConfigListResponse, PageserverUtilization, SecondaryProgress, + TenantScanRemoteStorageResponse, TenantShardSplitRequest, TenantShardSplitResponse, + TenantWaitLsnRequest, TimelineArchivalConfigRequest, TimelineCreateRequest, TimelineInfo, + TopTenantShardsRequest, TopTenantShardsResponse, }; +use pageserver_api::shard::TenantShardId; +use pageserver_client::BlockUnblock; +use pageserver_client::mgmt_api::{Client, Result}; use reqwest::StatusCode; use utils::id::{NodeId, TenantId, TimelineId}; diff --git a/storage_controller/src/peer_client.rs b/storage_controller/src/peer_client.rs index 1a15bae365..f3f275dee0 100644 --- a/storage_controller/src/peer_client.rs +++ b/storage_controller/src/peer_client.rs @@ -1,16 +1,17 @@ -use crate::tenant_shard::ObservedState; -use pageserver_api::shard::TenantShardId; -use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::error::Error as _; use std::time::Duration; -use tokio_util::sync::CancellationToken; use http_utils::error::HttpErrorBody; use hyper::Uri; +use pageserver_api::shard::TenantShardId; use reqwest::{StatusCode, Url}; +use serde::{Deserialize, Serialize}; +use tokio_util::sync::CancellationToken; use utils::backoff; +use crate::tenant_shard::ObservedState; + #[derive(Debug, Clone)] pub(crate) struct PeerClient { uri: Uri, diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 459c11add9..d34da0fef0 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -2,45 +2,38 @@ pub(crate) mod split_state; use std::collections::HashMap; use std::str::FromStr; use std::sync::Arc; -use std::time::Duration; -use std::time::Instant; +use std::time::{Duration, Instant}; -use self::split_state::SplitState; use diesel::prelude::*; use diesel_async::async_connection_wrapper::AsyncConnectionWrapper; use diesel_async::pooled_connection::bb8::Pool; -use diesel_async::pooled_connection::AsyncDieselConnectionManager; -use diesel_async::pooled_connection::ManagerConfig; -use diesel_async::AsyncPgConnection; -use diesel_async::RunQueryDsl; -use futures::future::BoxFuture; +use diesel_async::pooled_connection::{AsyncDieselConnectionManager, ManagerConfig}; +use diesel_async::{AsyncPgConnection, RunQueryDsl}; +use diesel_migrations::{EmbeddedMigrations, embed_migrations}; use futures::FutureExt; +use futures::future::BoxFuture; use itertools::Itertools; -use pageserver_api::controller_api::AvailabilityZone; -use pageserver_api::controller_api::MetadataHealthRecord; -use pageserver_api::controller_api::SafekeeperDescribeResponse; -use pageserver_api::controller_api::ShardSchedulingPolicy; -use 
pageserver_api::controller_api::SkSchedulingPolicy; -use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy}; +use pageserver_api::controller_api::{ + AvailabilityZone, MetadataHealthRecord, NodeSchedulingPolicy, PlacementPolicy, + SafekeeperDescribeResponse, ShardSchedulingPolicy, SkSchedulingPolicy, +}; use pageserver_api::models::TenantConfig; -use pageserver_api::shard::ShardConfigError; -use pageserver_api::shard::ShardIdentity; -use pageserver_api::shard::ShardStripeSize; -use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId}; -use rustls::client::danger::{ServerCertVerified, ServerCertVerifier}; +use pageserver_api::shard::{ + ShardConfigError, ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId, +}; use rustls::client::WebPkiServerVerifier; +use rustls::client::danger::{ServerCertVerified, ServerCertVerifier}; use rustls::crypto::ring; use scoped_futures::ScopedBoxFuture; use serde::{Deserialize, Serialize}; use utils::generation::Generation; use utils::id::{NodeId, TenantId}; +use self::split_state::SplitState; use crate::metrics::{ DatabaseQueryErrorLabelGroup, DatabaseQueryLatencyLabelGroup, METRICS_REGISTRY, }; use crate::node::Node; - -use diesel_migrations::{embed_migrations, EmbeddedMigrations}; const MIGRATIONS: EmbeddedMigrations = embed_migrations!("./migrations"); /// ## What do we store? @@ -479,8 +472,7 @@ impl Persistence { &self, shards: Vec, ) -> DatabaseResult<()> { - use crate::schema::metadata_health; - use crate::schema::tenant_shards; + use crate::schema::{metadata_health, tenant_shards}; let now = chrono::Utc::now(); @@ -554,8 +546,7 @@ impl Persistence { &self, input_node_id: NodeId, ) -> DatabaseResult> { - use crate::schema::nodes::dsl::scheduling_policy; - use crate::schema::nodes::dsl::*; + use crate::schema::nodes::dsl::{scheduling_policy, *}; use crate::schema::tenant_shards::dsl::*; let updated = self .with_measured_conn(DatabaseOperation::ReAttach, move |conn| { diff --git a/storage_controller/src/persistence/split_state.rs b/storage_controller/src/persistence/split_state.rs index bce1a75843..f83191038a 100644 --- a/storage_controller/src/persistence/split_state.rs +++ b/storage_controller/src/persistence/split_state.rs @@ -1,8 +1,8 @@ +use diesel::deserialize::{FromSql, FromSqlRow}; +use diesel::expression::AsExpression; use diesel::pg::{Pg, PgValue}; -use diesel::{ - deserialize::FromSql, deserialize::FromSqlRow, expression::AsExpression, serialize::ToSql, - sql_types::Int2, -}; +use diesel::serialize::ToSql; +use diesel::sql_types::Int2; use serde::{Deserialize, Serialize}; #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, FromSqlRow, AsExpression)] diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 4f0f170284..a327f6f50f 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -1,6 +1,8 @@ -use crate::pageserver_client::PageserverClient; -use crate::persistence::Persistence; -use crate::{compute_hook, service}; +use std::borrow::Cow; +use std::collections::HashMap; +use std::sync::Arc; +use std::time::{Duration, Instant}; + use json_structural_diff::JsonDiff; use pageserver_api::controller_api::{AvailabilityZone, MigrationConfig, PlacementPolicy}; use pageserver_api::models::{ @@ -9,10 +11,6 @@ use pageserver_api::models::{ use pageserver_api::shard::{ShardIdentity, TenantShardId}; use pageserver_client::mgmt_api; use reqwest::StatusCode; -use std::borrow::Cow; -use std::collections::HashMap; -use 
std::sync::Arc; -use std::time::{Duration, Instant}; use tokio_util::sync::CancellationToken; use utils::backoff::exponential_backoff; use utils::generation::Generation; @@ -23,7 +21,10 @@ use utils::sync::gate::GateGuard; use crate::compute_hook::{ComputeHook, NotifyError}; use crate::node::Node; +use crate::pageserver_client::PageserverClient; +use crate::persistence::Persistence; use crate::tenant_shard::{IntentState, ObservedState, ObservedStateDelta, ObservedStateLocation}; +use crate::{compute_hook, service}; const DEFAULT_HEATMAP_PERIOD: Duration = Duration::from_secs(60); @@ -511,7 +512,8 @@ impl Reconciler { } else if status == StatusCode::ACCEPTED { let total_runtime = started_at.elapsed(); if total_runtime > total_download_timeout { - tracing::warn!("Timed out after {}ms downloading layers to {node}. Progress so far: {}/{} layers, {}/{} bytes", + tracing::warn!( + "Timed out after {}ms downloading layers to {node}. Progress so far: {}/{} layers, {}/{} bytes", total_runtime.as_millis(), progress.layers_downloaded, progress.layers_total, diff --git a/storage_controller/src/safekeeper.rs b/storage_controller/src/safekeeper.rs index 53cd8a908b..546fbf0726 100644 --- a/storage_controller/src/safekeeper.rs +++ b/storage_controller/src/safekeeper.rs @@ -1,16 +1,17 @@ -use std::{str::FromStr, time::Duration}; +use std::str::FromStr; +use std::time::Duration; use pageserver_api::controller_api::{SafekeeperDescribeResponse, SkSchedulingPolicy}; use reqwest::StatusCode; use safekeeper_client::mgmt_api; use tokio_util::sync::CancellationToken; -use utils::{backoff, id::NodeId, logging::SecretString}; +use utils::backoff; +use utils::id::NodeId; +use utils::logging::SecretString; -use crate::{ - heartbeater::SafekeeperState, - persistence::{DatabaseError, SafekeeperPersistence}, - safekeeper_client::SafekeeperClient, -}; +use crate::heartbeater::SafekeeperState; +use crate::persistence::{DatabaseError, SafekeeperPersistence}; +use crate::safekeeper_client::SafekeeperClient; #[derive(Clone)] pub struct Safekeeper { diff --git a/storage_controller/src/safekeeper_client.rs b/storage_controller/src/safekeeper_client.rs index f234ab3429..fb5be092a0 100644 --- a/storage_controller/src/safekeeper_client.rs +++ b/storage_controller/src/safekeeper_client.rs @@ -1,13 +1,12 @@ -use crate::metrics::PageserverRequestLabelGroup; use safekeeper_api::models::{ PullTimelineRequest, PullTimelineResponse, SafekeeperUtilization, TimelineCreateRequest, TimelineStatus, }; use safekeeper_client::mgmt_api::{Client, Result}; -use utils::{ - id::{NodeId, TenantId, TimelineId}, - logging::SecretString, -}; +use utils::id::{NodeId, TenantId, TimelineId}; +use utils::logging::SecretString; + +use crate::metrics::PageserverRequestLabelGroup; /// Thin wrapper around [`safekeeper_client::mgmt_api::Client`]. It allows the storage /// controller to collect metrics in a non-intrusive manner. 
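// ---------------------------------------------------------------------------
// Editor's illustration, not part of the patch: the hunks above are the
// mechanical half of a Rust 2024 edition bump. Each crate's Cargo.toml now
// pins `edition = "2024"`, nested `use a::{b, c::d};` trees are flattened to
// one `use` per module path and regrouped (std, external crates, then
// `crate::`) -- presumably via rustfmt's `imports_granularity = "Module"` and
// `group_imports = "StdExternalCrate"` options (an assumption; the repo's
// rustfmt.toml is not shown in this patch) -- and attributes that are unsafe
// to apply, such as `export_name`, must be wrapped in `unsafe(...)`. A minimal
// sketch of the attribute change, mirroring the `malloc_conf` static in
// storage_controller/src/main.rs:
//
// Edition 2021 spelling (rejected under edition 2024):
//     #[export_name = "malloc_conf"]
//     pub static malloc_conf: &[u8] = b"prof:true,prof_active:true\0";
//
// Edition 2024 spelling:
#[allow(non_upper_case_globals)]
#[unsafe(export_name = "malloc_conf")]
pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0";
// ---------------------------------------------------------------------------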
diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs index 44936d018a..817cf04fe1 100644 --- a/storage_controller/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -1,11 +1,17 @@ -use crate::{metrics::NodeLabelGroup, node::Node, tenant_shard::TenantShard}; +use std::collections::HashMap; +use std::fmt::Debug; + use http_utils::error::ApiError; use itertools::Itertools; -use pageserver_api::{controller_api::AvailabilityZone, models::PageserverUtilization}; +use pageserver_api::controller_api::AvailabilityZone; +use pageserver_api::models::PageserverUtilization; use serde::Serialize; -use std::{collections::HashMap, fmt::Debug}; use utils::id::NodeId; +use crate::metrics::NodeLabelGroup; +use crate::node::Node; +use crate::tenant_shard::TenantShard; + /// Scenarios in which we cannot find a suitable location for a tenant shard #[derive(thiserror::Error, Debug)] pub enum ScheduleError { @@ -775,10 +781,10 @@ impl Scheduler { if !matches!(context.mode, ScheduleMode::Speculative) { tracing::info!( - "scheduler selected node {node_id} (elegible nodes {:?}, hard exclude: {hard_exclude:?}, soft exclude: {context:?}, preferred_az: {:?})", - scores.iter().map(|i| i.node_id().0).collect::>(), - preferred_az, - ); + "scheduler selected node {node_id} (elegible nodes {:?}, hard exclude: {hard_exclude:?}, soft exclude: {context:?}, preferred_az: {:?})", + scores.iter().map(|i| i.node_id().0).collect::>(), + preferred_az, + ); } // Note that we do not update shard count here to reflect the scheduling: that @@ -906,14 +912,14 @@ impl Scheduler { #[cfg(test)] pub(crate) mod test_utils { - use crate::node::Node; - use pageserver_api::{ - controller_api::{AvailabilityZone, NodeAvailability}, - models::utilization::test_utilization, - }; use std::collections::HashMap; + + use pageserver_api::controller_api::{AvailabilityZone, NodeAvailability}; + use pageserver_api::models::utilization::test_utilization; use utils::id::NodeId; + use crate::node::Node; + /// Test helper: synthesize the requested number of nodes, all in active state. /// /// Node IDs start at one. 
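// ---------------------------------------------------------------------------
// Editor's illustration, not part of the patch: much of the remaining churn in
// scheduler.rs and service.rs is pure re-wrapping of long macro calls,
// presumably by the same rustfmt pass: a `tracing::info!(...)` whose format
// string used to ride on the macro's opening line now gets the string and each
// argument on their own indented lines. A hypothetical call with the same
// shape as the hunk above (function and argument names invented for the
// sketch):
fn log_selected_node(node_id: u64, eligible: &[u64]) {
    tracing::info!(
        "scheduler selected node {node_id} (eligible nodes {:?})",
        eligible,
    );
}
// ---------------------------------------------------------------------------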
@@ -951,17 +957,13 @@ pub(crate) mod test_utils { #[cfg(test)] mod tests { - use pageserver_api::{ - controller_api::NodeAvailability, models::utilization::test_utilization, - shard::ShardIdentity, - }; - use utils::{ - id::TenantId, - shard::{ShardCount, ShardNumber, TenantShardId}, - }; + use pageserver_api::controller_api::NodeAvailability; + use pageserver_api::models::utilization::test_utilization; + use pageserver_api::shard::ShardIdentity; + use utils::id::TenantId; + use utils::shard::{ShardCount, ShardNumber, TenantShardId}; use super::*; - use crate::tenant_shard::IntentState; #[test] fn scheduler_basic() -> anyhow::Result<()> { diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index b9c2711192..8671e340bd 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -1,112 +1,95 @@ pub mod chaos_injector; mod context_iterator; -use hyper::Uri; -use safekeeper_api::models::SafekeeperUtilization; -use std::{ - borrow::Cow, - cmp::Ordering, - collections::{BTreeMap, HashMap, HashSet}, - error::Error, - ops::Deref, - path::PathBuf, - str::FromStr, - sync::Arc, - time::{Duration, Instant}, -}; +use std::borrow::Cow; +use std::cmp::Ordering; +use std::collections::{BTreeMap, HashMap, HashSet}; +use std::error::Error; +use std::ops::Deref; +use std::path::PathBuf; +use std::str::FromStr; +use std::sync::Arc; +use std::time::{Duration, Instant}; -use crate::{ - background_node_operations::{ - Drain, Fill, Operation, OperationError, OperationHandler, MAX_RECONCILES_PER_OPERATION, - }, - compute_hook::{self, NotifyError}, - drain_utils::{self, TenantShardDrain, TenantShardIterator}, - heartbeater::SafekeeperState, - id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, TracingExclusiveGuard}, - leadership::Leadership, - metrics, - peer_client::GlobalObservedState, - persistence::{ - AbortShardSplitStatus, ControllerPersistence, DatabaseResult, MetadataHealthPersistence, - ShardGenerationState, TenantFilter, - }, - reconciler::{ - ReconcileError, ReconcileUnits, ReconcilerConfig, ReconcilerConfigBuilder, - ReconcilerPriority, - }, - safekeeper::Safekeeper, - scheduler::{MaySchedule, ScheduleContext, ScheduleError, ScheduleMode}, - tenant_shard::{ - MigrateAttachment, ObservedStateDelta, ReconcileNeeded, ReconcilerStatus, - ScheduleOptimization, ScheduleOptimizationAction, - }, -}; use anyhow::Context; +use context_iterator::TenantShardContextIterator; use control_plane::storage_controller::{ AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse, }; use diesel::result::DatabaseErrorKind; -use futures::{stream::FuturesUnordered, StreamExt}; -use itertools::Itertools; -use pageserver_api::{ - controller_api::{ - AvailabilityZone, MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability, - NodeRegisterRequest, NodeSchedulingPolicy, NodeShard, NodeShardResponse, PlacementPolicy, - SafekeeperDescribeResponse, ShardSchedulingPolicy, ShardsPreferredAzsRequest, - ShardsPreferredAzsResponse, SkSchedulingPolicy, TenantCreateRequest, TenantCreateResponse, - TenantCreateResponseShard, TenantDescribeResponse, TenantDescribeResponseShard, - TenantLocateResponse, TenantPolicyRequest, TenantShardMigrateRequest, - TenantShardMigrateResponse, - }, - models::{ - SecondaryProgress, TenantConfigPatchRequest, TenantConfigRequest, - TimelineArchivalConfigRequest, TopTenantShardsRequest, - }, -}; -use reqwest::StatusCode; -use tracing::{instrument, Instrument}; - -use crate::pageserver_client::PageserverClient; 
+use futures::StreamExt; +use futures::stream::FuturesUnordered; use http_utils::error::ApiError; -use pageserver_api::{ - models::{ - self, LocationConfig, LocationConfigListResponse, LocationConfigMode, - PageserverUtilization, ShardParameters, TenantConfig, TenantLocationConfigRequest, - TenantLocationConfigResponse, TenantShardLocation, TenantShardSplitRequest, - TenantShardSplitResponse, TenantTimeTravelRequest, TimelineCreateRequest, TimelineInfo, - }, - shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId}, - upcall_api::{ - ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest, - ValidateResponse, ValidateResponseTenant, - }, +use hyper::Uri; +use itertools::Itertools; +use pageserver_api::controller_api::{ + AvailabilityZone, MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability, + NodeRegisterRequest, NodeSchedulingPolicy, NodeShard, NodeShardResponse, PlacementPolicy, + SafekeeperDescribeResponse, ShardSchedulingPolicy, ShardsPreferredAzsRequest, + ShardsPreferredAzsResponse, SkSchedulingPolicy, TenantCreateRequest, TenantCreateResponse, + TenantCreateResponseShard, TenantDescribeResponse, TenantDescribeResponseShard, + TenantLocateResponse, TenantPolicyRequest, TenantShardMigrateRequest, + TenantShardMigrateResponse, }; -use pageserver_client::{mgmt_api, BlockUnblock}; -use tokio::sync::{mpsc::error::TrySendError, TryAcquireError}; +use pageserver_api::models::{ + self, LocationConfig, LocationConfigListResponse, LocationConfigMode, PageserverUtilization, + SecondaryProgress, ShardParameters, TenantConfig, TenantConfigPatchRequest, + TenantConfigRequest, TenantLocationConfigRequest, TenantLocationConfigResponse, + TenantShardLocation, TenantShardSplitRequest, TenantShardSplitResponse, + TenantTimeTravelRequest, TimelineArchivalConfigRequest, TimelineCreateRequest, TimelineInfo, + TopTenantShardsRequest, +}; +use pageserver_api::shard::{ + ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId, +}; +use pageserver_api::upcall_api::{ + ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest, ValidateResponse, + ValidateResponseTenant, +}; +use pageserver_client::{BlockUnblock, mgmt_api}; +use reqwest::StatusCode; +use safekeeper_api::models::SafekeeperUtilization; +use tokio::sync::TryAcquireError; +use tokio::sync::mpsc::error::TrySendError; use tokio_util::sync::CancellationToken; -use utils::{ - completion::Barrier, - failpoint_support, - generation::Generation, - id::{NodeId, TenantId, TimelineId}, - pausable_failpoint, - sync::gate::Gate, -}; +use tracing::{Instrument, instrument}; +use utils::completion::Barrier; +use utils::generation::Generation; +use utils::id::{NodeId, TenantId, TimelineId}; +use utils::sync::gate::Gate; +use utils::{failpoint_support, pausable_failpoint}; -use crate::{ - compute_hook::ComputeHook, - heartbeater::{Heartbeater, PageserverState}, - node::{AvailabilityTransition, Node}, - persistence::{split_state::SplitState, DatabaseError, Persistence, TenantShardPersistence}, - reconciler::attached_location_conf, - scheduler::Scheduler, - tenant_shard::{ - IntentState, ObservedState, ObservedStateLocation, ReconcileResult, ReconcileWaitError, - ReconcilerWaiter, TenantShard, - }, +use crate::background_node_operations::{ + Drain, Fill, MAX_RECONCILES_PER_OPERATION, Operation, OperationError, OperationHandler, +}; +use crate::compute_hook::{self, ComputeHook, NotifyError}; +use crate::drain_utils::{self, TenantShardDrain, TenantShardIterator}; +use 
crate::heartbeater::{Heartbeater, PageserverState, SafekeeperState}; +use crate::id_lock_map::{ + IdLockMap, TracingExclusiveGuard, trace_exclusive_lock, trace_shared_lock, +}; +use crate::leadership::Leadership; +use crate::metrics; +use crate::node::{AvailabilityTransition, Node}; +use crate::pageserver_client::PageserverClient; +use crate::peer_client::GlobalObservedState; +use crate::persistence::split_state::SplitState; +use crate::persistence::{ + AbortShardSplitStatus, ControllerPersistence, DatabaseError, DatabaseResult, + MetadataHealthPersistence, Persistence, ShardGenerationState, TenantFilter, + TenantShardPersistence, +}; +use crate::reconciler::{ + ReconcileError, ReconcileUnits, ReconcilerConfig, ReconcilerConfigBuilder, ReconcilerPriority, + attached_location_conf, +}; +use crate::safekeeper::Safekeeper; +use crate::scheduler::{MaySchedule, ScheduleContext, ScheduleError, ScheduleMode, Scheduler}; +use crate::tenant_shard::{ + IntentState, MigrateAttachment, ObservedState, ObservedStateDelta, ObservedStateLocation, + ReconcileNeeded, ReconcileResult, ReconcileWaitError, ReconcilerStatus, ReconcilerWaiter, + ScheduleOptimization, ScheduleOptimizationAction, TenantShard, }; - -use context_iterator::TenantShardContextIterator; const WAITER_FILL_DRAIN_POLL_TIMEOUT: Duration = Duration::from_millis(500); @@ -787,7 +770,9 @@ impl Service { }); } - tracing::info!("Startup complete, spawned {reconcile_tasks} reconciliation tasks ({shard_count} shards total)"); + tracing::info!( + "Startup complete, spawned {reconcile_tasks} reconciliation tasks ({shard_count} shards total)" + ); } async fn initial_heartbeat_round<'a>( @@ -1182,7 +1167,9 @@ impl Service { let mut safekeepers = (*locked.safekeepers).clone(); for (id, state) in deltas.0 { let Some(sk) = safekeepers.get_mut(&id) else { - tracing::info!("Couldn't update safekeeper safekeeper state for id {id} from heartbeat={state:?}"); + tracing::info!( + "Couldn't update safekeeper safekeeper state for id {id} from heartbeat={state:?}" + ); continue; }; sk.set_availability(state); @@ -1537,7 +1524,9 @@ impl Service { // If a node was removed before being completely drained, it is legal for it to leave behind a `generation_pageserver` referring // to a non-existent node, because node deletion doesn't block on completing the reconciliations that will issue new generations // on different pageservers. - tracing::warn!("Tenant shard {tenant_shard_id} references non-existent node {generation_pageserver} in database, will be rescheduled"); + tracing::warn!( + "Tenant shard {tenant_shard_id} references non-existent node {generation_pageserver} in database, will be rescheduled" + ); } } let new_tenant = TenantShard::from_persistent(tsp, intent)?; @@ -1867,7 +1856,7 @@ impl Service { } Ok(AttachHookResponse { - gen: attach_req + r#gen: attach_req .node_id .map(|_| tenant_shard.generation.expect("Test hook, not used on tenants that are mid-onboarding with a NULL generation").into().unwrap()), }) @@ -2039,7 +2028,7 @@ impl Service { let new_gen = *new_gen; response.tenants.push(ReAttachResponseTenant { id: *tenant_shard_id, - gen: Some(new_gen.into().unwrap()), + r#gen: Some(new_gen.into().unwrap()), // A tenant is only put into multi or stale modes in the middle of a [`Reconciler::live_migrate`] // execution. 
If a pageserver is restarted during that process, then the reconcile pass will // fail, and start from scratch, so it doesn't make sense for us to try and preserve @@ -2076,7 +2065,7 @@ impl Service { response.tenants.push(ReAttachResponseTenant { id: *tenant_shard_id, - gen: None, + r#gen: None, mode: LocationConfigMode::Secondary, }); @@ -2138,15 +2127,19 @@ impl Service { let locked = self.inner.read().unwrap(); for req_tenant in validate_req.tenants { if let Some(tenant_shard) = locked.tenants.get(&req_tenant.id) { - let valid = tenant_shard.generation == Some(Generation::new(req_tenant.gen)); + let valid = tenant_shard.generation == Some(Generation::new(req_tenant.r#gen)); tracing::info!( "handle_validate: {}(gen {}): valid={valid} (latest {:?})", req_tenant.id, - req_tenant.gen, + req_tenant.r#gen, tenant_shard.generation ); - in_memory_result.push((req_tenant.id, Generation::new(req_tenant.gen), valid)); + in_memory_result.push(( + req_tenant.id, + Generation::new(req_tenant.r#gen), + valid, + )); } else { // This is legal: for example during a shard split the pageserver may still // have deletions in its queue from the old pre-split shard, or after deletion @@ -2165,13 +2158,11 @@ impl Service { // in case of controller split-brain, where some other controller process might have incremented the generation. let db_generations = self .persistence - .shard_generations(in_memory_result.iter().filter_map(|i| { - if i.2 { - Some(&i.0) - } else { - None - } - })) + .shard_generations( + in_memory_result + .iter() + .filter_map(|i| if i.2 { Some(&i.0) } else { None }), + ) .await?; let db_generations = db_generations.into_iter().collect::>(); @@ -2323,7 +2314,9 @@ impl Service { // Unique key violation: this is probably a retry. Because the shard count is part of the unique key, // if we see a unique key violation it means that the creation request's shard count matches the previous // creation's shard count. - tracing::info!("Tenant shards already present in database, proceeding with idempotent creation..."); + tracing::info!( + "Tenant shards already present in database, proceeding with idempotent creation..." + ); } // Any other database error is unexpected and a bug. Err(e) => return Err(ApiError::InternalServerError(anyhow::anyhow!(e))), @@ -3004,7 +2997,7 @@ impl Service { None => { return Err(ApiError::NotFound( anyhow::anyhow!("Tenant not found").into(), - )) + )); } } }; @@ -3071,7 +3064,9 @@ impl Service { }) .find(|(_, _, mode)| *mode != LocationConfigMode::Detached); if let Some((node_id, _observed_location, mode)) = maybe_attached { - return Err(ApiError::InternalServerError(anyhow::anyhow!("We observed attached={mode:?} tenant in node_id={node_id} shard with tenant_shard_id={shard_id}"))); + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "We observed attached={mode:?} tenant in node_id={node_id} shard with tenant_shard_id={shard_id}" + ))); } } let scheduler = &mut locked.scheduler; @@ -3944,7 +3939,9 @@ impl Service { // This can only happen if there is a split brain controller modifying the database. This should // never happen when testing, and if it happens in production we can only log the issue. debug_assert!(false); - tracing::error!("Shard {shard_id} not found in generation state! Is another rogue controller running?"); + tracing::error!( + "Shard {shard_id} not found in generation state! Is another rogue controller running?" 
+ ); continue; }; let (generation, generation_pageserver) = generation; @@ -3953,13 +3950,17 @@ impl Service { // This is legitimate only in a very narrow window where the shard was only just configured into // Attached mode after being created in Secondary or Detached mode, and it has had its generation // set but not yet had a Reconciler run (reconciler is the only thing that sets generation_pageserver). - tracing::warn!("Shard {shard_id} generation is set ({generation:?}) but generation_pageserver is None, reconciler not run yet?"); + tracing::warn!( + "Shard {shard_id} generation is set ({generation:?}) but generation_pageserver is None, reconciler not run yet?" + ); } } else { // This should never happen: a shard with no generation is only permitted when it was created in some state // other than PlacementPolicy::Attached (and generation is always written to DB before setting Attached in memory) debug_assert!(false); - tracing::error!("Shard {shard_id} generation is None, but it is in PlacementPolicy::Attached mode!"); + tracing::error!( + "Shard {shard_id} generation is None, but it is in PlacementPolicy::Attached mode!" + ); continue; } } @@ -4492,13 +4493,17 @@ impl Service { // if the original attachment location is offline. if let Some(node_id) = shard.intent.get_attached() { if !nodes.get(node_id).unwrap().is_available() { - tracing::info!("Demoting attached intent for {tenant_shard_id} on unavailable node {node_id}"); + tracing::info!( + "Demoting attached intent for {tenant_shard_id} on unavailable node {node_id}" + ); shard.intent.demote_attached(scheduler, *node_id); } } for node_id in shard.intent.get_secondary().clone() { if !nodes.get(&node_id).unwrap().is_available() { - tracing::info!("Dropping secondary intent for {tenant_shard_id} on unavailable node {node_id}"); + tracing::info!( + "Dropping secondary intent for {tenant_shard_id} on unavailable node {node_id}" + ); shard.intent.remove_secondary(scheduler, node_id); } } @@ -4526,7 +4531,9 @@ impl Service { // rely on the reconciliation that happens when a node transitions to Active to clean up. Since we have // removed child shards from our in-memory state and database, the reconciliation will implicitly remove // them from the node. - tracing::warn!("Node {node} unavailable, can't clean up during split abort. It will be cleaned up when it is reactivated."); + tracing::warn!( + "Node {node} unavailable, can't clean up during split abort. It will be cleaned up when it is reactivated." + ); continue; } @@ -4971,7 +4978,10 @@ impl Service { // applies the new stripe size to the children. 
let mut shard_ident = shard_ident.unwrap(); if shard_ident.count.count() > 1 && shard_ident.stripe_size != new_stripe_size { - return Err(ApiError::BadRequest(anyhow::anyhow!("Attempted to change stripe size ({:?}->{new_stripe_size:?}) on a tenant with multiple shards", shard_ident.stripe_size))); + return Err(ApiError::BadRequest(anyhow::anyhow!( + "Attempted to change stripe size ({:?}->{new_stripe_size:?}) on a tenant with multiple shards", + shard_ident.stripe_size + ))); } shard_ident.stripe_size = new_stripe_size; @@ -5226,8 +5236,11 @@ impl Service { ) .await { - tracing::warn!("Failed to update compute of {}->{} during split, proceeding anyway to complete split ({e})", - child_id, child_ps); + tracing::warn!( + "Failed to update compute of {}->{} during split, proceeding anyway to complete split ({e})", + child_id, + child_ps + ); failed_notifications.push(child_id); } } @@ -5283,9 +5296,13 @@ impl Service { match shard.policy { PlacementPolicy::Attached(n) => { // If our new attached node was a secondary, it no longer should be. - shard.intent.remove_secondary(scheduler, migrate_req.node_id); + shard + .intent + .remove_secondary(scheduler, migrate_req.node_id); - shard.intent.set_attached(scheduler, Some(migrate_req.node_id)); + shard + .intent + .set_attached(scheduler, Some(migrate_req.node_id)); // If we were already attached to something, demote that to a secondary if let Some(old_attached) = old_attached { @@ -5306,7 +5323,7 @@ impl Service { PlacementPolicy::Detached => { return Err(ApiError::BadRequest(anyhow::anyhow!( "Cannot migrate a tenant that is PlacementPolicy::Detached: configure it to an attached policy first" - ))) + ))); } } @@ -5367,7 +5384,9 @@ impl Service { shard.intent ); } else if shard.intent.get_attached() == &Some(migrate_req.node_id) { - tracing::info!("Migrating secondary to {node}: already attached where we were asked to create a secondary"); + tracing::info!( + "Migrating secondary to {node}: already attached where we were asked to create a secondary" + ); } else { let old_secondaries = shard.intent.get_secondary().clone(); for secondary in old_secondaries { @@ -5880,7 +5899,7 @@ impl Service { return Err(ApiError::InternalServerError(anyhow::anyhow!( "{} attached as primary+secondary on the same node", tid - ))) + ))); } (true, false) => Some(false), (false, true) => Some(true), @@ -6923,12 +6942,16 @@ impl Service { // Check that maybe_optimizable doesn't disagree with the actual optimization functions. // Only do this in testing builds because it is not a correctness-critical check, so we shouldn't // panic in prod if we hit this, or spend cycles on it in prod. - assert!(shard - .optimize_attachment(scheduler, &schedule_context) - .is_none()); - assert!(shard - .optimize_secondary(scheduler, &schedule_context) - .is_none()); + assert!( + shard + .optimize_attachment(scheduler, &schedule_context) + .is_none() + ); + assert!( + shard + .optimize_secondary(scheduler, &schedule_context) + .is_none() + ); } continue; } @@ -6984,7 +7007,9 @@ impl Service { } Some(node) => { if !node.is_available() { - tracing::info!("Skipping optimization migration of {tenant_shard_id} to {new_attached_node_id} because node unavailable"); + tracing::info!( + "Skipping optimization migration of {tenant_shard_id} to {new_attached_node_id} because node unavailable" + ); } else { // Accumulate optimizations that require fetching secondary status, so that we can execute these // remote API requests concurrently. 
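// ---------------------------------------------------------------------------
// Editor's illustration, not part of the patch: `gen` is reserved as a keyword
// by the 2024 edition (for future `gen` blocks), which is why the attach and
// validate hunks above switch field syntax from `gen:` to the raw identifier
// `r#gen:`, and why later hunks in storage_scrubber rename local bindings from
// `gen` to `gen_`. A minimal sketch with a hypothetical struct (only the field
// name mirrors the patch; the real types live in the pageserver upcall API):
#[derive(Debug)]
struct ReAttachTenant {
    id: u64,
    r#gen: Option<u32>,
}

fn main() {
    let tenant = ReAttachTenant { id: 1, r#gen: Some(7) };
    // The raw identifier names the same field that edition-2021 code can
    // still spell as plain `gen`.
    println!("tenant {} has generation {:?}", tenant.id, tenant.r#gen);
}
// ---------------------------------------------------------------------------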
@@ -7030,7 +7055,9 @@ impl Service { { match secondary_status { Err(e) => { - tracing::info!("Skipping migration of {tenant_shard_id} to {node}, error querying secondary: {e}"); + tracing::info!( + "Skipping migration of {tenant_shard_id} to {node}, error querying secondary: {e}" + ); } Ok(progress) => { // We require secondary locations to have less than 10GiB of downloads pending before we will use @@ -7043,7 +7070,9 @@ impl Service { || progress.bytes_total - progress.bytes_downloaded > DOWNLOAD_FRESHNESS_THRESHOLD { - tracing::info!("Skipping migration of {tenant_shard_id} to {node} because secondary isn't ready: {progress:?}"); + tracing::info!( + "Skipping migration of {tenant_shard_id} to {node} because secondary isn't ready: {progress:?}" + ); #[cfg(feature = "testing")] if progress.heatmap_mtime.is_none() { @@ -7149,14 +7178,18 @@ impl Service { { Some(Err(e)) => { tracing::info!( - "Failed to download heatmap from {secondary_node} for {tenant_shard_id}: {e}" - ); + "Failed to download heatmap from {secondary_node} for {tenant_shard_id}: {e}" + ); } None => { - tracing::info!("Cancelled while downloading heatmap from {secondary_node} for {tenant_shard_id}"); + tracing::info!( + "Cancelled while downloading heatmap from {secondary_node} for {tenant_shard_id}" + ); } Some(Ok(progress)) => { - tracing::info!("Successfully downloaded heatmap from {secondary_node} for {tenant_shard_id}: {progress:?}"); + tracing::info!( + "Successfully downloaded heatmap from {secondary_node} for {tenant_shard_id}: {progress:?}" + ); } } } @@ -7241,7 +7274,9 @@ impl Service { // We spawn a task to run this, so it's exactly like some external API client requesting it. We don't // want to block the background reconcile loop on this. - tracing::info!("Auto-splitting tenant for size threshold {split_threshold}: current size {split_candidate:?}"); + tracing::info!( + "Auto-splitting tenant for size threshold {split_threshold}: current size {split_candidate:?}" + ); let this = self.clone(); tokio::spawn( diff --git a/storage_controller/src/service/chaos_injector.rs b/storage_controller/src/service/chaos_injector.rs index 25a0fab5ca..2ff68d7037 100644 --- a/storage_controller/src/service/chaos_injector.rs +++ b/storage_controller/src/service/chaos_injector.rs @@ -1,8 +1,6 @@ -use std::{ - collections::{BTreeMap, HashMap}, - sync::Arc, - time::Duration, -}; +use std::collections::{BTreeMap, HashMap}; +use std::sync::Arc; +use std::time::Duration; use pageserver_api::controller_api::ShardSchedulingPolicy; use rand::seq::SliceRandom; @@ -176,12 +174,19 @@ impl ChaosInjector { let mut victims = Vec::with_capacity(batch_size); if out_of_home_az.len() >= batch_size { - tracing::info!("Injecting chaos: found {batch_size} shards to migrate back to home AZ (total {} out of home AZ)", out_of_home_az.len()); + tracing::info!( + "Injecting chaos: found {batch_size} shards to migrate back to home AZ (total {} out of home AZ)", + out_of_home_az.len() + ); out_of_home_az.shuffle(&mut thread_rng()); victims.extend(out_of_home_az.into_iter().take(batch_size)); } else { - tracing::info!("Injecting chaos: found {} shards to migrate back to home AZ, picking {} random shards to migrate", out_of_home_az.len(), std::cmp::min(batch_size - out_of_home_az.len(), in_home_az.len())); + tracing::info!( + "Injecting chaos: found {} shards to migrate back to home AZ, picking {} random shards to migrate", + out_of_home_az.len(), + std::cmp::min(batch_size - out_of_home_az.len(), in_home_az.len()) + ); victims.extend(out_of_home_az); 
in_home_az.shuffle(&mut thread_rng()); diff --git a/storage_controller/src/service/context_iterator.rs b/storage_controller/src/service/context_iterator.rs index dd6913e988..c4784e5e36 100644 --- a/storage_controller/src/service/context_iterator.rs +++ b/storage_controller/src/service/context_iterator.rs @@ -54,17 +54,16 @@ impl<'a> Iterator for TenantShardContextIterator<'a> { #[cfg(test)] mod tests { - use std::{collections::BTreeMap, str::FromStr}; + use std::collections::BTreeMap; + use std::str::FromStr; use pageserver_api::controller_api::PlacementPolicy; use utils::shard::{ShardCount, ShardNumber}; - use crate::{ - scheduler::test_utils::make_test_nodes, service::Scheduler, - tenant_shard::tests::make_test_tenant_with_id, - }; - use super::*; + use crate::scheduler::test_utils::make_test_nodes; + use crate::service::Scheduler; + use crate::tenant_shard::tests::make_test_tenant_with_id; #[test] fn test_context_iterator() { diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index 56a36dc2df..34fd244023 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -1,50 +1,39 @@ -use std::{ - collections::{HashMap, HashSet}, - sync::Arc, - time::Duration, -}; +use std::collections::{HashMap, HashSet}; +use std::sync::Arc; +use std::time::Duration; -use crate::{ - metrics::{ - self, ReconcileCompleteLabelGroup, ReconcileLongRunningLabelGroup, ReconcileOutcome, - }, - persistence::TenantShardPersistence, - reconciler::{ReconcileUnits, ReconcilerConfig}, - scheduler::{ - AffinityScore, AttachedShardTag, NodeSchedulingScore, NodeSecondarySchedulingScore, - RefCountUpdate, ScheduleContext, SecondaryShardTag, ShardTag, - }, - service::ReconcileResultRequest, -}; use futures::future::{self, Either}; use itertools::Itertools; use pageserver_api::controller_api::{AvailabilityZone, PlacementPolicy, ShardSchedulingPolicy}; -use pageserver_api::{ - models::{LocationConfig, LocationConfigMode, TenantConfig}, - shard::{ShardIdentity, TenantShardId}, -}; +use pageserver_api::models::{LocationConfig, LocationConfigMode, TenantConfig}; +use pageserver_api::shard::{ShardIdentity, TenantShardId}; use serde::{Deserialize, Serialize}; use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; -use tracing::{instrument, Instrument}; -use utils::{ - generation::Generation, - id::NodeId, - seqwait::{SeqWait, SeqWaitError}, - shard::ShardCount, - sync::gate::GateGuard, -}; +use tracing::{Instrument, instrument}; +use utils::generation::Generation; +use utils::id::NodeId; +use utils::seqwait::{SeqWait, SeqWaitError}; +use utils::shard::ShardCount; +use utils::sync::gate::GateGuard; -use crate::{ - compute_hook::ComputeHook, - node::Node, - persistence::{split_state::SplitState, Persistence}, - reconciler::{ - attached_location_conf, secondary_location_conf, ReconcileError, Reconciler, TargetState, - }, - scheduler::{ScheduleError, Scheduler}, - service, Sequence, +use crate::compute_hook::ComputeHook; +use crate::metrics::{ + self, ReconcileCompleteLabelGroup, ReconcileLongRunningLabelGroup, ReconcileOutcome, }; +use crate::node::Node; +use crate::persistence::split_state::SplitState; +use crate::persistence::{Persistence, TenantShardPersistence}; +use crate::reconciler::{ + ReconcileError, ReconcileUnits, Reconciler, ReconcilerConfig, TargetState, + attached_location_conf, secondary_location_conf, +}; +use crate::scheduler::{ + AffinityScore, AttachedShardTag, NodeSchedulingScore, NodeSecondarySchedulingScore, + 
RefCountUpdate, ScheduleContext, ScheduleError, Scheduler, SecondaryShardTag, ShardTag, +}; +use crate::service::ReconcileResultRequest; +use crate::{Sequence, service}; /// Serialization helper fn read_last_error(v: &std::sync::Mutex>, serializer: S) -> Result @@ -835,7 +824,9 @@ impl TenantShard { let current_score = current_score.for_optimization(); if candidate_score < current_score { - tracing::info!("Found a lower scoring location! {candidate} is better than {current} ({candidate_score:?} is better than {current_score:?})"); + tracing::info!( + "Found a lower scoring location! {candidate} is better than {current} ({candidate_score:?} is better than {current_score:?})" + ); Some(true) } else { // The candidate node is no better than our current location, so don't migrate @@ -1005,7 +996,7 @@ impl TenantShard { // most cases, even if some nodes are offline or have scheduling=pause set. debug_assert!(self.intent.attached.is_some()); // We should not make it here unless attached -- this - // logic presumes we are in a mode where we want secondaries to be in non-home AZ + // logic presumes we are in a mode where we want secondaries to be in non-home AZ if let Some(retain_secondary) = self.intent.get_secondary().iter().find(|n| { let in_home_az = scheduler.get_node_az(n) == self.intent.preferred_az_id; let is_available = secondary_scores @@ -1029,7 +1020,8 @@ impl TenantShard { } // Fall through: we didn't identify one to remove. This ought to be rare. - tracing::warn!("Keeping extra secondaries: can't determine which of {:?} to remove (some nodes offline?)", + tracing::warn!( + "Keeping extra secondaries: can't determine which of {:?} to remove (some nodes offline?)", self.intent.get_secondary() ); } else { @@ -1798,8 +1790,8 @@ impl TenantShard { let conf = observed.conf.as_ref()?; match (conf.generation, conf.mode) { - (Some(gen), AttachedMulti | AttachedSingle | AttachedStale) => { - Some((*node_id, gen)) + (Some(gen_), AttachedMulti | AttachedSingle | AttachedStale) => { + Some((*node_id, gen_)) } _ => None, } @@ -1807,7 +1799,7 @@ impl TenantShard { .sorted_by(|(_lhs_node_id, lhs_gen), (_rhs_node_id, rhs_gen)| { lhs_gen.cmp(rhs_gen).reverse() }) - .map(|(node_id, gen)| (node_id, Generation::new(gen))) + .map(|(node_id, gen_)| (node_id, Generation::new(gen_))) .collect() } @@ -1839,7 +1831,10 @@ impl TenantShard { (Some(crnt), Some(new)) if crnt_gen > new_gen => { tracing::warn!( "Skipping observed state update {}: {:?} and using None due to stale generation ({} > {})", - node_id, loc, crnt, new + node_id, + loc, + crnt, + new ); self.observed @@ -1896,18 +1891,17 @@ impl Drop for TenantShard { #[cfg(test)] pub(crate) mod tests { - use std::{cell::RefCell, rc::Rc}; + use std::cell::RefCell; + use std::rc::Rc; - use pageserver_api::{ - controller_api::NodeAvailability, - shard::{ShardCount, ShardNumber}, - }; - use rand::{rngs::StdRng, SeedableRng}; + use pageserver_api::controller_api::NodeAvailability; + use pageserver_api::shard::{ShardCount, ShardNumber}; + use rand::SeedableRng; + use rand::rngs::StdRng; use utils::id::TenantId; - use crate::scheduler::test_utils::make_test_nodes; - use super::*; + use crate::scheduler::test_utils::make_test_nodes; fn make_test_tenant_shard(policy: PlacementPolicy) -> TenantShard { let tenant_id = TenantId::generate(); @@ -2085,16 +2079,20 @@ pub(crate) mod tests { // In pause mode, schedule() shouldn't do anything tenant_shard.scheduling_policy = ShardSchedulingPolicy::Pause; - assert!(tenant_shard - .schedule(&mut scheduler, &mut 
ScheduleContext::default()) - .is_ok()); + assert!( + tenant_shard + .schedule(&mut scheduler, &mut ScheduleContext::default()) + .is_ok() + ); assert!(tenant_shard.intent.all_pageservers().is_empty()); // In active mode, schedule() works tenant_shard.scheduling_policy = ShardSchedulingPolicy::Active; - assert!(tenant_shard - .schedule(&mut scheduler, &mut ScheduleContext::default()) - .is_ok()); + assert!( + tenant_shard + .schedule(&mut scheduler, &mut ScheduleContext::default()) + .is_ok() + ); assert!(!tenant_shard.intent.all_pageservers().is_empty()); tenant_shard.intent.clear(&mut scheduler); @@ -2621,9 +2619,11 @@ pub(crate) mod tests { ); let mut schedule_context = ScheduleContext::default(); for shard in &mut shards { - assert!(shard - .schedule(&mut scheduler, &mut schedule_context) - .is_ok()); + assert!( + shard + .schedule(&mut scheduler, &mut schedule_context) + .is_ok() + ); } // Initial: attached locations land in the tenant's home AZ. diff --git a/storage_scrubber/Cargo.toml b/storage_scrubber/Cargo.toml index 609f3bf009..7f6544b894 100644 --- a/storage_scrubber/Cargo.toml +++ b/storage_scrubber/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "storage_scrubber" version = "0.1.0" -edition.workspace = true +edition = "2024" license.workspace = true [dependencies] diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs index b42709868b..f0ba632fd4 100644 --- a/storage_scrubber/src/checks.rs +++ b/storage_scrubber/src/checks.rs @@ -1,12 +1,19 @@ use std::collections::{HashMap, HashSet}; use std::time::SystemTime; +use futures_util::StreamExt; use itertools::Itertools; +use pageserver::tenant::IndexPart; use pageserver::tenant::checks::check_valid_layermap; use pageserver::tenant::layer_map::LayerMap; use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; use pageserver::tenant::remote_timeline_client::manifest::TenantManifest; +use pageserver::tenant::remote_timeline_client::{ + parse_remote_index_path, parse_remote_tenant_manifest_path, remote_layer_path, +}; +use pageserver::tenant::storage_layer::LayerName; use pageserver_api::shard::ShardIndex; +use remote_storage::{GenericRemoteStorage, ListingObject, RemotePath}; use tokio_util::sync::CancellationToken; use tracing::{info, warn}; use utils::generation::Generation; @@ -15,14 +22,7 @@ use utils::shard::TenantShardId; use crate::cloud_admin_api::BranchData; use crate::metadata_stream::stream_listing; -use crate::{download_object_with_retries, RootTarget, TenantShardTimelineId}; -use futures_util::StreamExt; -use pageserver::tenant::remote_timeline_client::{ - parse_remote_index_path, parse_remote_tenant_manifest_path, remote_layer_path, -}; -use pageserver::tenant::storage_layer::LayerName; -use pageserver::tenant::IndexPart; -use remote_storage::{GenericRemoteStorage, ListingObject, RemotePath}; +use crate::{RootTarget, TenantShardTimelineId, download_object_with_retries}; pub(crate) struct TimelineAnalysis { /// Anomalies detected @@ -329,11 +329,11 @@ pub(crate) enum BlobDataParseResult { pub(crate) fn parse_layer_object_name(name: &str) -> Result<(LayerName, Generation), String> { match name.rsplit_once('-') { // FIXME: this is gross, just use a regex? 
- Some((layer_filename, gen)) if gen.len() == 8 => { + Some((layer_filename, gen_)) if gen_.len() == 8 => { let layer = layer_filename.parse::()?; - let gen = - Generation::parse_suffix(gen).ok_or("Malformed generation suffix".to_string())?; - Ok((layer, gen)) + let gen_ = + Generation::parse_suffix(gen_).ok_or("Malformed generation suffix".to_string())?; + Ok((layer, gen_)) } _ => Ok((name.parse::()?, Generation::none())), } @@ -423,9 +423,9 @@ async fn list_timeline_blobs_impl( tracing::info!("initdb archive preserved {key}"); } Some(maybe_layer_name) => match parse_layer_object_name(maybe_layer_name) { - Ok((new_layer, gen)) => { - tracing::debug!("Parsed layer key: {new_layer} {gen:?}"); - s3_layers.insert((new_layer, gen)); + Ok((new_layer, gen_)) => { + tracing::debug!("Parsed layer key: {new_layer} {gen_:?}"); + s3_layers.insert((new_layer, gen_)); } Err(e) => { tracing::info!("Error parsing {maybe_layer_name} as layer name: {e}"); @@ -465,7 +465,7 @@ async fn list_timeline_blobs_impl( .max_by_key(|i| i.1) .map(|(k, g)| (k.clone(), g)) { - Some((key, gen)) => (Some::(key.to_owned()), gen), + Some((key, gen_)) => (Some::(key.to_owned()), gen_), None => { // Legacy/missing case: one or zero index parts, which did not have a generation (index_part_keys.pop(), Generation::none()) @@ -521,7 +521,7 @@ async fn list_timeline_blobs_impl( }, unused_index_keys: index_part_keys, unknown_keys, - })) + })); } Err(index_parse_error) => errors.push(format!( "index_part.json body parsing error: {index_parse_error}" @@ -631,7 +631,7 @@ pub(crate) async fn list_tenant_manifests( .map(|(g, obj)| (*g, obj.clone())) .unwrap(); - manifests.retain(|(gen, _obj)| gen != &latest_generation); + manifests.retain(|(gen_, _obj)| gen_ != &latest_generation); let manifest_bytes = match download_object_with_retries(remote_client, &latest_listing_object.key).await { diff --git a/storage_scrubber/src/cloud_admin_api.rs b/storage_scrubber/src/cloud_admin_api.rs index b1dfe3a53f..5cf286c662 100644 --- a/storage_scrubber/src/cloud_admin_api.rs +++ b/storage_scrubber/src/cloud_admin_api.rs @@ -3,11 +3,9 @@ use std::error::Error as _; use chrono::{DateTime, Utc}; use futures::Future; use hex::FromHex; - -use reqwest::{header, Client, StatusCode, Url}; +use reqwest::{Client, StatusCode, Url, header}; use serde::Deserialize; use tokio::sync::Semaphore; - use tokio_util::sync::CancellationToken; use utils::backoff; use utils::id::{TenantId, TimelineId}; diff --git a/storage_scrubber/src/find_large_objects.rs b/storage_scrubber/src/find_large_objects.rs index 95d3af1453..efb05fb55e 100644 --- a/storage_scrubber/src/find_large_objects.rs +++ b/storage_scrubber/src/find_large_objects.rs @@ -5,10 +5,9 @@ use pageserver::tenant::storage_layer::LayerName; use remote_storage::ListingMode; use serde::{Deserialize, Serialize}; -use crate::{ - checks::parse_layer_object_name, init_remote, metadata_stream::stream_tenants, - stream_objects_with_retries, BucketConfig, NodeKind, -}; +use crate::checks::parse_layer_object_name; +use crate::metadata_stream::stream_tenants; +use crate::{BucketConfig, NodeKind, init_remote, stream_objects_with_retries}; #[derive(Serialize, Deserialize, Clone, Copy, PartialEq, Eq)] enum LargeObjectKind { diff --git a/storage_scrubber/src/garbage.rs b/storage_scrubber/src/garbage.rs index a4e5107e3d..e4f69a1669 100644 --- a/storage_scrubber/src/garbage.rs +++ b/storage_scrubber/src/garbage.rs @@ -3,11 +3,9 @@ //! Garbage means S3 objects which are either not referenced by any metadata, //! 
 
-use std::{
-    collections::{HashMap, HashSet},
-    sync::Arc,
-    time::Duration,
-};
+use std::collections::{HashMap, HashSet};
+use std::sync::Arc;
+use std::time::Duration;
 
 use anyhow::Context;
 use futures_util::TryStreamExt;
@@ -16,13 +14,14 @@ use remote_storage::{GenericRemoteStorage, ListingMode, ListingObject, RemotePath};
 use serde::{Deserialize, Serialize};
 use tokio_stream::StreamExt;
 use tokio_util::sync::CancellationToken;
-use utils::{backoff, id::TenantId};
+use utils::backoff;
+use utils::id::TenantId;
 
+use crate::cloud_admin_api::{CloudAdminApiClient, MaybeDeleted, ProjectData};
+use crate::metadata_stream::{stream_tenant_timelines, stream_tenants_maybe_prefix};
 use crate::{
-    cloud_admin_api::{CloudAdminApiClient, MaybeDeleted, ProjectData},
+    BucketConfig, ConsoleConfig, MAX_RETRIES, NodeKind, TenantShardTimelineId, TraversingDepth,
     init_remote, list_objects_with_retries,
-    metadata_stream::{stream_tenant_timelines, stream_tenants_maybe_prefix},
-    BucketConfig, ConsoleConfig, NodeKind, TenantShardTimelineId, TraversingDepth, MAX_RETRIES,
 };
 
 #[derive(Serialize, Deserialize, Debug)]
@@ -259,14 +258,21 @@ async fn find_garbage_inner(
             .await?;
         if let Some(object) = tenant_objects.keys.first() {
             if object.key.get_path().as_str().ends_with("heatmap-v1.json") {
-                tracing::info!("Tenant {tenant_shard_id}: is missing in console and is only a heatmap (known historic deletion bug)");
+                tracing::info!(
+                    "Tenant {tenant_shard_id}: is missing in console and is only a heatmap (known historic deletion bug)"
+                );
                 garbage.append_buggy(GarbageEntity::Tenant(tenant_shard_id));
                 continue;
             } else {
-                tracing::info!("Tenant {tenant_shard_id} is missing in console and contains one object: {}", object.key);
+                tracing::info!(
+                    "Tenant {tenant_shard_id} is missing in console and contains one object: {}",
+                    object.key
+                );
             }
         } else {
-            tracing::info!("Tenant {tenant_shard_id} is missing in console appears to have been deleted while we ran");
+            tracing::info!(
+                "Tenant {tenant_shard_id} is missing in console appears to have been deleted while we ran"
+            );
         }
     } else {
        // A console-unknown tenant with timelines: check if these timelines only contain initdb.tar.zst, from the initial
@@ -295,9 +301,13 @@ async fn find_garbage_inner(
             }
 
             if any_non_initdb {
-                tracing::info!("Tenant {tenant_shard_id}: is missing in console and contains timelines, one or more of which are more than just initdb");
+                tracing::info!(
+                    "Tenant {tenant_shard_id}: is missing in console and contains timelines, one or more of which are more than just initdb"
+                );
             } else {
-                tracing::info!("Tenant {tenant_shard_id}: is missing in console and contains only timelines that only contain initdb");
+                tracing::info!(
+                    "Tenant {tenant_shard_id}: is missing in console and contains only timelines that only contain initdb"
+                );
                 garbage.append_buggy(GarbageEntity::Tenant(tenant_shard_id));
                 continue;
             }
@@ -546,7 +556,9 @@ pub async fn purge_garbage(
         .any(|g| matches!(g.entity, GarbageEntity::Timeline(_)))
         && garbage_list.active_timeline_count == 0
     {
-        anyhow::bail!("Refusing to purge a garbage list containing garbage timelines that reports 0 active timelines");
+        anyhow::bail!(
+            "Refusing to purge a garbage list containing garbage timelines that reports 0 active timelines"
+        );
     }
 
     let filtered_items = garbage_list
diff --git a/storage_scrubber/src/lib.rs b/storage_scrubber/src/lib.rs
index 224235098c..34e43fcc0b 100644
--- a/storage_scrubber/src/lib.rs
+++ b/storage_scrubber/src/lib.rs
@@ -17,15 +17,14 @@ use std::time::{Duration, SystemTime};
 
 use anyhow::Context;
 use aws_config::retry::{RetryConfigBuilder, RetryMode};
+use aws_sdk_s3::Client;
 use aws_sdk_s3::config::Region;
 use aws_sdk_s3::error::DisplayErrorContext;
-use aws_sdk_s3::Client;
-
 use camino::{Utf8Path, Utf8PathBuf};
 use clap::ValueEnum;
 use futures::{Stream, StreamExt};
-use pageserver::tenant::remote_timeline_client::{remote_tenant_path, remote_timeline_path};
 use pageserver::tenant::TENANTS_SEGMENT_NAME;
+use pageserver::tenant::remote_timeline_client::{remote_tenant_path, remote_timeline_path};
 use pageserver_api::shard::TenantShardId;
 use remote_storage::{
     DownloadOpts, GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorageConfig,
@@ -38,7 +37,8 @@ use tokio::io::AsyncReadExt;
 use tokio_util::sync::CancellationToken;
 use tracing::{error, warn};
 use tracing_appender::non_blocking::WorkerGuard;
-use tracing_subscriber::{fmt, prelude::*, EnvFilter};
+use tracing_subscriber::prelude::*;
+use tracing_subscriber::{EnvFilter, fmt};
 use utils::fs_ext;
 use utils::id::{TenantId, TenantTimelineId, TimelineId};
 
@@ -411,10 +411,10 @@ async fn init_remote(
     let default_prefix = default_prefix_in_bucket(node_kind).to_string();
 
     match &mut storage_config.0.storage {
-        RemoteStorageKind::AwsS3(ref mut config) => {
+        RemoteStorageKind::AwsS3(config) => {
             config.prefix_in_bucket.get_or_insert(default_prefix);
         }
-        RemoteStorageKind::AzureContainer(ref mut config) => {
+        RemoteStorageKind::AzureContainer(config) => {
             config.prefix_in_container.get_or_insert(default_prefix);
         }
         RemoteStorageKind::LocalFs { .. } => (),
diff --git a/storage_scrubber/src/main.rs b/storage_scrubber/src/main.rs
index fa6ee90b66..fb2ab02565 100644
--- a/storage_scrubber/src/main.rs
+++ b/storage_scrubber/src/main.rs
@@ -1,24 +1,20 @@
-use anyhow::{anyhow, bail, Context};
+use anyhow::{Context, anyhow, bail};
 use camino::Utf8PathBuf;
+use clap::{Parser, Subcommand};
 use pageserver_api::controller_api::{MetadataHealthUpdateRequest, MetadataHealthUpdateResponse};
 use pageserver_api::shard::TenantShardId;
 use reqwest::{Method, Url};
 use storage_controller_client::control_api;
-use storage_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode};
-use storage_scrubber::pageserver_physical_gc::GcMode;
+use storage_scrubber::garbage::{PurgeMode, find_garbage, purge_garbage};
+use storage_scrubber::pageserver_physical_gc::{GcMode, pageserver_physical_gc};
 use storage_scrubber::scan_pageserver_metadata::scan_pageserver_metadata;
-use storage_scrubber::scan_safekeeper_metadata::DatabaseOrList;
+use storage_scrubber::scan_safekeeper_metadata::{DatabaseOrList, scan_safekeeper_metadata};
 use storage_scrubber::tenant_snapshot::SnapshotDownloader;
-use storage_scrubber::{find_large_objects, ControllerClientConfig};
 use storage_scrubber::{
-    init_logging, pageserver_physical_gc::pageserver_physical_gc,
-    scan_safekeeper_metadata::scan_safekeeper_metadata, BucketConfig, ConsoleConfig, NodeKind,
-    TraversingDepth,
+    BucketConfig, ConsoleConfig, ControllerClientConfig, NodeKind, TraversingDepth,
+    find_large_objects, init_logging,
 };
-
-use clap::{Parser, Subcommand};
 use utils::id::TenantId;
-
 use utils::{project_build_tag, project_git_version};
 
 project_git_version!(GIT_VERSION);
@@ -173,15 +169,23 @@ async fn main() -> anyhow::Result<()> {
     if let NodeKind::Safekeeper = node_kind {
         let db_or_list = match (timeline_lsns, dump_db_connstr) {
             (Some(timeline_lsns), _) => {
-                let timeline_lsns = serde_json::from_str(&timeline_lsns).context("parsing timeline_lsns")?;
+                let timeline_lsns = serde_json::from_str(&timeline_lsns)
+                    .context("parsing timeline_lsns")?;
                 DatabaseOrList::List(timeline_lsns)
             }
             (None, Some(dump_db_connstr)) => {
-                let dump_db_table = dump_db_table.ok_or_else(|| anyhow::anyhow!("dump_db_table not specified"))?;
+                let dump_db_table = dump_db_table
+                    .ok_or_else(|| anyhow::anyhow!("dump_db_table not specified"))?;
                 let tenant_ids = tenant_ids.iter().map(|tshid| tshid.tenant_id).collect();
-                DatabaseOrList::Database { tenant_ids, connstr: dump_db_connstr, table: dump_db_table }
+                DatabaseOrList::Database {
+                    tenant_ids,
+                    connstr: dump_db_connstr,
+                    table: dump_db_table,
+                }
             }
-            (None, None) => anyhow::bail!("neither `timeline_lsns` specified, nor `dump_db_connstr` and `dump_db_table`"),
+            (None, None) => anyhow::bail!(
+                "neither `timeline_lsns` specified, nor `dump_db_connstr` and `dump_db_table`"
+            ),
         };
         let summary = scan_safekeeper_metadata(bucket_config.clone(), db_or_list).await?;
         if json {
@@ -371,7 +375,9 @@ pub async fn scan_pageserver_metadata_cmd(
     exit_code: bool,
 ) -> anyhow::Result<()> {
     if controller_client.is_none() && post_to_storcon {
-        return Err(anyhow!("Posting pageserver scan health status to storage controller requires `--controller-api` and `--controller-jwt` to run"));
+        return Err(anyhow!(
+            "Posting pageserver scan health status to storage controller requires `--controller-api` and `--controller-jwt` to run"
+        ));
     }
     match scan_pageserver_metadata(bucket_config.clone(), tenant_shard_ids, verbose).await {
         Err(e) => {
diff --git a/storage_scrubber/src/metadata_stream.rs b/storage_scrubber/src/metadata_stream.rs
index 47447d681c..af2407856d 100644
--- a/storage_scrubber/src/metadata_stream.rs
+++ b/storage_scrubber/src/metadata_stream.rs
@@ -1,17 +1,17 @@
 use std::str::FromStr;
 
-use anyhow::{anyhow, Context};
+use anyhow::{Context, anyhow};
 use async_stream::{stream, try_stream};
 use futures::StreamExt;
+use pageserver_api::shard::TenantShardId;
 use remote_storage::{GenericRemoteStorage, ListingMode, ListingObject, RemotePath};
 use tokio_stream::Stream;
+use utils::id::{TenantId, TimelineId};
 
 use crate::{
-    list_objects_with_retries, stream_objects_with_retries, RootTarget, S3Target,
-    TenantShardTimelineId,
+    RootTarget, S3Target, TenantShardTimelineId, list_objects_with_retries,
+    stream_objects_with_retries,
 };
-use pageserver_api::shard::TenantShardId;
-use utils::id::{TenantId, TimelineId};
 
 /// Given a remote storage and a target, output a stream of TenantIds discovered via listing prefixes
 pub fn stream_tenants<'a>(
diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs
index 063c6bcfb9..c956b1abbc 100644
--- a/storage_scrubber/src/pageserver_physical_gc.rs
+++ b/storage_scrubber/src/pageserver_physical_gc.rs
@@ -2,22 +2,16 @@
 use std::collections::{BTreeMap, BTreeSet, HashMap};
 use std::sync::Arc;
 use std::time::Duration;
 
-use crate::checks::{
-    list_tenant_manifests, list_timeline_blobs, BlobDataParseResult, ListTenantManifestResult,
-    RemoteTenantManifestInfo,
-};
-use crate::metadata_stream::{stream_tenant_timelines, stream_tenants};
-use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId, MAX_RETRIES};
 use async_stream::try_stream;
 use futures::future::Either;
 use futures_util::{StreamExt, TryStreamExt};
+use pageserver::tenant::IndexPart;
 use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
 use pageserver::tenant::remote_timeline_client::manifest::OffloadedTimelineManifest;
 use pageserver::tenant::remote_timeline_client::{
     parse_remote_index_path, parse_remote_tenant_manifest_path, remote_layer_path,
 };
 use pageserver::tenant::storage_layer::LayerName;
-use pageserver::tenant::IndexPart;
 use pageserver_api::controller_api::TenantDescribeResponse;
 use pageserver_api::shard::{ShardIndex, TenantShardId};
 use remote_storage::{GenericRemoteStorage, ListingObject, RemotePath};
@@ -25,11 +19,18 @@ use reqwest::Method;
 use serde::Serialize;
 use storage_controller_client::control_api;
 use tokio_util::sync::CancellationToken;
-use tracing::{info_span, Instrument};
+use tracing::{Instrument, info_span};
 use utils::backoff;
 use utils::generation::Generation;
 use utils::id::{TenantId, TenantTimelineId};
 
+use crate::checks::{
+    BlobDataParseResult, ListTenantManifestResult, RemoteTenantManifestInfo, list_tenant_manifests,
+    list_timeline_blobs,
+};
+use crate::metadata_stream::{stream_tenant_timelines, stream_tenants};
+use crate::{BucketConfig, MAX_RETRIES, NodeKind, RootTarget, TenantShardTimelineId, init_remote};
+
 #[derive(Serialize, Default)]
 pub struct GcSummary {
     indices_deleted: usize,
diff --git a/storage_scrubber/src/scan_pageserver_metadata.rs b/storage_scrubber/src/scan_pageserver_metadata.rs
index a31fb5b242..ba75f25984 100644
--- a/storage_scrubber/src/scan_pageserver_metadata.rs
+++ b/storage_scrubber/src/scan_pageserver_metadata.rs
@@ -1,21 +1,22 @@
 use std::collections::{HashMap, HashSet};
 
-use crate::checks::{
-    branch_cleanup_and_check_errors, list_timeline_blobs, BlobDataParseResult,
-    RemoteTimelineBlobData, TenantObjectListing, TimelineAnalysis,
-};
-use crate::metadata_stream::{stream_tenant_timelines, stream_tenants};
-use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId};
 use futures_util::{StreamExt, TryStreamExt};
 use pageserver::tenant::remote_timeline_client::remote_layer_path;
 use pageserver_api::controller_api::MetadataHealthUpdateRequest;
 use pageserver_api::shard::TenantShardId;
 use remote_storage::GenericRemoteStorage;
 use serde::Serialize;
-use tracing::{info_span, Instrument};
+use tracing::{Instrument, info_span};
 use utils::id::TenantId;
 use utils::shard::ShardCount;
 
+use crate::checks::{
+    BlobDataParseResult, RemoteTimelineBlobData, TenantObjectListing, TimelineAnalysis,
+    branch_cleanup_and_check_errors, list_timeline_blobs,
+};
+use crate::metadata_stream::{stream_tenant_timelines, stream_tenants};
+use crate::{BucketConfig, NodeKind, RootTarget, TenantShardTimelineId, init_remote};
+
 #[derive(Serialize, Default)]
 pub struct MetadataSummary {
     tenant_count: usize,
diff --git a/storage_scrubber/src/scan_safekeeper_metadata.rs b/storage_scrubber/src/scan_safekeeper_metadata.rs
index 0a4d4266a0..f10d758097 100644
--- a/storage_scrubber/src/scan_safekeeper_metadata.rs
+++ b/storage_scrubber/src/scan_safekeeper_metadata.rs
@@ -1,23 +1,24 @@
-use std::{collections::HashSet, str::FromStr, sync::Arc};
+use std::collections::HashSet;
+use std::str::FromStr;
+use std::sync::Arc;
 
-use anyhow::{bail, Context};
+use anyhow::{Context, bail};
 use futures::stream::{StreamExt, TryStreamExt};
 use once_cell::sync::OnceCell;
 use pageserver_api::shard::TenantShardId;
-use postgres_ffi::{XLogFileName, PG_TLI};
+use postgres_ffi::{PG_TLI, XLogFileName};
 use remote_storage::GenericRemoteStorage;
 use rustls::crypto::ring;
 use serde::Serialize;
 use tokio_postgres::types::PgLsn;
 use tracing::{debug, error, info};
-use utils::{
-    id::{TenantId, TenantTimelineId, TimelineId},
-    lsn::Lsn,
-};
+use utils::id::{TenantId, TenantTimelineId, TimelineId};
+use utils::lsn::Lsn;
 
+use crate::cloud_admin_api::CloudAdminApiClient;
+use crate::metadata_stream::stream_listing;
 use crate::{
-    cloud_admin_api::CloudAdminApiClient, init_remote, metadata_stream::stream_listing,
-    BucketConfig, ConsoleConfig, NodeKind, RootTarget, TenantShardTimelineId,
+    BucketConfig, ConsoleConfig, NodeKind, RootTarget, TenantShardTimelineId, init_remote,
 };
 
 /// Generally we should ask safekeepers, but so far we use everywhere default 16MB.
diff --git a/storage_scrubber/src/tenant_snapshot.rs b/storage_scrubber/src/tenant_snapshot.rs
index 60e79fb859..e17409c20e 100644
--- a/storage_scrubber/src/tenant_snapshot.rs
+++ b/storage_scrubber/src/tenant_snapshot.rs
@@ -1,25 +1,26 @@
 use std::collections::HashMap;
 use std::sync::Arc;
 
-use crate::checks::{list_timeline_blobs, BlobDataParseResult, RemoteTimelineBlobData};
-use crate::metadata_stream::{stream_tenant_shards, stream_tenant_timelines};
-use crate::{
-    download_object_to_file_s3, init_remote, init_remote_s3, BucketConfig, NodeKind, RootTarget,
-    TenantShardTimelineId,
-};
 use anyhow::Context;
 use async_stream::stream;
 use aws_sdk_s3::Client;
 use camino::Utf8PathBuf;
 use futures::{StreamExt, TryStreamExt};
+use pageserver::tenant::IndexPart;
 use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
 use pageserver::tenant::storage_layer::LayerName;
-use pageserver::tenant::IndexPart;
 use pageserver_api::shard::TenantShardId;
 use remote_storage::{GenericRemoteStorage, S3Config};
 use utils::generation::Generation;
 use utils::id::TenantId;
 
+use crate::checks::{BlobDataParseResult, RemoteTimelineBlobData, list_timeline_blobs};
+use crate::metadata_stream::{stream_tenant_shards, stream_tenant_timelines};
+use crate::{
+    BucketConfig, NodeKind, RootTarget, TenantShardTimelineId, download_object_to_file_s3,
+    init_remote, init_remote_s3,
+};
+
 pub struct SnapshotDownloader {
     s3_client: Arc<Client>,
     s3_root: RootTarget,